首页 > 美文鉴赏

数据分析：线性回归分析之研究二手房价的影响因素，建立房价预测模型

更新时间:2023-07-10 01:10:23 阅读：评论：0

数据分析：线性回归分析之研究二手房价的影响因素，建立房价预测模型

# -*- coding: utf-8 -*-
'''
Study the factors that influence second-hand housing prices and build a
price prediction model. The data is stored in "sndHsPr.csv".

Approach:
    Before modelling the factors that influence price, first run a
    descriptive analysis of each variable to form an initial judgement of
    the influencing factors, then build the price prediction model.

Variables:
    dist    - district
    roomnum - number of bedrooms
    halls   - number of living rooms
    AREA    - floor area
    floor   - floor level
    subway  - whether the home is near a subway
    school  - whether the home is in a school district
    price   - price per square metre

Steps:
    (1) Dependent-variable analysis: price per unit area
    (2) Independent-variable analysis:
        2.1 distribution of each independent variable
        2.2 effect of each independent variable on the dependent variable
    (3) Build the price prediction model
        3.1 linear regression model
        3.2 linear model with a log-transformed dependent variable
        3.3 log-linear model with interaction terms
    (4) Prediction: suppose a family of three wants their child to attend
        school in Dongcheng district and wants to buy a two-bedroom home
        near the subway, 70 square metres, on a middle floor. Roughly what
        would the price be?
'''

import math
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from scipy import stats
from statsmodels.formula.api import ols

# Make matplotlib render CJK labels and the minus sign correctly.
# NOTE(review): the configuration statements that followed this heading were
# lost when the page was extracted. 'SimHei' is an assumed font name --
# confirm that it is installed on the target machine.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# Load the data and clean it.
os.chdir(r'F:\python_data_analysis\data_file')
data = pd.read_csv('sndHsPr.csv')
print('数据预览: \n{}'.format(data.head()))

# Replace the pinyin district codes with their Chinese display names.
dist_columns = {
    'chaoyang': '朝阳区',
    'haidian': '海淀区',
    'fengtai': '丰台区',
    'xicheng': '西城区',
    'dongcheng': '东城区',
    'shijingshan': '⽯景⼭区',
}
data['dist'] = data['dist'].map(dist_columns)

# Rescale price by 1/10000; the report strings later in the script label the
# rescaled value as 万元 per square metre.
data['price'] = data['price'] / 10000
print(data.head())

# Descriptive statistics.

# Frequency tables for the first seven columns, skipping position 3, which is
# summarised separately below (presumably AREA -- confirm against the CSV
# column order).
for i in range(7):
    if i != 3:
        column_name = data.columns.values[i]
        print(column_name, ':')
        print(data[column_name].agg(['value_counts']).T)
        print('===================================================')

# Summary statistics for the two continuous variables, AREA and price.
print('AREA :')
print(pd.DataFrame(data.AREA.agg(['mean', 'min', 'max', 'median', 'std'])).T)
print('===================================================')
print('连续型因变量price的描述性统计分析 :')
print(data[['price']].describe().T)

# NOTE(review): several plotting calls in this section were missing from the
# extracted source (only their headings and plt.show() survived). The calls
# marked "reconstructed" below are inferred from the headings and axis labels
# and should be confirmed against the original script.

# Pie chart: share of listings per district.
data['dist'].value_counts().plot(kind='pie', autopct='%6.3f%%')
plt.show()

# Histogram of price.
plt.hist(data['price'], bins=20)
plt.show()

# Horizontal bar chart: mean price per district (plot call reconstructed).
data.groupby('dist')['price'].mean().sort_values().plot(kind='barh')
plt.xlabel('平均房价')
plt.ylabel('地区', rotation=0)
plt.show()

# Box plot: price distribution per district, in a fixed district order.
# Work on a copy so the category conversion does not write into a slice of
# `data`; set_categories returns a new Series (its `inplace` option no longer
# exists in current pandas), so the result is assigned back.
district_order = ['⽯景⼭区', '丰台区', '朝阳区', '海淀区', '东城区', '西城区']
dist_price_df = data[['dist', 'price']].copy()
dist_price_df['dist'] = (
    dist_price_df['dist'].astype('category').cat.set_categories(district_order)
)
sns.boxplot(x='dist', y='price', data=dist_price_df)
plt.show()

# Bar chart: mean price with / without a nearby subway (reconstructed).
data.groupby('subway')['price'].mean().plot(kind='bar')
plt.show()

# Bar chart: mean price for school-district vs. other homes (reconstructed).
data.groupby('school')['price'].mean().plot(kind='bar')
plt.show()

# Cross-tabulation of subway proximity against school-district status.
sub_sch = pd.crosstab(data.subway, data.school)
print('有⽆地铁，是否学区房的交叉分析:')
print(sub_sch)
# Normalise each row to proportions before drawing the stacked bars.
sub_sch.div(sub_sch.sum(1), axis=0).plot(kind='bar', stacked=True)
plt.show()

# Effect of the number of bedrooms on price (bar chart reconstructed).
data.groupby('roomnum')['price'].mean().plot(kind='bar')
plt.show()
sns.boxplot(x='roomnum', y='price', data=data)
plt.show()

# Effect of the number of living rooms on price (bar chart reconstructed).
data.groupby('halls')['price'].mean().plot(kind='bar')
plt.show()
sns.boxplot(x='halls', y='price', data=data)
plt.show()

# Effect of floor level on price (bar chart reconstructed).
data.groupby('floor')['price'].mean().plot(kind='bar')
plt.show()
sns.boxplot(x='floor', y='price', data=data)
plt.show()

# Sampling.
# Three sampling methods are supported: simple random ('simple_random'),
# systematic ('systematic') and stratified ('stratified').
# The numpy import below had been fused into the comment line above it, so
# `np` was never actually imported.
import math
import random

import numpy as np

def get_sample(df, sampling, k, stratified_col=None):
    """Draw a sample of rows from ``df``.

    Args:
        df: pandas DataFrame to sample from.
        sampling: one of 'simple_random', 'systematic' or 'stratified'.
        k: sample size. When ``k >= 1`` it must be an ``int`` and is a row
            count (rows per layer for stratified sampling). When
            ``0 <= k < 1`` it is a fraction of the frame (of each layer for
            stratified sampling), rounded up with ``math.ceil``.
        stratified_col: list of column names whose value combinations define
            the layers. Required when ``sampling == 'stratified'``.

    Returns:
        A new DataFrame holding copies of the selected rows. The original
        index labels are kept.

    Raises:
        AssertionError: if ``k`` is negative, if ``k >= 1`` is not an int, if
            the stratified arguments are missing or invalid, if a by-count
            stratified request reaches the total row count, or if
            ``sampling`` is not a recognised method.
        ValueError: from ``random.sample`` when more rows are requested than
            the frame (or a layer) contains.
    """
    total_rows = len(df)

    # Explicit raises instead of `assert`, so validation survives `python -O`
    # while callers still see AssertionError.
    if k < 0:
        raise AssertionError('请确保输⼊的抽样数k⼤于零')
    sampling_by_n = k >= 1
    if sampling_by_n and not isinstance(k, int):
        raise AssertionError('请确保输⼊的抽样数k为整数')

    # Strings are compared with `==`; `is` tests identity and only appeared to
    # work because of interning.
    if sampling == 'simple_random':
        print('进⾏简单随机抽样')
        n_rows = k if sampling_by_n else math.ceil(k * total_rows)
        positions = random.sample(range(total_rows), n_rows)
        return df.iloc[positions, :].copy()

    if sampling == 'systematic':
        print('进⾏系统抽样')
        n_rows = k if sampling_by_n else math.ceil(k * total_rows)
        if n_rows == 0:
            # A zero-row request would otherwise divide by zero below.
            return df.iloc[0:0].copy()
        # The "+ 1" keeps the number of selected rows at or below n_rows; it
        # can return fewer rows than requested.
        step = total_rows // n_rows + 1
        return df.iloc[::step, :].copy()

    if sampling == 'stratified':
        # Validate the layer columns before they are indexed.
        if stratified_col is None:
            raise AssertionError('请确保输⼊的分层列名不为空')
        if not all(col in df.columns for col in stratified_col):
            raise AssertionError('请检查输⼊的包含分层列名的列表')

        layer_sizes = df.groupby(by=stratified_col)[stratified_col[0]].count()
        if sampling_by_n and k * len(layer_sizes) >= total_rows:
            raise AssertionError('请确保抽样数乘分层数不得超过总样本量')

        print('进⾏分层抽样')
        pieces = []
        for layer_key, layer_size in layer_sizes.items():
            n_take = k if sampling_by_n else math.ceil(layer_size * k)
            layer = df
            if len(stratified_col) == 1:
                layer = layer[layer[stratified_col[0]] == layer_key]
            else:
                # With several layer columns the group key is a tuple holding
                # one value per column.
                for col, value in zip(stratified_col, layer_key):
                    layer = layer[layer[col] == value]
            positions = random.sample(range(len(layer)), int(n_take))
            pieces.append(layer.iloc[positions, :].copy())

        if not pieces:
            return df.iloc[0:0].copy()
        # DataFrame.append no longer exists in current pandas; concatenate the
        # per-layer pieces once instead.
        return pd.concat(pieces)

    raise AssertionError('sampling is illegal')

# Stratified sample by district; k=400 is a row count drawn from each district.
data_sampled = get_sample(data, sampling='stratified', k=400, stratified_col=['dist'])

# Two-sample t-tests on the binary variables subway and school.
# This section was disabled in the original script (wrapped in a string
# literal); it is kept as comments, with the garbled call name repaired to
# stats.ttest_ind.
# sub_1 = data[data['subway'] == 1]['price']
# sub_0 = data[data['subway'] == 0]['price']
# sch_1 = data[data['school'] == 1]['price']
# sch_0 = data[data['school'] == 0]['price']
# # Test for equal variances.
# w_statistic, p_value = stats.levene(sub_1, sub_0, center='median')
# print('w_statistic: {}, p_value: {}'.format(w_statistic, p_value))
# t_statistic, p_value = stats.ttest_ind(sub_1, sub_0, equal_var=True)
# print('t_statistic: {}, p_value: {}'.format(t_statistic, p_value))
# w_statistic, p_value = stats.levene(sch_1, sch_0, center='median')
# print('w_statistic: {}, p_value: {}'.format(w_statistic, p_value))
# t_statistic, p_value = stats.ttest_ind(sch_1, sch_0, equal_var=True)
# print('t_statistic: {}, p_value: {}'.format(t_statistic, p_value))

# One-way ANOVA of price against each of the six categorical variables.
for factor in ('dist', 'roomnum', 'halls', 'floor', 'subway', 'school'):
    anova_table = sm.stats.anova_lm(
        ols('price ~ C({})'.format(factor), data=data_sampled).fit()
    )
    # Row 0, column 4 of the ANOVA table is what the message reports as the
    # p-value.
    print('price ~ {}⽅差分析的P值: {}'.format(factor, anova_table.values[0, 4]))

# The original author concluded from the ANOVA output that roomnum and halls
# have no significant effect on price, so halls is collapsed into a binary
# variable: no living room vs. at least one.
data_sampled['style_new'] = data_sampled['halls'].map(
    lambda x: '⽆厅' if x == 0 else '有厅'
)
print(data_sampled.head())

# Turn the multi-level variables dist and floor into dummy variables.
# dtype=int keeps the dummies numeric; current pandas defaults to bool, which
# the formula interface would treat as categorical.
data_dummy = pd.get_dummies(data_sampled[['dist', 'floor']], dtype=int)
print(data_dummy.head())

# Drop one dummy from each group to serve as the reference level.
data_dummy.drop(['dist_⽯景⼭区', 'floor_high'], axis=1, inplace=True)

# Combine the dummies with the remaining sampled columns.
data_concated = pd.concat(
    [
        data_dummy,
        data_sampled[['AREA', 'subway', 'roomnum', 'school', 'style_new', 'price']],
    ],
    axis=1,
)
print(data_concated.head())
print(data_concated.columns)

# Fit the linear regression models.

# Model 0: let the formula interface encode the categorical variables.
formula_0 = 'price ~ C(dist) + C(style_new) + C(floor) + subway + school + AREA'
lm_0 = ols(formula_0, data=data_sampled).fit()
print('不对分类型⾃变量进⾏哑变量处理:')
print(lm_0.summary())
print((formula_0 + ' 线性回归模型的R2值: {}').format(lm_0.rsquared))
print('================================================')

# Model 1: the same predictors, using the hand-built dummy columns.
formula_1 = ('price ~ dist_东城区 + dist_丰台区 + dist_朝阳区 + dist_海淀区 + dist_西城区 + '
             'floor_middle + floor_low + style_new + subway + school + AREA')
lm_1 = ols(formula_1, data=data_concated).fit()
print('对分类型⾃变量进⾏哑变量处理:')
print(lm_1.summary())
print((formula_1 + ' 线性回归模型的R2值: {}').format(lm_1.rsquared))
print('================================================')

# Diagnostic plot for model 1: residuals against fitted values.
data_concated['predict_1'] = lm_1.predict(data_concated)
data_concated['resid_1'] = lm_1.resid
data_concated.plot(x='predict_1', y='resid_1', kind='scatter')
plt.show()

# The original author read heteroscedasticity from the plot above, so both
# price and the continuous predictor AREA are log-transformed.
data_concated['price_ln'] = np.log(data_concated['price'])
data_concated['AREA_ln'] = np.log(data_concated['AREA'])

# Model 2: log-log version of model 1.
formula_2 = ('price_ln ~ dist_东城区 + dist_丰台区 + dist_朝阳区 + dist_海淀区 + dist_西城区 + '
             'floor_middle + floor_low + style_new + subway + school + AREA_ln')
lm_2 = ols(formula_2, data=data_concated).fit()
print('对price取对数对连续型⾃变量AREA取对数对分类型⾃变量进⾏哑变量处理:')
print(lm_2.summary())
print((formula_2 + ' 线性回归模型的R2值: {}').format(lm_2.rsquared))

# Diagnostic plot for model 2; the original author judged the
# heteroscedasticity to be removed.
data_concated['predict_2'] = lm_2.predict(data_concated)
data_concated['resid_2'] = lm_2.resid
data_concated.plot(x='predict_2', y='resid_2', kind='scatter')
plt.show()

# In Shijingshan the mean price of school-district homes is lower than that
# of other homes, so the interaction between district (dist) and
# school-district status (school) is examined.
import seaborn as sns

# pd.pivot_table(data, index='dist', columns='school', values='price', aggfunc='mean')
sns.barplot(x='dist', y='price', hue='school', data=data)
plt.show()

# Mean price of non-school-district vs. school-district homes in Shijingshan.
print('⽯景⼭区⾮学区房 : {:.2f}万元/每平⽅⽶, ⽯景⼭区学区房 : {:.2f}万元/每平⽅⽶'.format(
    data[(data['dist'] == '⽯景⼭区') & (data['school'] == 0)]['price'].mean(),
    data[(data['dist'] == '⽯景⼭区') & (data['school'] == 1)]['price'].mean()))

# The same comparison for the other five districts.
dists = ['丰台区', '海淀区', '西城区', '东城区', '朝阳区']
for i in dists:
    print('{}⾮学区房 : {:.2f}万元/每平⽅⽶, {}学区房 : {:.2f}万元/每平⽅⽶'.format(
        i, data[(data['dist'] == i) & (data['school'] == 0)]['price'].mean(),
        i, data[(data['dist'] == i) & (data['school'] == 1)]['price'].mean()))

# Compare how many homes of each kind Shijingshan has.
sch0_account = data[(data['dist'] == '⽯景⼭区') & (data['school'] == 0)].shape[0]
sch1_account = data[(data['dist'] == '⽯景⼭区') & (data['school'] == 1)].shape[0]
# sch_ratio is the share of school-district homes (school == 1). The original
# message labelled it as the non-school-district share, so the label is
# corrected to match the value.
sch_ratio = sch1_account / (sch0_account + sch1_account)
print('⽯景⼭区⾮学区房数量 : {}, ⽯景⼭区学区房数量 : {}, 学区房占⽐: {:.4f}%'.format(
    sch0_account, sch1_account, sch_ratio * 100))

# Build a per-district table of mean price for non-school-district and
# school-district homes, then start a grouped bar chart from it.
dists = ['⽯景⼭区', '丰台区', '朝阳区', '海淀区', '东城区', '西城区']
df = pd.DataFrame()
sch_0 = []
sch_1 = []
for i in dists:
    sch_0.append(data[(data['dist'] == i) & (data['school'] == 0)]['price'].mean())
    sch_1.append(data[(data['dist'] == i) & (data['school'] == 1)]['price'].mean())

df['dist'] = pd.Series(dists)
df['no_school'] = pd.Series(sch_0)
df['school'] = pd.Series(sch_1)
print(df)

df1 = df['no_school'].values
df2 = df['school'].values

plt.figure(figsize=(10, 6))
x1 = range(0, len(df))
# x2 shifts each position by one bar width so a second series can sit beside
# the first.
x2 = [i + 0.3 for i in x1]
plt.bar(x1, df1, color='r', width=0.3, alpha=0.6, label='⾮学区房')
# NOTE(review): the script is cut off here in this extract. x2 and df2 are
# prepared but not yet used, and the figure is never shown. Restore the rest
# of the plot from the original script.

本文发布于:2023-07-10 01:10:23，感谢您对本站的认可！

本文链接：https://www.wtabcd.cn/fanwen/fan/89/1075053.html

上一篇：湖南省邵阳市2021年中考英语真题试卷

下一篇：山东省“山东学情”2021-2022学年高二上学期12月联考试题英语(人教版) Word版含答案

标签：房价变量学区模型分析影响分层抽样

留言与评论（共有 0 条评论）