数据分析:线性回归分析之研究二手房价的影响因素,建立房价预测模型

更新时间:2023-07-10 01:10:23 阅读: 评论:0

数据分析:线性回归分析之研究二手房价的影响因素,建立房价预测模型
# -*- coding: utf-8 -*-
'''
Study the factors that influence second-hand housing prices and build a
price prediction model. The data is stored in "sndHsPr.csv".

Approach:
Before modelling the factors that influence the price, run a descriptive
analysis of every variable to get a first impression of which factors
matter, then build the price prediction model.

Variables:
dist    - district
roomnum - number of bedrooms
halls   - number of halls (living rooms)
AREA    - floor area
floor   - floor level
subway  - whether a subway station is nearby
school  - whether the home is in a school district
price   - price per square metre

Steps:
(1) Dependent variable analysis: price per unit area
(2) Independent variable analysis:
    2.1 distribution of each independent variable
    2.2 effect of each independent variable on the dependent variable
(3) Build the price prediction model
    3.1 linear regression model
    3.2 linear model with the log of the dependent variable
    3.3 log-linear model with interaction terms
(4) Prediction: a family of three wants their child to attend school in
    Dongcheng district and is looking for a two-bedroom home near the
    subway, 70 square metres, on a middle floor. Roughly what will it cost?
'''
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from scipy import stats
from statsmodels.formula.api import ols
# Make matplotlib render CJK labels and the minus sign correctly.
# NOTE(review): the original configuration statements are missing from the
# source (only their comment survived). The font list below is a reconstruction;
# confirm that at least one of these fonts is installed on the target machine.
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'Arial Unicode MS']
plt.rcParams['axes.unicode_minus'] = False

# Load the data set and clean it.
os.chdir(r'F:\python_data_analysis\data_file')
data = pd.read_csv('sndHsPr.csv')
print('数据预览: \n{}'.format(data.head()))

# Replace the district codes found in the CSV with their display names.
dist_columns = {
    'chaoyang': '朝阳区',
    'haidian': '海淀区',
    'fengtai': '丰台区',
    'xicheng': '西城区',
    'dongcheng': '东城区',
    'shijingshan': '⽯景⼭区',
}
data['dist'] = data['dist'].map(dist_columns)

# Rescale the unit price by 1/10000 (later output labels it as
# ten-thousand yuan per square metre).
data['price'] = data['price'] / 10000
print(data.head())
# Descriptive statistics
# Frequency tables: scan the first seven columns and skip position 3.
# NOTE(review): position 3 is presumably the continuous AREA column, which is
# summarised separately below -- confirm against the CSV column order.
for i in range(7):
    if i != 3:
        column = data.columns.values[i]
        print(column, ':')
        print(data[column].agg(['value_counts']).T)
        print('===================================================')

# Summary statistics of the two continuous variables, AREA and price
print('AREA :')
print(pd.DataFrame(data.AREA.agg(['mean', 'min', 'max', 'median', 'std'])).T)
print('===================================================')
print('连续型因变量price的描述性统计分析 :')
print(data[['price']].describe().T)
# Pie chart: share of listings per district
data['dist'].value_counts().plot(kind='pie', autopct='%6.3f%%')
plt.show()

# Histogram of the unit price
plt.hist(data['price'], bins=20)
plt.show()

# Bar chart of the mean price per district.
# NOTE(review): the plotting call itself is missing from the source (only the
# axis labels survived). The horizontal bar chart below is reconstructed from
# the section comment and the labels -- confirm it matches the intended figure.
data.groupby('dist')['price'].mean().sort_values().plot(kind='barh')
plt.xlabel('平均房价')
plt.ylabel('地区', rotation=0)
plt.show()

# Box plot of the price distribution per district, in a fixed display order.
# Work on a copy so the category conversion does not write into a slice of
# `data`; set_categories() returns a new Series (its `inplace` flag no longer
# exists in pandas 2.x).
dist_order = ['⽯景⼭区', '丰台区', '朝阳区', '海淀区', '东城区', '西城区']
dist_price_df = data[['dist', 'price']].copy()
dist_price_df['dist'] = (
    dist_price_df['dist'].astype('category').cat.set_categories(dist_order)
)
sns.boxplot(x='dist', y='price', data=dist_price_df)
plt.show()
# NOTE(review): in the source every "bar chart" section below consists of a
# bare plt.show() -- the plotting calls are missing. The mean-price bar charts
# are reconstructed from the section comments; confirm the intended figures.

# Bar chart: mean price with / without a nearby subway
data.groupby('subway')['price'].mean().plot(kind='bar')
plt.show()

# Bar chart: mean price for school-district vs. other homes
data.groupby('school')['price'].mean().plot(kind='bar')
plt.show()

# Cross-tabulation of subway against school, plotted as row proportions
sub_sch = pd.crosstab(data.subway, data.school)
print('有⽆地铁,是否学区房的交叉分析:')
print(sub_sch)
sub_sch.div(sub_sch.sum(axis=1), axis=0).plot(kind='bar', stacked=True)
plt.show()

# Effect of the number of bedrooms, the number of halls and the floor level
# on price: a mean-price bar chart followed by a box plot for each factor.
for factor in ('roomnum', 'halls', 'floor'):
    data.groupby(factor)['price'].mean().plot(kind='bar')
    plt.show()
    sns.boxplot(x=factor, y='price', data=data)
    plt.show()
# Stratified sampling
# Three sampling methods are available: simple random sampling
# ('simple_random'), systematic sampling ('systematic') and stratified
# sampling ('stratified').
import numpy as np
import math
import random
def get_sample(df, sampling, k, stratified_col=None):
    """Draw a sample of rows from ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from.
    sampling : str
        ``'simple_random'``, ``'systematic'`` or ``'stratified'``.
    k : int or float
        Sample size. ``k >= 1`` must be an ``int`` and is a row count (per
        stratum for stratified sampling). ``0 < k < 1`` is a fraction of the
        frame (of each stratum for stratified sampling), rounded up.
    stratified_col : list of str or str, optional
        Column name(s) defining the strata; required for stratified sampling.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the sampled rows with their original index labels.

    Raises
    ------
    AssertionError
        If ``k`` is not positive, ``k >= 1`` is not an ``int``, the stratum
        columns are missing or unknown, ``k`` times the number of strata is
        not below ``len(df)``, or ``sampling`` is not a supported method.
    ValueError
        Propagated from ``random.sample`` when more rows are requested than
        the frame (or a stratum) contains.

    Notes
    -----
    Systematic sampling takes every ``len(df) // k + 1``-th row starting at
    row 0, so it can return fewer than ``k`` rows.
    """
    len_df = len(df)

    # Explicit raises instead of `assert`: assertions vanish under `python -O`.
    if k <= 0:
        raise AssertionError('请确保输⼊的抽样数k⼤于零')
    sampling_by_n = k >= 1
    if sampling_by_n and not isinstance(k, int):
        raise AssertionError('请确保输⼊的抽样数k为整数')

    if sampling == 'simple_random':
        size = k if sampling_by_n else math.ceil(k * len_df)
        print('进⾏简单随机抽样')
        idx = random.sample(range(len_df), size)
        return df.iloc[idx, :].copy()

    if sampling == 'systematic':
        size = k if sampling_by_n else math.ceil(k * len_df)
        print('进⾏系统抽样')
        if size == 0:
            # Only reachable for an empty frame; avoids dividing by zero.
            return df.iloc[0:0, :].copy()
        step = len_df // size + 1
        return df.iloc[::step, :].copy()

    if sampling == 'stratified':
        # Validate the stratum columns before they are handed to groupby.
        if stratified_col is None or len(stratified_col) == 0:
            raise AssertionError('请确保输⼊的分层列名不为空')
        if isinstance(stratified_col, str):
            stratified_col = [stratified_col]
        stratified_col = list(stratified_col)
        if not set(stratified_col).issubset(df.columns):
            raise AssertionError('请检查输⼊的包含分层列名的列表')

        # One sub-frame per stratum, in sorted key order; rows keep their
        # original relative order inside each stratum.
        strata = [group for _, group in df.groupby(by=stratified_col)]
        if sampling_by_n and k * len(strata) >= len_df:
            raise AssertionError('请确保抽样数乘分层数不得超过总样本量')

        print('进⾏分层抽样')
        parts = []
        for group in strata:
            size = k if sampling_by_n else math.ceil(len(group) * k)
            idx = random.sample(range(len(group)), size)
            parts.append(group.iloc[idx, :])
        if not parts:
            return df.iloc[0:0, :].copy()
        # DataFrame.append was removed in pandas 2.0; concat also keeps dtypes.
        return pd.concat(parts)

    raise AssertionError('sampling is illegal')
# Draw a stratified sample with k=400, stratified by district.
# (The source issued this call twice; the second draw simply replaced the
# first, so a single call is kept.)
data_sampled = get_sample(data, sampling='stratified', k=400, stratified_col=['dist'])

# Two-sample t-tests for the two binary variables subway and school.
# This section is disabled in the source and is kept disabled here.
# sub_1 = data[data['subway'] == 1]['price']
# sub_0 = data[data['subway'] == 0]['price']
# sch_1 = data[data['school'] == 1]['price']
# sch_0 = data[data['school'] == 0]['price']
# # Test for equal variances first (Levene)
# w_statistic, p_value = stats.levene(sub_1, sub_0, center='median')
# print('w_statistic: {}, p_value: {}'.format(w_statistic, p_value))
# t_statistic, p_value = stats.ttest_ind(sub_1, sub_0, equal_var=True)
# print('t_statistic: {}, p_value: {}'.format(t_statistic, p_value))
# w_statistic, p_value = stats.levene(sch_1, sch_0, center='median')
# print('w_statistic: {}, p_value: {}'.format(w_statistic, p_value))
# t_statistic, p_value = stats.ttest_ind(sch_1, sch_0, equal_var=True)
# print('t_statistic: {}, p_value: {}'.format(t_statistic, p_value))
# Analysis of variance
# One-way ANOVA of price against each of the six categorical variables:
# dist, roomnum, halls, floor, subway, school.
for factor in ('dist', 'roomnum', 'halls', 'floor', 'subway', 'school'):
    anova_table = sm.stats.anova_lm(
        ols('price ~ C({})'.format(factor), data=data_sampled).fit())
    # Row 0 is the factor row; column 4 is the value reported as its p-value.
    print('price ~ {}⽅差分析的P值: {}'.format(factor, anova_table.values[0, 4]))
# Author's reading of the results: roomnum and halls have no significant
# effect on the dependent variable price.
# Collapse the number of halls into a two-level variable: no hall / has hall.
data_sampled['style_new'] = data_sampled['halls'].map(
    lambda halls: '⽆厅' if halls == 0 else '有厅')
print(data_sampled.head())

# Turn the multi-level variables dist and floor into dummy variables.
# dtype=int keeps the dummies numeric on pandas >= 2.0, where the default
# became bool (which the formula interface would treat as categorical).
data_dummy = pd.get_dummies(data_sampled[['dist', 'floor']], dtype=int)
print(data_dummy.head())

# Drop the two dummies that act as the reference groups.
data_dummy = data_dummy.drop(columns=['dist_⽯景⼭区', 'floor_high'])

# Combine the dummies with the remaining sampled columns into a new data set.
data_concated = pd.concat(
    [data_dummy,
     data_sampled[['AREA', 'subway', 'roomnum', 'school', 'style_new', 'price']]],
    axis=1)
print(data_concated.head())
print(data_concated.columns)
# Fit the linear regression models.
# Model 0: categorical predictors are left to the formula interface (C(...)).
formula_0 = 'price ~ C(dist) + C(style_new) + C(floor) + subway + school + AREA'
lm_0 = ols(formula_0, data=data_sampled).fit()
print('不对分类型⾃变量进⾏哑变量处理:')
print(lm_0.summary())
print((formula_0 + ' 线性回归模型的R2值: {}').format(lm_0.rsquared))
print('================================================')

# Model 1: the same predictors, using the explicit dummy columns.
formula_1 = ('price ~ dist_东城区 + dist_丰台区 + dist_朝阳区 + dist_海淀区 + dist_西城区 + '
             'floor_middle + floor_low + style_new + subway + school + AREA')
lm_1 = ols(formula_1, data=data_concated).fit()
print('对分类型⾃变量进⾏哑变量处理:')
print(lm_1.summary())
print((formula_1 + ' 线性回归模型的R2值: {}').format(lm_1.rsquared))
print('================================================')

# Diagnostic plot for model 1: residuals against fitted values.
data_concated['predict_1'] = lm_1.predict(data_concated)
data_concated['resid_1'] = lm_1.resid
data_concated.plot(x='predict_1', y='resid_1', kind='scatter')
plt.show()
# Author's reading of the diagnostic scatter plot above: the residuals show
# heteroscedasticity. Therefore take the log of price and of the continuous
# predictor AREA and refit.
data_concated['price_ln'] = np.log(data_concated['price'])
data_concated['AREA_ln'] = np.log(data_concated['AREA'])
formula_2 = ('price_ln ~ dist_东城区 + dist_丰台区 + dist_朝阳区 + dist_海淀区 + dist_西城区 + '
             'floor_middle + floor_low + style_new + subway + school + AREA_ln')
lm_2 = ols(formula_2, data=data_concated).fit()
print('对price取对数对连续型⾃变量AREA取对数对分类型⾃变量进⾏哑变量处理:')
print(lm_2.summary())
print((formula_2 + ' 线性回归模型的R2值: {}').format(lm_2.rsquared))

# Diagnostic plot for the log model: residuals against fitted values.
# Author's reading: the heteroscedasticity is removed.
data_concated['predict_2'] = lm_2.predict(data_concated)
data_concated['resid_2'] = lm_2.resid
data_concated.plot(x='predict_2', y='resid_2', kind='scatter')
plt.show()
# In Shijingshan district the mean price of school-district homes is lower
# than that of other homes, so look at the interaction between district
# (dist) and school district (school).
import seaborn as sns
# pd.pivot_table(data, index='dist', columns='school', values='price', aggfunc='mean')
sns.barplot(x='dist', y='price', hue='school', data=data)
plt.show()

# Mean price of non-school-district vs. school-district homes in Shijingshan
print('⽯景⼭区⾮学区房 : {:.2f}万元/每平⽅⽶, ⽯景⼭区学区房 : {:.2f}万元/每平⽅⽶'.format(
    data[(data['dist'] == '⽯景⼭区') & (data['school'] == 0)]['price'].mean(),
    data[(data['dist'] == '⽯景⼭区') & (data['school'] == 1)]['price'].mean()))

# The same comparison for the other five districts
dists = ['丰台区', '海淀区', '西城区', '东城区', '朝阳区']
for i in dists:
    print('{}⾮学区房 : {:.2f}万元/每平⽅⽶, {}学区房 : {:.2f}万元/每平⽅⽶'.format(
        i, data[(data['dist'] == i) & (data['school'] == 0)]['price'].mean(),
        i, data[(data['dist'] == i) & (data['school'] == 1)]['price'].mean()))

# Compare the number of non-school-district and school-district homes in
# Shijingshan.
sch0_account = data[(data['dist'] == '⽯景⼭区') & (data['school'] == 0)].shape[0]
sch1_account = data[(data['dist'] == '⽯景⼭区') & (data['school'] == 1)].shape[0]
sch_total = sch0_account + sch1_account
# Guard against a data set without any Shijingshan rows.
sch_ratio = sch1_account / sch_total if sch_total else float('nan')
# sch_ratio is the share of school-district homes, so the label says so
# (the source labelled it as the non-school-district share).
print('⽯景⼭区⾮学区房数量 : {}, ⽯景⼭区学区房数量 : {}, 学区房占⽐: {:.4f}%'.format(sch0_account, sch1_account, sch_ratio*100))
# Mean price per district, split into non-school-district (school == 0) and
# school-district (school == 1) homes, collected into one small table.
dists = ['⽯景⼭区', '丰台区', '朝阳区', '海淀区', '东城区', '西城区']
sch_0 = [data[(data['dist'] == i) & (data['school'] == 0)]['price'].mean()
         for i in dists]
sch_1 = [data[(data['dist'] == i) & (data['school'] == 1)]['price'].mean()
         for i in dists]
df = pd.DataFrame({'dist': dists, 'no_school': sch_0, 'school': sch_1})
print(df)

# Grouped bar chart: x1 holds the bar positions of the first series, x2 the
# positions shifted by one bar width (0.3) for the second series.
df1 = df['no_school'].values
df2 = df['school'].values
plt.figure(figsize=(10, 6))
x1 = range(0, len(df))
x2 = [i + 0.3 for i in x1]
plt.bar(x1, df1, color='r', width=0.3, alpha=0.6, label='⾮学区房')
# NOTE(review): the source is truncated here. The bars for the second series
# (x2 / df2), the legend and plt.show() are not visible and were not
# reconstructed -- restore them from the original script.

本文发布于:2023-07-10 01:10:23,感谢您对本站的认可!

本文链接:https://www.wtabcd.cn/fanwen/fan/89/1075053.html

版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。

标签:房价   变量   学区   模型   分析   影响   分层   抽样
相关文章
留言与评论(共有 0 条评论)
   
验证码:
推荐文章
排行榜
Copyright ©2019-2022 Comsenz Inc.Powered by © 专利检索| 网站地图