#!/usr/bin/env python
# coding: utf-8
# Introduction to machine learning: house price prediction (linear regression)
# In[1]:
# 1. Define the problem
# 2. Load the data
# Import libraries
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler

# Notebook-only magic: render matplotlib figures inline.
get_ipython().run_line_magic('matplotlib', 'inline')
# Suppress all warnings for the rest of the session.
warnings.filterwarnings('ignore')
# Show every column when a DataFrame is displayed
pd.set_option('display.max_columns', None)
# Load the training and test sets
train_data = pd.read_csv('../data/train.csv')
test_data = pd.read_csv('../data/test.csv')
# In[2]:
# 3. Understand the data
# Summary of columns, non-null counts and dtypes
train_data.info()
# In[3]:
# Dimensions of the data (rows, columns)
train_data.shape
# In[4]:
# First 5 rows
train_data.head(5)
# In[5]:
# Descriptive statistics, transposed so that each column becomes a row
train_data.describe().T
# In[6]:
# 4. Data visualisation
# Analyse SalePrice: summary statistics, then its distribution
train_data['SalePrice'].describe()
# NOTE(review): distplot is deprecated in recent seaborn releases; histplot
# with kde=True is the suggested replacement once the seaborn version is known.
sns.distplot(train_data['SalePrice'])
plt.show()
# In[7]:
# Correlation matrix
# Restrict to numeric columns first: newer pandas raises on non-numeric
# columns in corr(), while older versions silently dropped them.
corr = train_data.select_dtypes(include=[np.number]).corr()
f, ax = plt.subplots(figsize=(20, 9))
# Fix the colour scale to the full correlation range [-1, 1]
sns.heatmap(corr, vmax=1, vmin=-1, square=True)
# In[8]:
# Feature selection
# Keep the rows whose absolute correlation with SalePrice exceeds 0.5
# (the original note counts ten such features on this dataset).
saleprice_corr = train_data.select_dtypes(include=[np.number]).corr()
# Take abs() of the coefficients so strong negative correlations are kept too.
saleprice_corr[saleprice_corr['SalePrice'].abs() > 0.5]
# In[9]:
# The ten selected feature columns, followed by the target (SalePrice) last.
cols = [
    'OverallQual',
    'YearBuilt',
    'YearRemodAdd',
    'TotalBsmtSF',
    '1stFlrSF',
    'GrLivArea',
    'FullBath',
    'TotRmsAbvGrd',
    'GarageCars',
    'GarageArea',
    'SalePrice',
]
# Narrow the training frame to those columns, in that order.
train_data = train_data[cols]
train_data.info()
# In[10]:
# 5. Build the model
# Split the data: the first ten columns are the features, the eleventh
# (index 10) is the SalePrice target.
X = train_data.values[:, 0:10]
Y = train_data.values[:, 10]
# Hold out 33% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
# Create the model
model = LinearRegression()
# Fit on the training split, then predict the held-out split
model.fit(X_train, Y_train)
y_pred = model.predict(X_test)
# Reported cost is the mean absolute error over the held-out rows.
print('cost:' + str(np.sum(abs(y_pred - Y_test) / len(y_pred))))
# In[11]:
# The cost on the raw data is large, so standardise the features and the
# target and fit a second model on the standardised values.
X_scaled = StandardScaler().fit_transform(X)
# StandardScaler expects 2-D input, so turn the 1-D target into one column.
Y_scaled = StandardScaler().fit_transform(Y.reshape(-1, 1))
X_scaled_train, X_scaled_test, Y_scaled_train, Y_scaled_test = train_test_split(
    X_scaled, Y_scaled, test_size=0.33, random_state=42
)
model_scaled = LinearRegression()
model_scaled.fit(X_scaled_train, Y_scaled_train)
# Predict with the model trained on standardised data. Using the unscaled
# `model` here feeds it inputs on the wrong scale, and its 1-D output
# broadcasts against the (n, 1) target into an (n, n) matrix, inflating the
# reported cost.
y_pred = model_scaled.predict(X_scaled_test)
# This cost is in units of the standardised target, so it is not directly
# comparable with the cost computed on the raw SalePrice values.
print('cost:' + str(np.sum(abs(y_pred - Y_scaled_test) / len(y_pred))))
# In[12]:
# Add a placeholder target column so the test frame can be indexed by `cols`.
test_data['SalePrice'] = None
# Copy the selection so the fills below write to an independent frame rather
# than to a slice of the original.
test_data = test_data[cols].copy()
# Fill missing values with each column's median. Assign the result back
# instead of calling fillna(inplace=True) on a column, which is a chained
# assignment and may leave the frame unchanged.
for column in ('TotalBsmtSF', 'GarageCars', 'GarageArea'):
    test_data[column] = test_data[column].fillna(test_data[column].median())
# In[13]:
# Predict SalePrice for the test set: take the first ten columns as features,
# run them through `model`, and store the predictions in the SalePrice column.
X = test_data.to_numpy()[:, :10]
y_test_pre = model.predict(X)
test_data['SalePrice'] = y_test_pre
# Show the first ten rows, now including the predicted prices
test_data.iloc[:10]