Python-回归(线性回归、RFE、LASSO和岭回归+K折交叉验证)
1. 普通线性回归:使模型的预测值与真实值之间的均方误差尽可能小(即最小二乘估计法),但容易陷入过度拟合(即低偏差、高方差),后续的回归方法会通过正则化来缩减系数。
2. 普通线性回归+RFE:RFE(recursive feature elimination,递归特征消除)在消除过程中只保留no_features个最重要的特征,可以避免过度拟合;但RFE会直接舍弃一些变量,远没有下面几个方法给变量赋权重来得好。
3. L2缩减回归 - 岭回归:正则化那块采⽤L2范式,alpha越⼤,缩减幅度越⼤。岭回归⽐LASSO的预测能⼒好点,但LASSO能完成动态选择。
4. L1缩减回归 - LASSO:Least absolute shrinkage and selection operator,即最小绝对值缩减和选择算子。LASSO更偏向于稀疏的结果:如果一个结果的大多数系数被压缩为0,那么它被称为稀疏的。LASSO会把大多数系数变成0,对相互关联的变量只选择保留一个。
RFE:
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 05 19:52:39 2018
@author: Alvin AI
"""
from sklearn.datasets import load_boston
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from itertools import combinations
from sklearn.feature_selection import RFE
#载⼊数据
def get_data():
    """Load the Boston housing data set.

    Returns:
        A tuple ``(x, y)`` holding the ``'data'`` and ``'target'`` entries
        of the object returned by ``load_boston()``.
    """
    data = load_boston()
    x = data['data']
    y = data['target']
    return x, y
#建⽴模型
#让回归特征消除(RFE-recursive feature elimination)只保留no_features个最重要的特征
def build_model(x, y, no_features):
    """Fit a linear regression wrapped in recursive feature elimination.

    Args:
        x: Predictor matrix handed to ``RFE.fit``.
        y: Response values handed to ``RFE.fit``.
        no_features: Number of features RFE is asked to keep.

    Returns:
        The fitted ``RFE`` instance.
    """
    # NOTE(review): ``normalize=`` is a legacy LinearRegression argument;
    # confirm it is supported by the installed scikit-learn version.
    model = LinearRegression(normalize=True, fit_intercept=True)
    rfe_model = RFE(estimator=model, n_features_to_select=no_features)
    rfe_model.fit(x, y)
    return rfe_model
#查看模型
def view_model(model):
    """Print the coefficients and the intercept of a fitted model.

    Args:
        model: Object exposing ``coef_`` (an iterable of numbers) and
            ``intercept_`` (a single number).
    """
    print("\nmodel coefficients")
    print("===================\n")
    # coef_ provides the coefficients, intercept_ the regression constant.
    for i, coef in enumerate(model.coef_):
        print("\t coefficient %d %0.3f" % (i + 1, coef))
    print("\n\tintercept %0.3f" % (model.intercept_))
#计算均平⽅差⽤以评估模型误差
def model_worth(true_y, predicted_y):
    """Print and return the mean squared error of a set of predictions.

    Args:
        true_y: Observed response values.
        predicted_y: Predicted response values.

    Returns:
        The value returned by ``mean_squared_error(true_y, predicted_y)``.
    """
    # Compute once and reuse for both the report and the return value.
    mse = mean_squared_error(true_y, predicted_y)
    print("\t mean squared error = %0.2f" % (mse,))
    return mse
#绘制残差图
def plot_residual(y, predicted_y):
    """Show a scatter plot of residuals against predicted values.

    Args:
        y: Observed response values.
        predicted_y: Predicted response values; must support ``y - predicted_y``.
    """
    # Select figure 1 first so the labels and the points land on the same
    # figure, then clear whatever was drawn on its axes before.
    plt.figure(1)
    plt.cla()
    plt.xlabel('predicted y')
    plt.ylabel('residual')
    plt.title('residual plot')
    diff = y - predicted_y
    plt.plot(predicted_y, diff, 'go')
    plt.show()
if __name__ == "__main__":
    x, y = get_data()

    # Hold out 30% of the data, then split that hold-out part again
    # (70% dev / 30% test).
    x_train, x_test_all, y_train, y_test_all = train_test_split(
        x, y, test_size=0.3, random_state=9)
    x_dev, x_test, y_dev, y_test = train_test_split(
        x_test_all, y_test_all, test_size=0.3, random_state=9)

    # Build interaction-only polynomial features (x1*x2 style terms, no
    # pure powers such as x1^2).  The transformer is fitted on the
    # training split and reused for the other splits.
    poly_features = PolynomialFeatures(interaction_only=True)
    x_train_poly = poly_features.fit_transform(x_train)
    # The dev split is prepared here but not evaluated by this script.
    x_dev_poly = poly_features.transform(x_dev)

    # Keep the 20 most important features and report the training error.
    chosen_model = build_model(x_train_poly, y_train, 20)
    predicted_y = chosen_model.predict(x_train_poly)
    model_worth(y_train, predicted_y)

    # Report the error on the test split.
    x_test_poly = poly_features.transform(x_test)
    predicted_y = chosen_model.predict(x_test_poly)
    model_worth(y_test, predicted_y)
LASSO:
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 09 09:08:51 2018
@author: Alvin AI
"""
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt
import numpy as np
#加载数据
def get_data():
    """Return the Boston housing predictors and target.

    Returns:
        A tuple ``(x, y)`` built from the ``'data'`` and ``'target'``
        entries of ``load_boston()``.
    """
    data = load_boston()
    x = data['data']
    y = data['target']
    return x, y
#建⽴模型
def build_models(x, y):
    """Fit a Lasso model for a grid of alpha values and plot the paths.

    One model is fitted for each of 200 evenly spaced alpha values in
    ``[0, 0.5]``; the coefficient vector of every fit is collected and
    handed to ``coeff_path`` for plotting.

    Args:
        x: Predictor matrix.
        y: Response values.
    """
    alpha_range = np.linspace(0, 0.5, 200)
    # Original author's note: only scaling is wanted here, not centering.
    # NOTE(review): confirm what ``normalize=True`` does in the installed
    # scikit-learn version.
    model = Lasso(normalize=True)
    coeffiecients = []
    # Refit the same estimator once per alpha value.
    for alpha in alpha_range:
        model.set_params(alpha=alpha)
        model.fit(x, y)
        # Track the coefficients so they can be plotted afterwards.
        coeffiecients.append(model.coef_)
    # Plot how the coefficient weights change with alpha.
    coeff_path(alpha_range, coeffiecients)
#查看回归系数值
def view_model(model):
    """Print the regression coefficients and intercept of a fitted model.

    Args:
        model: Object exposing ``coef_`` (an iterable of numbers) and
            ``intercept_`` (a single number).
    """
    print("\n model coeffiecients")
    print("======================")
    for i, coef in enumerate(model.coef_):
        print("\t coefficient %d %0.3f" % (i + 1, coef))
    print("\n\t intercept %0.3f" % (model.intercept_))
#评估模型
def model_worth(true_y, predicted_y):
    """Print the mean squared error of a set of predictions.

    Args:
        true_y: Observed response values.
        predicted_y: Predicted response values.

    Returns:
        The value returned by ``mean_squared_error(true_y, predicted_y)``.
        Existing callers that ignore the result are unaffected.
    """
    mse = mean_squared_error(true_y, predicted_y)
    print("\t mean squared error = %0.2f\n" % (mse,))
    return mse
#绘制不同alpha值情况下的系数权重
def coeff_path(alpha_range, coeffiecients):
    """Plot coefficient weights against the alpha values that produced them.

    Args:
        alpha_range: Sequence of alpha values (x axis).
        coeffiecients: One coefficient vector per alpha value (y axis).
    """
    # Start from a clean slate: close every open figure first.
    plt.close('all')
    plt.cla()
    plt.figure(1)
    plt.xlabel("Alpha Values")
    plt.ylabel("coeffiecient weights for different alpha values")
    plt.plot(alpha_range, coeffiecients)
    # Tighten the axis limits around the plotted data.
    plt.axis('tight')
    plt.show()
#供主函数调用,查看保留下来的回归系数有哪些
def get_coef(x, y, alpha):
    """Return the indices of the coefficients a Lasso fit leaves non-zero.

    Args:
        x: Predictor matrix.
        y: Response values.
        alpha: Regularisation strength passed to ``Lasso``.

    Returns:
        A list with the positions of all coefficients whose absolute
        value is greater than zero.
    """
    model = Lasso(normalize=True, alpha=alpha)
    model.fit(x, y)
    coefs = model.coef_
    indices = [i for i, coef in enumerate(coefs) if abs(coef) > 0.0]
    return indices
#调用所有函数
if __name__ == "__main__":
    x, y = get_data()

    # Fit models over a range of alpha values and plot the coefficient paths.
    build_models(x, y)

    # Baseline: ordinary linear regression on every variable.
    print("\npredicting using all the variables\n")
    full_model = LinearRegression(normalize=True)
    full_model.fit(x, y)
    predicted_y = full_model.predict(x)
    model_worth(y, predicted_y)

    print("\n models at different alpha values\n")
    alpha_values = [0.22, 0.08, 0.01]
    for alpha in alpha_values:
        indices = get_coef(x, y, alpha)
        # Report how many coefficients were kept, and which ones.
        print("\t alpha = %0.2f number of variables selected = %d"
              % (alpha, len(indices)))
        print("\t attributes include  %s" % (indices,))
        # Refit a plain linear regression on the kept columns only.
        x_new = x[:, indices]
        model = LinearRegression(normalize=True)
        model.fit(x_new, y)
        predicted_y = model.predict(x_new)
        model_worth(y, predicted_y)
岭回归+交叉验证迭代器:针对数据较少的情况,把训练集划分为K份,模型在其中K-1份数据上进行训练,剩下的一份用作测试,这样就不需要单独划分dev集,这种方法也叫K折交叉验证法。
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 09 14:30:10 2018
@author: Alvin AI
"""
from sklearn.datasets import load_boston
from sklearn.cross_validation import KFold,train_test_split
from sklearn.linear_model import Ridge
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
#载⼊数据
def get_data():
    """Fetch the Boston housing data as a predictor/target pair.

    Returns:
        A tuple ``(x, y)`` taken from the ``'data'`` and ``'target'``
        entries of ``load_boston()``.
    """
    data = load_boston()
    x = data['data']
    y = data['target']
    return x, y
#构建模型
def build_model(x, y):
    """Grid-search the Ridge ``alpha`` using 5-fold cross validation.

    Thirty evenly spaced alpha values in ``[0.0015, 0.0017]`` are tried.
    The per-candidate scores and the best parameters are printed.

    Args:
        x: Predictor matrix.
        y: Response values; ``y.shape[0]`` is used as the sample count.

    Returns:
        ``grid.best_estimator_`` of the fitted ``GridSearchCV``.
    """
    # K-fold iterator over the samples, 5 folds.
    # NOTE(review): ``KFold(n, n_folds)``, ``grid_scores_`` and the
    # 'mean_squared_error' scorer look like the legacy scikit-learn API;
    # confirm against the installed version.
    kfold = KFold(y.shape[0], 5)
    model = Ridge(normalize=True)
    alpha_range = np.linspace(0.0015, 0.0017, 30)  # candidate alpha values
    grid_param = {"alpha": alpha_range}
    # GridSearchCV fits the model for every candidate parameter value;
    # ``cv`` selects the cross-validation scheme to use.
    grid = GridSearchCV(estimator=model, param_grid=grid_param, cv=kfold,
                        scoring='mean_squared_error')
    grid.fit(x, y)
    display_param_results(grid.grid_scores_)  # mean score per candidate
    print(grid.best_params_)
    return grid.best_estimator_
#查看回归系数和截距
def view_model(model):
    """Print the regression coefficients and the intercept of a model.

    Args:
        model: Object exposing ``coef_`` (an iterable of numbers) and
            ``intercept_`` (a single number).
    """
    print("\n model coeffiecients")
    print("======================\n")
    for i, coef in enumerate(model.coef_):
        print("\t coefficent %d %0.3f" % (i + 1, coef))
    print("\n\t intercept %0.3f" % (model.intercept_))
#模型评估
def model_worth(true_y, predicted_y):
    """Print and return the mean squared error of a set of predictions.

    Args:
        true_y: Observed response values.
        predicted_y: Predicted response values.

    Returns:
        The value returned by ``mean_squared_error(true_y, predicted_y)``.
    """
    # Evaluate the metric once and reuse it for printing and returning.
    mse = mean_squared_error(true_y, predicted_y)
    print("\t Mean squared error = %0.2f" % (mse,))
    return mse
#展⽰参数结果
def display_param_results(param_results):
    """Print one line per entry of a grid-search result list.

    Args:
        param_results: Iterable of indexable records where item ``[0]`` is
            printed as-is and item ``[1]`` is a numeric score whose
            absolute value is reported.

    The running counter is labelled "fold" in the output; it simply
    numbers the entries in the order received, starting at 1.
    """
    for fold, param_result in enumerate(param_results, 1):
        print("fold %d mean squared error %0.2f %s"
              % (fold, abs(param_result[1]), param_result[0]))
if __name__ == "__main__":
    x, y = get_data()

    # Split the data into a training part and a 30% test part.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.3, random_state=9)

    # Build interaction-only polynomial features.  The transformer is
    # fitted on the training split and reused for the test split.
    poly_features = PolynomialFeatures(interaction_only=True)
    x_train_poly = poly_features.fit_transform(x_train)
    x_test_poly = poly_features.transform(x_test)

    # Pick alpha by cross validation, then report the training error
    # and the fitted coefficients.
    chosen_model = build_model(x_train_poly, y_train)
    predicted_y = chosen_model.predict(x_train_poly)
    model_worth(y_train, predicted_y)
    view_model(chosen_model)

    # Report the error on the held-out test split.
    predicted_y = chosen_model.predict(x_test_poly)
    model_worth(y_test, predicted_y)