Machine Learning in Action (1): Decision Tree on the Wine Dataset
from sklearn import tree
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
wine = load_wine()
wine
wine.data
array([[1.423e+01, 1.710e+00, 2.430e+00, …, 1.040e+00, 3.920e+00,
1.065e+03],
[1.320e+01, 1.780e+00, 2.140e+00, …, 1.050e+00, 3.400e+00,
1.050e+03],
[1.316e+01, 2.360e+00, 2.670e+00, …, 1.030e+00, 3.170e+00,
1.185e+03],
…,
[1.327e+01, 4.280e+00, 2.260e+00, …, 5.900e-01, 1.560e+00,
8.350e+02],
[1.317e+01, 2.590e+00, 2.370e+00, …, 6.000e-01, 1.620e+00,
8.400e+02],
[1.413e+01, 4.100e+00, 2.740e+00, …, 6.100e-01, 1.600e+00,
5.600e+02]])
wine.target
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2])
wine.data.shape
(178, 13)
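The 178 samples are split unevenly across the three classes; a quick tally (a small added sketch, not in the original):
import numpy as np
np.bincount(wine.target)  # array([59, 71, 48]): samples per class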
import pandas as pd
Stitching the features and the target into one DataFrame gives a quick overview; the last column is the class label (a pd.concat along axis=1 reproduces this view):
pd.concat([pd.DataFrame(wine.data), pd.DataFrame(wine.target)], axis=1)
         0     1     2     3      4     5     6     7     8      9     10    11      12  0
0    14.23  1.71  2.43  15.6  127.0  2.80  3.06  0.28  2.29   5.64  1.04  3.92  1065.0  0
1    13.20  1.78  2.14  11.2  100.0  2.65  2.76  0.26  1.28   4.38  1.05  3.40  1050.0  0
2    13.16  2.36  2.67  18.6  101.0  2.80  3.24  0.30  2.81   5.68  1.03  3.17  1185.0  0
3    14.37  1.95  2.50  16.8  113.0  3.85  3.49  0.24  2.18   7.80  0.86  3.45  1480.0  0
4    13.24  2.59  2.87  21.0  118.0  2.80  2.69  0.39  1.82   4.32  1.04  2.93   735.0  0
..     ...   ...   ...   ...    ...   ...   ...   ...   ...    ...   ...   ...     ...  ..
173  13.71  5.65  2.45  20.5   95.0  1.68  0.61  0.52  1.06   7.70  0.64  1.74   740.0  2
174  13.40  3.91  2.48  23.0  102.0  1.80  0.75  0.43  1.41   7.30  0.70  1.56   750.0  2
175  13.27  4.28  2.26  20.0  120.0  1.59  0.69  0.43  1.35  10.20  0.59  1.56   835.0  2
176  13.17  2.59  2.37  20.0  120.0  1.65  0.68  0.53  1.46   9.30  0.60  1.62   840.0  2
177  14.13  4.10  2.74  24.5   96.0  2.05  0.76  0.56  1.35   9.20  0.61  1.60   560.0  2
178 rows × 14 columns
wine.feature_names
['alcohol',
 'malic_acid',
 'ash',
 'alcalinity_of_ash',
 'magnesium',
 'total_phenols',
 'flavanoids',
 'nonflavanoid_phenols',
 'proanthocyanins',
 'color_intensity',
 'hue',
 'od280/od315_of_diluted_wines',
 'proline']
wine.target_names
array(['class_0', 'class_1', 'class_2'], dtype='<U7')
train_test_split randomly partitions the data into a training set and a test set. Here the test set gets 30% of the samples and the training set 70%:
Xtrain, Xtest, Ytrain, Ytest = train_test_split(wine.data, wine.target, test_size=0.3)
Xtrain.shape
(124, 13)
Xtest.shape
(54, 13)
Ytrain
array([0, 2, 0, 0, 1, 2, 2, 1, 1, 1, 1, 0, 1, 1, 1, 0, 2, 0, 0, 2, 2, 0,
1, 0, 1, 1, 2, 1, 1, 0, 0, 0, 1, 2, 1, 0, 1, 1, 2, 2, 2, 2, 2, 1,
0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 2, 0, 1, 2, 2, 1, 1, 1, 2, 2, 1,
2, 0, 2, 2, 1, 1, 1, 0, 1, 0, 0, 0, 2, 1, 0, 2, 0, 1, 2, 1, 0, 2,
1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 2, 2, 2, 0, 1, 2, 0,
1, 0, 0, 1, 1, 0, 2, 0, 2, 2, 2, 0, 1, 0])
Ytest
array([0, 2, 1, 1, 2, 2, 1, 0, 0, 1, 0, 0, 0, 1, 1, 2, 0, 1, 1, 0, 0, 2,
2, 1, 1, 1, 0, 2, 2, 2, 1, 2, 1, 1, 0, 2, 0, 2, 2, 1, 1, 0, 2, 1,
1, 0, 1, 0, 0, 0, 0, 1, 2, 1])
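Because the split is shuffled, every run produces a different partition and therefore a slightly different score. A minimal sketch, assuming you want a reproducible, class-balanced split (random_state=420 and stratify are my additions, not in the original):
# Hedged sketch: fix the shuffle seed and stratify on the labels so each class
# keeps roughly the same proportion (59/71/48) in both halves.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(wine.data, wine.target,
                                                test_size=0.3,
                                                random_state=420,
                                                stratify=wine.target)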
clf = tree.DecisionTreeClassifier(criterion="entropy", random_state=30)
"""
entropy为“不纯度”指标。
random_state是⽤来设置分⽀中的随机模式的参数。输⼊任意整数,会长出同⼀棵树,使结果具有⼀定的稳定性。
决策树是随机的,在⾼维度时随机性会表现得⽐较明显,在低维度时随机性⼏乎没什么变化(⽐如鸢尾花数据集,只有3个特征,⽽通常只会⽤到它的三个特征)
30没有什么特殊的意义,可以换成任意数字,可以使score每⼀次都是⼀致的。
"""
clf = clf.fit(Xtrain, Ytrain)
score = clf.score(Xtest, Ytest)  # returns the mean accuracy of the predictions on the test set
score
0.9259259259259259
feature_name = ['alcohol', 'malic acid', 'ash', 'alcalinity of ash', 'magnesium',
                'total phenols', 'flavanoids', 'nonflavanoid phenols', 'proanthocyanins',
                'color intensity', 'hue', 'od280/od315 of diluted wines', 'proline']
import graphviz
dot_data = tree.export_graphviz(clf
                                ,feature_names=feature_name
                                ,class_names=["Gin", "Sherry", "Vermouth"]  # playful stand-ins for class_0/1/2
                                ,filled=True   # filled=True colors the nodes; the deeper the color, the lower the impurity
                                ,rounded=True  # rounded=True draws round-cornered boxes; without it the boxes are rectangles
                                )
graph = graphviz.Source(dot_data)
graph
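In a notebook the graph renders inline; to keep a copy on disk you can render it to a file (the filename here is my choice, not from the original):
graph.render("wine_tree")  # writes wine_tree.pdf to the working directory; graph.view() also opens it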
clf.feature_importances_
"""
Shows which features the tree actually used. Features that were never split on
have an importance of 0; every feature that was used gets an importance value.
"""
array([0.34731406, 0. , 0. , 0. , 0. ,
0. , 0.44736729, 0. , 0. , 0.11003237,
0. , 0. , 0.09528628])
[*zip(feature_name,clf.feature_importances_)]
"""
把特征名和对应的importance联系起来。根节点对于决策树的贡献永远都是最⾼的
游戏动漫学习"""
[('alcohol', 0.3473140578439304),
 ('malic acid', 0.0),
 ('ash', 0.0),
 ('alcalinity of ash', 0.0),
 ('magnesium', 0.0),
 ('total phenols', 0.0),
 ('flavanoids', 0.44736728516836155),
 ('nonflavanoid phenols', 0.0),
 ('proanthocyanins', 0.0),
 ('color intensity', 0.1100323740621365),
 ('hue', 0.0),
 ('od280/od315 of diluted wines', 0.0),
 ('proline', 0.0952862829255716)]
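To read the ranking at a glance, the pairs can be sorted by importance (a small sketch, not in the original):
ranked = sorted(zip(feature_name, clf.feature_importances_),
                key=lambda pair: pair[1], reverse=True)
for name, imp in ranked:
    if imp > 0:
        print(f"{name}: {imp:.3f}")  # only the features the tree actually used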
clf = tree.DecisionTreeClassifier(criterion="entropy"
                                  ,random_state=30
                                  ,splitter="random"
                                  # splitter="random" injects extra randomness, making the tree grow deeper and wider.
                                  # If accuracy rises with "random", keep this line; if it falls, comment the line out
                                  # and fall back to the default, splitter="best". The goal is always a higher score.
                                  )
clf = clf.fit(Xtrain, Ytrain)
score = clf.score(Xtest, Ytest)
score
0.9074074074074074
import graphviz
dot_data = tree.export_graphviz(clf
                                ,feature_names=feature_name
                                ,class_names=["Gin", "Sherry", "Vermouth"]
                                ,filled=True
                                ,rounded=True
                                )
graph = graphviz.Source(dot_data)
graph
How well does the tree fit the training set:
score_train = clf.score(Xtrain,Ytrain)
score_train
1.0
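A perfect training score next to a lower test score is the classic overfitting signature, which is what motivates the pruning parameters below; a quick side-by-side (added sketch):
print("train:", clf.score(Xtrain, Ytrain))  # 1.0, the tree memorized the training set
print("test: ", clf.score(Xtest, Ytest))    # lower, so it generalizes less well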
clf = tree.DecisionTreeClassifier(criterion="entropy"
                                  ,random_state=30
                                  ,splitter="random"
                                  ,max_depth=3
                                  # Everything below three levels is pruned away. If the score barely changes,
                                  # the pruned branches contributed little; if the score drops, we pruned too much.
                                  ,min_samples_leaf=10   # every leaf must keep at least 10 training samples
                                  ,min_samples_split=25  # a node with fewer than 25 samples will not be split
                                  )
clf = clf.fit(Xtrain, Ytrain)
dot_data = tree.export_graphviz(clf
                                ,feature_names=feature_name
                                ,class_names=["Gin", "Sherry", "Vermouth"]
                                ,filled=True
                                ,rounded=True
                                )
graph = graphviz.Source(dot_data)
graph
score = clf.score(Xtest,Ytest)
score
0.9259259259259259
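Tuning max_depth, min_samples_leaf, and min_samples_split by hand gets tedious; a cross-validated grid search does it systematically. A hedged sketch using sklearn's GridSearchCV (the grid values are my choices):
from sklearn.model_selection import GridSearchCV

param_grid = {"max_depth": [2, 3, 4, 5],
              "min_samples_leaf": [1, 5, 10],
              "min_samples_split": [2, 10, 25]}
search = GridSearchCV(tree.DecisionTreeClassifier(criterion="entropy", random_state=30),
                      param_grid, cv=5)  # 5-fold cross-validation on the training set
search.fit(Xtrain, Ytrain)
print(search.best_params_, search.best_score_)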
import matplotlib.pyplot as plt  # for plotting
test = []
for i in range(10):
    clf = tree.DecisionTreeClassifier(max_depth=i+1
                                      ,criterion="entropy"
                                      ,random_state=30
                                      ,splitter="random"
                                      )
    clf = clf.fit(Xtrain, Ytrain)
    score = clf.score(Xtest, Ytest)
    test.append(score)
plt.plot(range(1,11),test,color="red",label="max_depth")
plt.legend()
plt.show()
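Rather than eyeballing the curve, the best depth on this split can be pulled out directly (added sketch):
best_depth = test.index(max(test)) + 1  # +1 because depths start at 1, list indices at 0
print(best_depth, max(test))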
clf.apply(Xtest)  # apply returns the index of the leaf node each test sample falls into
array([30, 9, 22, 22, 9, 9, 22, 30, 26, 16, 30, 30, 26, 26, 16, 9, 30,
4, 22, 30, 26, 4, 9, 16, 22, 16, 30, 9, 9, 9, 16, 9, 9, 22,
30, 8, 30, 9, 9, 16, 16, 30, 9, 16, 16, 24, 16, 30, 30, 30, 30,
22, 9, 16], dtype=int64)
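Every value above is a leaf-node id, so equal values mean those samples landed in the same leaf; a small added sketch to tally them:
import numpy as np

leaves, counts = np.unique(clf.apply(Xtest), return_counts=True)
dict(zip(leaves, counts))  # how many test samples sit in each leaf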
clf.predict(Xtest)  # predict returns the predicted class label for each test sample
array([0, 2, 1, 1, 2, 2, 1, 0, 0, 1, 0, 0, 0, 0, 1, 2, 0, 1, 1, 0, 0, 1,
2, 1, 1, 1, 0, 2, 2, 2, 1, 2, 2, 1, 0, 1, 0, 2, 2, 1, 1, 0, 2, 1,
1, 1, 1, 0, 0, 0, 0, 1, 2, 1])
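As a sanity check, score is just the fraction of predictions that match the true labels, so it can be recomputed by hand (added sketch):
import numpy as np

acc = np.mean(clf.predict(Xtest) == Ytest)  # should equal clf.score(Xtest, Ytest)
print(acc)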
from sklearn.datasets import load_boston
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
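These imports preview the regression example that comes next. A minimal sketch of how they fit together, assuming scikit-learn < 1.2 (load_boston was removed in 1.2) and a cv value of my choosing:
# Hedged sketch: 10-fold cross-validation of a regression tree on the Boston data.
boston = load_boston()
regressor = DecisionTreeRegressor(random_state=0)
cross_val_score(regressor, boston.data, boston.target, cv=10)  # one R^2 score per fold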