Machine Learning in Action (1): Decision Tree on the Wine Dataset
from sklearn import tree
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
wine = load_wine()
wine
wine.data
array([[1.423e+01, 1.710e+00, 2.430e+00, …, 1.040e+00, 3.920e+00,
1.065e+03],
[1.320e+01, 1.780e+00, 2.140e+00, …, 1.050e+00, 3.400e+00,
1.050e+03],
[1.316e+01, 2.360e+00, 2.670e+00, …, 1.030e+00, 3.170e+00,
1.185e+03],
…,
[1.327e+01, 4.280e+00, 2.260e+00, …, 5.900e-01, 1.560e+00,
8.350e+02],
[1.317e+01, 2.590e+00, 2.370e+00, …, 6.000e-01, 1.620e+00,
8.400e+02],
[1.413e+01, 4.100e+00, 2.740e+00, …, 6.100e-01, 1.600e+00,
5.600e+02]])
wine.target
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2])
wine.data.shape
(178, 13)
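The 178 samples are split unevenly across the three classes; a quick tally (a small added sketch, not in the original):
import numpy as np
np.bincount(wine.target)  # array([59, 71, 48]): samples per class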
import pandas as pd
Stitching the features and the target into one DataFrame gives a quick overview; the last column is the class label (a pd.concat along axis=1 reproduces this view):
pd.concat([pd.DataFrame(wine.data), pd.DataFrame(wine.target)], axis=1)
         0     1     2     3      4     5     6     7     8      9     10    11      12  0
0    14.23  1.71  2.43  15.6  127.0  2.80  3.06  0.28  2.29   5.64  1.04  3.92  1065.0  0
1    13.20  1.78  2.14  11.2  100.0  2.65  2.76  0.26  1.28   4.38  1.05  3.40  1050.0  0
2    13.16  2.36  2.67  18.6  101.0  2.80  3.24  0.30  2.81   5.68  1.03  3.17  1185.0  0
3    14.37  1.95  2.50  16.8  113.0  3.85  3.49  0.24  2.18   7.80  0.86  3.45  1480.0  0
4    13.24  2.59  2.87  21.0  118.0  2.80  2.69  0.39  1.82   4.32  1.04  2.93   735.0  0
..     ...   ...   ...   ...    ...   ...   ...   ...   ...    ...   ...   ...     ...  ..
173  13.71  5.65  2.45  20.5   95.0  1.68  0.61  0.52  1.06   7.70  0.64  1.74   740.0  2
174  13.40  3.91  2.48  23.0  102.0  1.80  0.75  0.43  1.41   7.30  0.70  1.56   750.0  2
175  13.27  4.28  2.26  20.0  120.0  1.59  0.69  0.43  1.35  10.20  0.59  1.56   835.0  2
176  13.17  2.59  2.37  20.0  120.0  1.65  0.68  0.53  1.46   9.30  0.60  1.62   840.0  2
177  14.13  4.10  2.74  24.5   96.0  2.05  0.76  0.56  1.35   9.20  0.61  1.60   560.0  2
178 rows × 14 columns
wine.feature_names
['alcohol',
 'malic_acid',
 'ash',
 'alcalinity_of_ash',
 'magnesium',
 'total_phenols',
 'flavanoids',
 'nonflavanoid_phenols',
 'proanthocyanins',
 'color_intensity',
 'hue',
 'od280/od315_of_diluted_wines',
 'proline']
wine.target_names
array(['class_0', 'class_1', 'class_2'], dtype='<U7')
train_test_split randomly partitions the data into a training set and a test set. Here the test set gets 30% of the samples and the training set 70%:
Xtrain, Xtest, Ytrain, Ytest = train_test_split(wine.data, wine.target, test_size=0.3)
Xtrain.shape
(124, 13)
Xtest.shape
(54, 13)
Ytrain
array([0, 2, 0, 0, 1, 2, 2, 1, 1, 1, 1, 0, 1, 1, 1, 0, 2, 0, 0, 2, 2, 0,
1, 0, 1, 1, 2, 1, 1, 0, 0, 0, 1, 2, 1, 0, 1, 1, 2, 2, 2, 2, 2, 1,
0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 2, 0, 1, 2, 2, 1, 1, 1, 2, 2, 1,
2, 0, 2, 2, 1, 1, 1, 0, 1, 0, 0, 0, 2, 1, 0, 2, 0, 1, 2, 1, 0, 2,
1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 2, 2, 2, 0, 1, 2, 0,
1, 0, 0, 1, 1, 0, 2, 0, 2, 2, 2, 0, 1, 0])
Ytest
array([0, 2, 1, 1, 2, 2, 1, 0, 0, 1, 0, 0, 0, 1, 1, 2, 0, 1, 1, 0, 0, 2,
2, 1, 1, 1, 0, 2, 2, 2, 1, 2, 1, 1, 0, 2, 0, 2, 2, 1, 1, 0, 2, 1,
1, 0, 1, 0, 0, 0, 0, 1, 2, 1])
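Because the split is shuffled, every run produces a different partition and therefore a slightly different score. A minimal sketch, assuming you want a reproducible, class-balanced split (random_state=420 and stratify are my additions, not in the original):
# Hedged sketch: fix the shuffle seed and stratify on the labels so each class
# keeps roughly the same proportion (59/71/48) in both halves.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(wine.data, wine.target,
                                                test_size=0.3,
                                                random_state=420,
                                                stratify=wine.target)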
clf = tree.DecisionTreeClassifier(criterion="entropy", random_state=30)
"""
entropy为“不纯度”指标。
random_state是⽤来设置分⽀中的随机模式的参数。输⼊任意整数,会长出同⼀棵树,使结果具有⼀定的稳定性。
决策树是随机的,在⾼维度时随机性会表现得⽐较明显,在低维度时随机性⼏乎没什么变化(⽐如鸢尾花数据集,只有3个特征,⽽通常只会⽤到它的三个特征)
30没有什么特殊的意义,可以换成任意数字,可以使score每⼀次都是⼀致的。
"""
clf = clf.fit(Xtrain, Ytrain)
score = clf.score(Xtest, Ytest)  # returns the mean accuracy of the predictions on the test set
score
0.9259259259259259
feature_name = ['alcohol', 'malic acid', 'ash', 'alcalinity of ash', 'magnesium',
                'total phenols', 'flavanoids', 'nonflavanoid phenols', 'proanthocyanins',
                'color intensity', 'hue', 'od280/od315 of diluted wines', 'proline']
import graphviz
dot_data = tree.export_graphviz(clf
                                ,feature_names=feature_name
                                ,class_names=["Gin", "Sherry", "Vermouth"]  # playful stand-ins for class_0/1/2
                                ,filled=True   # filled=True colors the nodes; the deeper the color, the lower the impurity
                                ,rounded=True  # rounded=True draws round-cornered boxes; without it the boxes are rectangles
                                )
graph = graphviz.Source(dot_data)
graph
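In a notebook the graph renders inline; to keep a copy on disk you can render it to a file (the filename here is my choice, not from the original):
graph.render("wine_tree")  # writes wine_tree.pdf to the working directory; graph.view() also opens it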
clf.feature_importances_
"""
Shows which features the tree actually used. Features that were never split on
have an importance of 0; every feature that was used gets an importance value.
"""
array([0.34731406, 0. , 0. , 0. , 0. ,
0. , 0.44736729, 0. , 0. , 0.11003237,
0. , 0. , 0.09528628])
[*zip(feature_name,clf.feature_importances_)]
"""
把特征名和对应的importance联系起来。根节点对于决策树的贡献永远都是最⾼的
游戏动漫学习"""
[('alcohol', 0.3473140578439304),
 ('malic acid', 0.0),
 ('ash', 0.0),
 ('alcalinity of ash', 0.0),
 ('magnesium', 0.0),
 ('total phenols', 0.0),
 ('flavanoids', 0.44736728516836155),
 ('nonflavanoid phenols', 0.0),
 ('proanthocyanins', 0.0),
 ('color intensity', 0.1100323740621365),
 ('hue', 0.0),
 ('od280/od315 of diluted wines', 0.0),
 ('proline', 0.0952862829255716)]
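To read the ranking at a glance, the pairs can be sorted by importance (a small sketch, not in the original):
ranked = sorted(zip(feature_name, clf.feature_importances_),
                key=lambda pair: pair[1], reverse=True)
for name, imp in ranked:
    if imp > 0:
        print(f"{name}: {imp:.3f}")  # only the features the tree actually used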
clf = tree.DecisionTreeClassifier(criterion="entropy"
                                  ,random_state=30
                                  ,splitter="random"
                                  # splitter="random" injects extra randomness, making the tree grow deeper and wider.
                                  # If accuracy rises with "random", keep this line; if it falls, comment the line out
                                  # and fall back to the default, splitter="best". The goal is always a higher score.
                                  )
clf = clf.fit(Xtrain, Ytrain)
score = clf.score(Xtest, Ytest)
score
0.9074074074074074
import graphviz
dot_data = tree.export_graphviz(clf
                                ,feature_names=feature_name
                                ,class_names=["Gin", "Sherry", "Vermouth"]
                                ,filled=True
                                ,rounded=True
                                )
graph = graphviz.Source(dot_data)
graph
How well does the tree fit the training set:
score_train = clf.score(Xtrain,Ytrain)
score_train
1.0
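A perfect training score next to a lower test score is the classic overfitting signature, which is what motivates the pruning parameters below; a quick side-by-side (added sketch):
print("train:", clf.score(Xtrain, Ytrain))  # 1.0, the tree memorized the training set
print("test: ", clf.score(Xtest, Ytest))    # lower, so it generalizes less well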
clf = tree.DecisionTreeClassifier(criterion="entropy"
                                  ,random_state=30
                                  ,splitter="random"
                                  ,max_depth=3
                                  # Everything below three levels is pruned away. If the score barely changes,
                                  # the pruned branches contributed little; if the score drops, we pruned too much.
                                  ,min_samples_leaf=10   # every leaf must keep at least 10 training samples
                                  ,min_samples_split=25  # a node with fewer than 25 samples will not be split
                                  )
clf = clf.fit(Xtrain, Ytrain)
dot_data = tree.export_graphviz(clf
                                ,feature_names=feature_name
                                ,class_names=["Gin", "Sherry", "Vermouth"]
                                ,filled=True
                                ,rounded=True
                                )
graph = graphviz.Source(dot_data)
graph
score = clf.score(Xtest,Ytest)
score
0.9259259259259259
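Tuning max_depth, min_samples_leaf, and min_samples_split by hand gets tedious; a cross-validated grid search does it systematically. A hedged sketch using sklearn's GridSearchCV (the grid values are my choices):
from sklearn.model_selection import GridSearchCV

param_grid = {"max_depth": [2, 3, 4, 5],
              "min_samples_leaf": [1, 5, 10],
              "min_samples_split": [2, 10, 25]}
search = GridSearchCV(tree.DecisionTreeClassifier(criterion="entropy", random_state=30),
                      param_grid, cv=5)  # 5-fold cross-validation on the training set
search.fit(Xtrain, Ytrain)
print(search.best_params_, search.best_score_)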
import matplotlib.pyplot as plt  # for plotting
test = []
for i in range(10):
    clf = tree.DecisionTreeClassifier(max_depth=i+1
                                      ,criterion="entropy"
                                      ,random_state=30
                                      ,splitter="random"
                                      )
    clf = clf.fit(Xtrain, Ytrain)
    score = clf.score(Xtest, Ytest)
    test.append(score)
plt.plot(range(1,11),test,color="red",label="max_depth")
plt.legend()
plt.show()
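Rather than eyeballing the curve, the best depth on this split can be pulled out directly (added sketch):
best_depth = test.index(max(test)) + 1  # +1 because depths start at 1, list indices at 0
print(best_depth, max(test))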
clf.apply(Xtest)  # apply returns the index of the leaf node each test sample falls into
array([30, 9, 22, 22, 9, 9, 22, 30, 26, 16, 30, 30, 26, 26, 16, 9, 30,
4, 22, 30, 26, 4, 9, 16, 22, 16, 30, 9, 9, 9, 16, 9, 9, 22,
30, 8, 30, 9, 9, 16, 16, 30, 9, 16, 16, 24, 16, 30, 30, 30, 30,
22, 9, 16], dtype=int64)
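Every value above is a leaf-node id, so equal values mean those samples landed in the same leaf; a small added sketch to tally them:
import numpy as np

leaves, counts = np.unique(clf.apply(Xtest), return_counts=True)
dict(zip(leaves, counts))  # how many test samples sit in each leaf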
clf.predict(Xtest)  # predict returns the predicted class label for each test sample
array([0, 2, 1, 1, 2, 2, 1, 0, 0, 1, 0, 0, 0, 0, 1, 2, 0, 1, 1, 0, 0, 1,
2, 1, 1, 1, 0, 2, 2, 2, 1, 2, 2, 1, 0, 1, 0, 2, 2, 1, 1, 0, 2, 1,
1, 1, 1, 0, 0, 0, 0, 1, 2, 1])
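As a sanity check, score is just the fraction of predictions that match the true labels, so it can be recomputed by hand (added sketch):
import numpy as np

acc = np.mean(clf.predict(Xtest) == Ytest)  # should equal clf.score(Xtest, Ytest)
print(acc)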
from sklearn.datasets import load_boston
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
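These imports preview the regression example that comes next. A minimal sketch of how they fit together, assuming scikit-learn < 1.2 (load_boston was removed in 1.2) and a cv value of my choosing:
# Hedged sketch: 10-fold cross-validation of a regression tree on the Boston data.
boston = load_boston()
regressor = DecisionTreeRegressor(random_state=0)
cross_val_score(regressor, boston.data, boston.target, cv=10)  # one R^2 score per fold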