# 决策树的建立步骤(西瓜书例题)
# 树的建立步骤:引入包、数据的获取与处理、获取名称与类别标记——>选择样本最多的类作为类别标记——>重点来了,计算信息熵——>子数据集构建——>计算信息增益——>选择最优属性——>建立决策树。这些步骤搞懂了,基本就理解了决策树的原理。
import pandas as pd
import numpy as np
from collections import Counter
from math import log2
# In[56]:
# Data loading and preprocessing
def getData(file_Path):
    """Read the data set from an Excel workbook.

    Args:
        file_Path: Path of the Excel file to load.

    Returns:
        The ``pandas.DataFrame`` returned by ``pd.read_excel``.
    """
    # Use the caller's path: a hard-coded, machine-specific location would
    # silently override the argument and break on any other machine.
    return pd.read_excel(file_Path)
def dataDeal(data):
    """Turn the loaded table into a plain list of sample rows.

    Args:
        data: Tabular data accepted by ``np.array`` (the driver passes the
            DataFrame returned by ``getData``).

    Returns:
        A list of lists, one per row, with the first column removed
        (presumably the sample-id column -- verify against the workbook).
    """
    rows = np.array(data).tolist()  # matrix -> nested Python lists
    return [row[1:] for row in rows]
# In[57]:
# Attribute names
def getLabels(data):
    """Return the attribute (feature) column names of the table.

    Args:
        data: Object exposing a ``columns`` attribute (a DataFrame in the
            driver script).

    Returns:
        A list of the column names with the first and the last column
        dropped, matching the row layout produced by ``dataDeal`` minus
        the trailing class column.
    """
    return list(data.columns)[1:-1]
# In[58]:
# Class labels
def targetClass(dataSet):
    """Collect the distinct class labels present in the data set.

    Args:
        dataSet: Iterable of rows whose last element is the class label.

    Returns:
        A ``set`` of the class labels (duplicates removed).
    """
    return {row[-1] for row in dataSet}
# In[59]:
# Turn a branch node into a leaf labelled with the most frequent class
def majorityRule(dataSet):
    """Return the class label that occurs most often in the data set.

    Args:
        dataSet: Non-empty iterable of rows whose last element is the
            class label.

    Returns:
        The most common class label.

    Raises:
        IndexError: If ``dataSet`` is empty.
    """
    label_counts = Counter(row[-1] for row in dataSet)
    return label_counts.most_common(1)[0][0]
# In[60]:
# Information entropy
def infoEntropy(dataSet):
    """Compute the Shannon entropy (base 2) of the class-label column.

    Args:
        dataSet: Sequence of rows whose last element is the class label.

    Returns:
        ``-sum(p_k * log2(p_k))`` over the distinct class labels, where
        ``p_k`` is the fraction of rows carrying label ``k``. An empty
        data set yields ``0.0``.
    """
    total = len(dataSet)
    label_counts = Counter(row[-1] for row in dataSet)
    entropy = 0.0
    for count in label_counts.values():
        p_k = count / total
        # Each class contributes exactly once to the sum.
        entropy -= p_k * log2(p_k)
    return entropy
# In[61]:
# Sub data set construction
def makeAttributeData(dataSet, value, iColumn):
    """Select the rows matching an attribute value and drop that attribute.

    Args:
        dataSet: Iterable of rows (lists) whose last element is the class
            label.
        value: Attribute value to keep.
        iColumn: Index of the attribute column being split on.

    Returns:
        A new list with one row per matching sample. Each row keeps every
        column except ``iColumn``, so the class label stays in last
        position and the column indexes line up with a label list that has
        had the same attribute removed.
    """
    return [
        row[:iColumn] + row[iColumn + 1:]
        for row in dataSet
        if row[iColumn] == value
    ]
# In[62]:
# Information gain
def infoGain(dataSet, iColumn):
    """Compute the information gain of splitting on one attribute column.

    Args:
        dataSet: Sequence of rows whose last element is the class label.
        iColumn: Index of the attribute column to evaluate.

    Returns:
        The entropy of ``dataSet`` minus the size-weighted entropy of the
        subsets obtained by partitioning on each distinct value found in
        column ``iColumn``.
    """
    total = len(dataSet)
    distinct_values = {row[iColumn] for row in dataSet}
    weighted_entropy = 0.0
    for value in distinct_values:
        subset = makeAttributeData(dataSet, value, iColumn)
        weighted_entropy += len(subset) / total * infoEntropy(subset)
    return infoEntropy(dataSet) - weighted_entropy
# In[63]:
# Optimal attribute selection
def lectOptimalAttribute(dataSet, labels):
    """Pick the attribute column with the largest information gain.

    Prints each attribute name together with its gain as it is evaluated.

    Args:
        dataSet: Sequence of rows whose last element is the class label.
        labels: Attribute names; ``labels[i]`` names column ``i``. The
            class column is not part of ``labels`` and is never evaluated.

    Returns:
        The index of the best attribute. Index ``0`` is returned when no
        attribute has a strictly positive gain.
    """
    best_gain = 0
    best_index = 0
    for iColumn, label in enumerate(labels):
        gain = infoGain(dataSet, iColumn)
        if gain > best_gain:
            best_gain, best_index = gain, iColumn
        print(label, gain)
    return best_index
# In[64]:
# Decision tree construction
def createTree(dataSet, labels):
    """Recursively build an ID3 decision tree as nested dictionaries.

    Prints the current labels, the partial tree and each branch value
    while building.

    Args:
        dataSet: Sequence of rows whose last element is the class label.
        labels: Attribute names for the remaining columns, in column
            order. The list is not modified.

    Returns:
        A class label when the node is a leaf, otherwise a dict of the
        form ``{attribute_name: {attribute_value: subtree, ...}}``.
    """
    classification = targetClass(dataSet)  # distinct class labels
    if len(classification) == 1:
        # All samples agree: the node is a pure leaf.
        return next(iter(classification))
    if not labels:
        # ``labels`` excludes the class column, so the attributes are only
        # exhausted once the list is empty; fall back to the majority class.
        return majorityRule(dataSet)

    best_index = lectOptimalAttribute(dataSet, labels)
    print(labels)
    optimalAttribute = labels[best_index]
    # Build a new list rather than deleting in place, so the caller's
    # ``labels`` is left untouched.
    remaining_labels = labels[:best_index] + labels[best_index + 1:]

    myTree = {optimalAttribute: {}}
    for value in {row[best_index] for row in dataSet}:
        print(myTree)
        print(value)
        subset = makeAttributeData(dataSet, value, best_index)
        myTree[optimalAttribute][value] = createTree(subset, remaining_labels)
    return myTree
# In[67]:
# Driver: load the workbook, derive the sample rows and attribute names,
# build the decision tree and print it.
filePath = 'watermelonData.xls'
data = getData(filePath)
dataSet = dataDeal(data)  # sample rows built from the loaded table
labels = getLabels(data)  # attribute names taken from the table's columns
myTree = createTree(dataSet, labels)
print(myTree)