Watermelon Book exercise 4.4: Gini-index decision tree, unpruned / pre-pruned / post-pruned
import operator
import csv
import numpy as np
def readDataSet(filename):
    '''
    Read the dataset.
    :param filename: data file name, in CSV format
    :return: the full dataset, the feature list, the training set and the test set, all as lists
    '''
    with open(filename) as f:
        reader = csv.reader(f)
        header_row = next(reader)
        labels = header_row[1:7]
        dataSet = []
        for line in reader:
            tempVect = line[1:]
            dataSet.append(tempVect)
    trainIndex = [1, 2, 3, 6, 7, 10, 14, 15, 16, 17]
    trainDataSet = []
    testDataSet = []
    for i in range(1, 18):
        if i in trainIndex:
            trainDataSet.append(dataSet[i - 1])
        else:
            testDataSet.append(dataSet[i - 1])
    trainDataSet.append(dataSet[3])  # add the 4th sample to the training set to match the book's result
    return dataSet, labels, trainDataSet, testDataSet
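readDataSet assumes a CSV whose first column is the sample index, followed by six attributes and the class column. A sketch of the expected layout for watermelon dataset 2.0 (the header names below are assumed from the book; check them against your own file):

# 编号, 色泽, 根蒂, 敲声, 纹理, 脐部, 触感, 好瓜
# 1,   青绿, 蜷缩, 浊响, 清晰, 凹陷, 硬滑, 是
# 2,   乌黑, 蜷缩, 沉闷, 清晰, 凹陷, 硬滑, 是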
def Gini(dataSet):
    '''
    Compute the Gini value of a dataset.
    :param dataSet: input dataset
    :return: the Gini value
    '''
    numData = len(dataSet)
    labels = {}
    for featVec in dataSet:
        label = featVec[-1]
        if label not in labels.keys():
            labels[label] = 0
        labels[label] += 1

    gini = 1
    for lab in labels.keys():
        prop = float(labels[lab]) / numData
        gini -= prop ** 2
    return gini
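Gini above implements Gini(D) = 1 - sum_k p_k**2, the probability that two samples drawn from D carry different class labels. A minimal sanity check, assuming a made-up class list (not from the watermelon CSV):

# two positive and two negative samples -> 1 - 0.5**2 - 0.5**2 = 0.5
toySet = [['青绿', '是'], ['乌黑', '是'], ['浅白', '否'], ['青绿', '否']]
print(Gini(toySet))  # 0.5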
def splitDataSet(dataSet, axis, value):
    '''
    Return the subset of samples whose attribute `axis` equals `value`,
    with that attribute column removed.
    :param dataSet: dataset
    :param axis: index of the splitting attribute
    :param value: value of the splitting attribute
    :return: the remaining dataset
    '''
    restDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            restFeatVec = featVec[:axis]
            restFeatVec.extend(featVec[axis + 1:])  # drop the split attribute column, keep the rest
            restDataSet.append(restFeatVec)
    return restDataSet
def bestFeatureSplit(dataSet):
    '''
    Choose the best splitting attribute (the one with the smallest Gini index).
    :param dataSet: dataset to split
    :return: index of the best splitting attribute
    '''
    numFeature = len(dataSet[0]) - 1
    bestGiniIndex = 10000
    bestFeature = -1
    for i in range(numFeature):
        featList = [example[i] for example in dataSet]
        uniqueValue = set(featList)
        giniIndex = 0
        for value in uniqueValue:
            subDataSet = splitDataSet(dataSet, i, value)
            prop = len(subDataSet) / float(len(dataSet))
            giniIndex += prop * Gini(subDataSet)
        if giniIndex < bestGiniIndex:
            bestGiniIndex = giniIndex
            bestFeature = i
    return bestFeature
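bestFeatureSplit evaluates, for each attribute a, the Gini index Gini_index(D, a) = sum_v |D^v| / |D| * Gini(D^v) and keeps the attribute with the smallest value. A small hand-check, assuming a made-up two-attribute set (not from the watermelon CSV):

toySet = [['蜷缩', '清晰', '是'],
          ['蜷缩', '稍糊', '是'],
          ['硬挺', '稍糊', '否'],
          ['硬挺', '模糊', '否']]
# attribute 0 separates the classes perfectly (weighted Gini 0), attribute 1 does not,
# so index 0 is returned
print(bestFeatureSplit(toySet))  # 0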
def majorClass(classList):
    '''
    Decide the class of a leaf node by majority vote.
    :param classList: class labels of the samples at the leaf node
    :return: the class assigned to the leaf node
    '''
    classCount = {}
    for vote in classList:
        if vote not in classCount.keys():
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)  # (label, count) pairs, most frequent first
    return sortedClassCount[0][0]
def decideTreePredict(decideTree, testData, labelsFull):
    '''
    Predict the class of a test sample with the decision tree.
    :param decideTree: decision tree model (nested dict)
    :param testData: one test sample
    :param labelsFull: full feature list
    :return: the predicted class label
    '''
    firstFeat = list(decideTree.keys())[0]
    secDict = decideTree[firstFeat]
    featIndex = labelsFull.index(firstFeat)
    classLabel = None

    for value in secDict.keys():
        if testData[featIndex] == value:
            if type(secDict[value]).__name__ == 'dict':
                classLabel = decideTreePredict(secDict[value], testData, labelsFull)
            else:
                classLabel = secDict[value]
    return classLabel
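A minimal usage sketch, assuming a hand-written tree dict in the same nested format that createTree produces (the tree and sample below are made up for illustration):

toyTree = {'纹理': {'清晰': '是', '模糊': '否'}}
toyLabels = ['色泽', '纹理']
sample = ['青绿', '清晰', '是']  # last element is the true label and is ignored by the predictor
print(decideTreePredict(toyTree, sample, toyLabels))  # '是'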
def prevReduceBranch(bestFeatLabel, trainDataSet, testDataSet, labelsFull):
    '''
    Pre-pruning test: count the test-set errors made if the current node is expanded on
    bestFeatLabel, each branch predicting the majority class of its training samples.
    '''
    classList = [example[-1] for example in trainDataSet]
    bestFeatIndex = labelsFull.index(bestFeatLabel)
    trainDataValues = [example[bestFeatIndex] for example in trainDataSet]
    uniqueValues = set(trainDataValues)
    error = 0
    for value in uniqueValues:
        partClassList = [classList[i] for i in range(len(classList)) if trainDataValues[i] == value]
        major = majorClass(partClassList)
        for data in testDataSet:
            if data[bestFeatIndex] == value and data[-1] != major:
                error += 1
    # print('errors if the node is expanded (pre-pruning): ' + str(error))
    return error
def majorTest(major, testData):
    '''
    Count the test-set errors made if the current node becomes a leaf predicting `major`.
    '''
    error = 0
    for i in range(len(testData)):
        if major != testData[i][-1]:
            error += 1
    # print('errors if the node becomes a leaf: ' + str(error))
    return error
def postReduceBranch(subTree, testData, labelsFull):
    '''
    Post-pruning test: count the test-set errors made if the subtree is kept.
    '''
    error = 0
    for i in range(len(testData)):
        if decideTreePredict(subTree, testData[i], labelsFull) != testData[i][-1]:
            error += 1
    # print('errors if the subtree is kept (post-pruning): ' + str(error))
    return error
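These three helpers drive the pruning decisions inside createTree: a node is expanded (pre-pruning), or a finished subtree is kept (post-pruning), only if that choice makes no more test-set errors than collapsing the node into a majority-class leaf. A small self-contained hand-check of the post-pruning comparison, with a made-up subtree and test set:

toyTree = {'纹理': {'清晰': '是', '模糊': '否'}}
toyLabels = ['纹理']
toyTest = [['清晰', '是'], ['清晰', '是'], ['模糊', '否']]
keepErr = postReduceBranch(toyTree, toyTest, toyLabels)                 # 0 errors if the subtree is kept
leafErr = majorTest(majorClass([row[-1] for row in toyTest]), toyTest)  # 1 error if collapsed to the majority leaf '是'
print(keepErr, leafErr)  # 0 1 -> keepErr <= leafErr, so the subtree is kept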
def createTree(trainDataSet, labels, dataSetFull, labelsFull, testDataSet):
    '''
    Recursively build the decision tree.
    :param trainDataSet: training data list
    :param labels: feature list
    :param dataSetFull: full data list, passed along unchanged
    :param labelsFull: full feature list, passed along unchanged
    :param testDataSet: test data list
    :return: the decision tree as a nested dict
    '''
    classList = [example[-1] for example in trainDataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    if len(trainDataSet[0]) == 1:
        return majorClass(classList)
    bestFeat = bestFeatureSplit(trainDataSet)
    bestFeatLabel = labels[bestFeat]
    # Pre-pruning: expand this node only if expanding makes fewer test-set errors
    # than turning it into a majority-class leaf.
    # if prevReduceBranch(bestFeatLabel, trainDataSet, testDataSet, labelsFull) < majorTest(majorClass(classList), testDataSet):
    #     myTree = {bestFeatLabel: {}}
    # else:
    #     return majorClass(classList)
    myTree = {bestFeatLabel: {}}
    del labels[bestFeat]
    featValues = [example[bestFeat] for example in trainDataSet]
    uniqueVal = set(featValues)
    # Collect every value of this attribute over the full dataset, so that values missing
    # from the current subset (e.g. 色泽=浅白 in watermelon dataset 2.0) are not left out.
    bestFeatIndex = labelsFull.index(bestFeatLabel)
    featValuesFull = [example[bestFeatIndex] for example in dataSetFull]
    uniqueValFull = set(featValuesFull)
    if uniqueVal == uniqueValFull:
        for value in uniqueVal:
            subLabels = labels[:]  # copy the labels so they are still intact when the recursion backtracks
            myTree[bestFeatLabel][value] = createTree(splitDataSet(trainDataSet, bestFeat, value),
                                                      subLabels, dataSetFull, labelsFull,
                                                      splitDataSet(testDataSet, bestFeat, value))
    else:
        for value in uniqueVal:
            subLabels = labels[:]  # copy the labels so they are still intact when the recursion backtracks
            myTree[bestFeatLabel][value] = createTree(splitDataSet(trainDataSet, bestFeat, value),
                                                      subLabels, dataSetFull, labelsFull,
                                                      splitDataSet(testDataSet, bestFeat, value))
        for value in uniqueValFull:
            if value not in uniqueVal:  # attribute values unseen in this subset become majority-class leaves
                myTree[bestFeatLabel][value] = majorClass(classList)
    return myTree
    # Post-pruning: when enabled, use the check below in place of the plain `return myTree` above,
    # keeping the finished subtree only if it makes no more test-set errors than a majority-class leaf.
    # print(myTree)
    # if postReduceBranch(myTree, testDataSet, labelsFull) <= majorTest(majorClass(classList), testDataSet):
    #     return myTree
    # else:
    #     return majorClass(classList)
if __name__ == '__main__':
    filename = 'C:\\Users\\14399\\Desktop\\西瓜2.0.csv'
    dataSet, labels, trainDataSet, testDataSet = readDataSet(filename)
    dataSetFull = trainDataSet[:]
    labelsFull = labels[:]
    myTree = createTree(trainDataSet, labels, dataSetFull, labelsFull, testDataSet)
    print(myTree)
Unpruned: {'脐部': {'凹陷': {'色泽': {'浅白': '否', '青绿': '是', '乌黑': '是'}}, '稍凹': {'根蒂': {'蜷缩': '否', '稍蜷': {'色泽': {'青绿': '是', '乌黑': {'纹理': {'稍糊': '是', '清晰': '否', '模糊': '是'}}, '浅白': '是'}}, '硬挺': '是'}}, '平坦': '否'}}
Pre-pruned: {'脐部': {'稍凹': '是', '平坦': '否', '凹陷': '是'}}
Post-pruned: {'脐部': {'稍凹': {'根蒂': {'蜷缩': '否', '稍蜷': {'色泽': {'乌黑': '是', '青绿': '是', '浅白': '是'}}, '硬挺': '是'}}, '凹陷': '是', '平坦': '否'}}
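As a quick check of the three trees, the test-set accuracy can be computed with decideTreePredict. A minimal sketch, assuming the variables from the __main__ block above (the accuracy helper below is not part of the original code):

def accuracy(tree, testDataSet, labelsFull):
    # fraction of test samples whose prediction matches the true label in the last column
    correct = sum(1 for row in testDataSet
                  if decideTreePredict(tree, row, labelsFull) == row[-1])
    return correct / float(len(testDataSet))

# e.g. print(accuracy(myTree, testDataSet, labelsFull))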