机器学习实战-随机森林⼆分类问题
随机森林
概论
前提
Random Forest:可以理解为Bagging with CARTs(以CART决策树为基学习器的Bagging)。
Bagging是bootstrap aggregating(引导聚集算法)的缩写。
CART(classification and regression Tree)分类和回归树,⼆分类树。
这⾥涉及到集成式学习的概念,集成学习可以分为Bagging和Boosting.
Bagging:有放回式(自助)采样,组合多个弱分类器,采⽤少数服从多数的投票机制,并⾏式运算。
Boosting:⾃适应的集成学习,顺序迭代,串⾏式运算。代表算法AdaBoost(Adaptive Boosting)
CART采⽤分⽽治之的策略。
回归树:采⽤分治策略,对于⽆法⽤唯⼀的全局线性回归来优化的⽬标进⾏分⽽治之,进⽽取得⽐较准确的结果。但分段后取均值并不是⼀个明智的选择,可以考虑将叶节点设置成⼀个线性函数,即分段线性模型树。
python三维向量转⼆维向量
运⾏:print(sum([[[1,2,3],[4,5,5]],[[1,2,3],[4,5,5]]],[]))
输出:[[1, 2, 3], [4, 5, 5], [1, 2, 3], [4, 5, 5]]
python中多个实参,放到⼀个元组⾥⾯,以*开头,可以传多个参数
*args:(表⽰的就是将实参中按照位置传值,多出来的值都给args,且以元组的⽅式呈现)
分类回归效果的判断指标:
Information Entropy(信息熵)、Gini Index(基尼指数)、Gini Split(基尼分割)、Misclassification Error(错误分类)
以上判断数值越⼩,模型的效果越好
Information Gain(信息增益),数值越⼤,效果越好
实战
数据集说明
【sonar-all-data.csv】
60 个输⼊变量表⽰声纳从不同⾓度返回的强度。这是⼀个⼆元分类问题(binary classification problem),要求模型能够区分出岩⽯和⾦属柱体的不同材质和形状,总共有 208 个观测样本。
附代码
# coding = utf-8
from csv import reader
from math import log
from math import sqrt
from random import randrange
from random import seed
class randomForest:
    """Random forest classifier built from scratch on CART decision trees.

    Each tree is trained on a bootstrap sample of the data and splits on a
    random subset of the features; the forest predicts by majority vote.
    Evaluation is done with k-fold cross validation.
    """

    def __init__(self):
        print('randomforest==start==')
        # Fix the RNG seed so sampling (and therefore results) is reproducible.
        seed(1)

    # ---- data loading / preprocessing ----

    def load_csv(self, filename):
        """Read a CSV file and return its rows as a list of lists of strings."""
        dataset = list()
        with open(filename, 'r') as file:
            csv_reader = reader(file)
            for row in csv_reader:
                if not row:
                    # Skip blank lines.
                    continue
                dataset.append(row)
        return dataset

    def str_column_to_float(self, dataset, column):
        """Convert one column from string to float, in place."""
        for row in dataset:
            row[column] = float(row[column].strip())

    def str_column_to_int(self, dataset, column):
        """Map the distinct string values of one column to integers, in place.

        Returns the value -> int lookup dict.
        """
        class_values = [row[column] for row in dataset]
        unique = set(class_values)
        lookup = dict()
        # enumerate() pairs each distinct value with an index.
        for i, value in enumerate(unique):
            lookup[value] = i
        for row in dataset:
            row[column] = lookup[row[column]]
        return lookup

    # ---- sampling ----

    def subsample(self, dataset, ratio):
        """Draw a bootstrap sample of size round(len(dataset) * ratio).

        Sampling is WITH replacement: some rows appear several times, some
        not at all, which gives each tree a different training set.
        """
        sample = list()
        n_sample = round(len(dataset) * ratio)
        while len(sample) < n_sample:
            index = randrange(len(dataset))
            sample.append(dataset[index])
        return sample

    # ---- CART splitting ----

    def test_split(self, index, value, dataset):
        """Partition rows by whether row[index] < value; return (left, right)."""
        left, right = list(), list()
        for row in dataset:
            if row[index] < value:
                left.append(row)
            else:
                right.append(row)
        return left, right

    def gini_index(self, groups, class_values):
        """Gini impurity of a candidate split; lower means purer groups.

        NOTE: this is the tutorial-style (unweighted) sum of p*(1-p) over
        classes and groups, not the size-weighted variant.
        """
        gini = 0.0
        for class_value in class_values:
            for group in groups:
                size = len(group)
                if size == 0:
                    continue
                # list.count(): fraction of rows in this group with this label.
                proportion = [row[-1] for row in group].count(class_value) / float(size)
                gini += (proportion * (1.0 - proportion))
        return gini

    def get_split(self, dataset, n_features):
        """Find the best split over a random subset of n_features features.

        Returns {'index': feature index, 'value': split value,
                 'groups': (left, right) row lists}.
        """
        class_values = list(set(row[-1] for row in dataset))
        b_index, b_value, b_score, b_groups = 999, 999, 999, None
        features = list()
        # Draw n_features distinct feature indices at random; NOT scanning
        # every feature is what decorrelates the trees in the forest.
        while len(features) < n_features:
            index = randrange(len(dataset[0]) - 1)
            if index not in features:
                features.append(index)
        for index in features:
            for row in dataset:
                # Try every observed value of the feature as the split point.
                groups = self.test_split(index, row[index], dataset)
                gini = self.gini_index(groups, class_values)
                if gini < b_score:
                    b_index, b_value, b_score, b_groups = index, row[index], gini, groups
        return {'index': b_index, 'value': b_value, 'groups': b_groups}

    def to_terminal(self, group):
        """Create a leaf node: the most frequent class label in the group."""
        outcomes = [row[-1] for row in group]
        return max(set(outcomes), key=outcomes.count)

    def split(self, node, max_depth, min_size, n_features, depth):
        """Recursively grow the tree below `node` until stopping criteria hit."""
        left, right = node['groups']
        del(node['groups'])
        # One side empty: no useful split, make both children the same leaf.
        if not left or not right:
            node['left'] = node['right'] = self.to_terminal(left + right)
            return
        # Depth limit reached: convert to leaves to avoid overfitting.
        # (The original was missing this `return` and kept recursing.)
        if depth >= max_depth:
            node['left'], node['right'] = self.to_terminal(left), self.to_terminal(right)
            return
        # Left child: leaf if too few rows, otherwise split recursively.
        if len(left) <= min_size:
            node['left'] = self.to_terminal(left)
        else:
            # node['left'] is itself a {'index','value','groups'} dict,
            # so the tree is a nested dict structure.
            node['left'] = self.get_split(left, n_features)
            self.split(node['left'], max_depth, min_size, n_features, depth + 1)
        # Right child, same treatment.
        if len(right) <= min_size:
            node['right'] = self.to_terminal(right)
        else:
            node['right'] = self.get_split(right, n_features)
            self.split(node['right'], max_depth, min_size, n_features, depth + 1)

    def build_tree(self, train, max_depth, min_size, n_features):
        """Build one decision tree from a training sample; return its root node."""
        root = self.get_split(train, n_features)
        self.split(root, max_depth, min_size, n_features, 1)
        return root

    # ---- prediction ----

    def predict(self, node, row):
        """Walk the tree for one row and return the predicted label."""
        if row[node['index']] < node['value']:
            if isinstance(node['left'], dict):
                return self.predict(node['left'], row)
            else:
                return node['left']
        else:
            if isinstance(node['right'], dict):
                return self.predict(node['right'], row)
            else:
                return node['right']

    def bagging_predict(self, trees, row):
        """Majority vote of all trees' predictions for one row."""
        predictions = [self.predict(tree, row) for tree in trees]
        return max(set(predictions), key=predictions.count)

    def random_forest(self, train, test, max_depth, min_size, sample_size, n_trees, n_features):
        """Train n_trees trees on bootstrap samples, predict the test rows."""
        trees = list()
        for i in range(n_trees):
            print('训练集长度=', len(train))
            # Bootstrap sample keeps the trees' training sets different.
            sample = self.subsample(train, sample_size)
            tree = self.build_tree(sample, max_depth, min_size, n_features)
            trees.append(tree)
        # Bagged prediction: simple vote across the forest per test row.
        predictions = [self.bagging_predict(trees, row) for row in test]
        return predictions

    # ---- evaluation ----

    def cross_validation_split(self, dataset, n_folds):
        """Split dataset into n_folds random folds (drawn without replacement)."""
        dataset_split = list()
        # Work on a copy so the caller's dataset is left untouched.
        dataset_copy = list(dataset)
        fold_size = len(dataset) // n_folds
        print('每份的长度', fold_size)
        print('datat_copy长度=', len(dataset_copy))
        for i in range(n_folds):
            fold = list()
            while len(fold) < fold_size:
                if len(dataset_copy) == 0:
                    break
                index = randrange(len(dataset_copy))
                # pop() removes the chosen row so it cannot be drawn twice.
                fold.append(dataset_copy.pop(index))
            dataset_split.append(fold)
            print('i===', i)
        return dataset_split

    def accuracy_metric(self, actual, predicted):
        """Percentage of predictions equal to the actual labels."""
        correct = 0
        for i in range(len(actual)):
            if actual[i] == predicted[i]:
                correct += 1
        return correct / float(len(actual)) * 100.0

    def evaluate_algorithm(self, dataset, algorithm, n_folds, *args):
        """k-fold cross validation: each fold serves as the test set once."""
        folds = self.cross_validation_split(dataset, n_folds)
        scores = list()
        for fold in folds:
            train_set = list(folds)
            train_set.remove(fold)
            # sum(list_of_lists, []) flattens the remaining folds into one
            # training list of rows.
            train_set = sum(train_set, [])
            test_set = list()
            for row in fold:
                row_copy = list(row)
                test_set.append(row_copy)
                # Hide the label from the model under test.
                row_copy[-1] = None
            predicted = algorithm(train_set, test_set, *args)
            print('交叉验证RF的预测值=', predicted)
            actual = [row[-1] for row in fold]
            accuracy = self.accuracy_metric(actual, predicted)
            scores.append(accuracy)
        return scores
if __name__ == '__main__':
    rf = randomForest()
    # Load the sonar dataset: 60 numeric features + class label per row.
    filename = 'sonar-all-data.csv'
    datat = rf.load_csv(filename)
    # Convert every feature column (all but the last) from str to float,
    # left to right across the matrix.
    for i in range(0, len(datat[0]) - 1):
        rf.str_column_to_float(datat, i)
    # Optionally map the label column ('R'/'M') to integers 0/1:
    # rf.str_column_to_int(datat, len(datat[0]) - 1)
    # Evaluate with 5-fold cross validation.
    n_folds = 5
    # Maximum tree depth (recursion limit).
    max_depth = 10
    min_size = 1
    sample_size = 1.0
    # Number of features tried per split.
    # TODO: tune — trades per-tree accuracy against forest diversity.
    n_features = 15
    # n_features = int(sqrt(len(datat[0]) - 1))
    # More trees is generally better (at higher cost).
    for n_trees in [1, 10, 20]:
        # rf.random_forest is passed as the `algorithm` callable.
        scores = rf.evaluate_algorithm(datat, rf.random_forest, n_folds,
                                       max_depth, min_size, sample_size,
                                       n_trees, n_features)
        print('Trees:%d' % n_trees)
        print('Scores:%s' % scores)
        print('Mean Accuracy: %.3f%%' % (sum(scores) / float(len(scores))))