双重代价敏感随机森林算法附Python代码
参考文献
Cost-sensitive feature selection using random forest: Selecting low-cost subsets of informative features, 2016
《Knowledge-Based Systems》
算法改进
相比于上一章节分享的代价敏感随机森林而言,这次引入了特征选择和序贯分析。
参考文献的特征选择算法只是单纯的计算出一个特征代价向量使随机过程更具有倾向性,但并未考虑特征间的相对关系,并且在特征区分度不大时退化成普通的RF算法。
鉴于此,提出了三点改进:
1)在生成特征向量阶段引入序贯分析
2)在Gini系数上做了调整
3)在决策树集成阶段引入了代价敏感,选择代价少的前90%的决策树(经实验计算,选择50%~90%的决策树数量准确度没什么区别)
ps:虽然该特征向量的生成和参考文献中的顺序不完全一样,但是若将特征分成高,中,低三个集合的话,每个集合中的特征元素,两者是相同的。
实验结果
对比UCI的8组数据集,这种算法的提升效果确实很明显,强弱特征和不平衡数据集都有明显的提升
备注
代码如下
# -*- coding: utf-8 -*-
"""
@Env:Python2.7
@Time: 2019/10/24 13:31
@Author: ZYL
@Function:Random Forest(RF),随机森林二分类
@Version: V1.1
参考文献:
[1] UCI. wine[DB/OL].archive.ics.uci.edu/ml/machine-learning-databases/wine.
"""
import collections
import math
import random
import time
import warnings

import numpy as np
import pandas as pd

# Display options used when printing DataFrames. Fully qualified option names
# are used because pandas resolves a short name by pattern matching and
# rejects it when more than one option matches.
pd.set_option('display.precision', 4)
pd.set_option('display.max_rows', 50)
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.expand_frame_repr', False)

# Suppress every warning for the rest of the run.
warnings.filterwarnings("ignore")
def Cost_feature(Cost):
    """Convert a feature-cost vector into normalised inverse-cost weights.

    Each weight is the reciprocal of the feature's cost divided by the sum of
    all reciprocals, so a cheaper feature receives a larger weight.

    Args:
        Cost: sequence of non-zero numbers, one cost per feature.

    Returns:
        list of float: one weight per feature, rounded to six decimal places
        (an empty list for an empty ``Cost``).

    Raises:
        ZeroDivisionError: if any cost is zero.
    """
    # 1.0 / c forces true division; under Python 2 the integer expression
    # 1 / c truncates to 0 for every integer cost greater than 1.
    reciprocals = [1.0 / c for c in Cost]
    total = sum(reciprocals)
    return [float(format(r / total, '.6f')) for r in reciprocals]
# 定义⼀棵决策树
class Tree(object):
    """One node of a binary decision tree.

    A node is a leaf when ``leaf_value`` is set; otherwise it routes a sample
    to ``tree_left`` or ``tree_right`` by comparing ``split_feature`` with
    ``split_value``.
    """

    def __init__(self):
        self.split_feature = None  # key of the feature tested at this node
        self.split_value = None    # threshold: <= goes left, > goes right
        self.leaf_value = None     # predicted label when this node is a leaf
        self.tree_left = None      # child Tree for values <= split_value
        self.tree_right = None     # child Tree for values > split_value

    def calc_predict_value(self, dataset):
        """Walk down the tree and return the leaf value for one sample.

        Args:
            dataset: a single sample that can be indexed by feature key
                (it is read as ``dataset[self.split_feature]``).

        Returns:
            The ``leaf_value`` of the leaf the sample ends up in.
        """
        # `is not None` (not truthiness) so that a leaf labelled 0 counts.
        if self.leaf_value is not None:
            return self.leaf_value
        elif dataset[self.split_feature] <= self.split_value:
            return self.tree_left.calc_predict_value(dataset)
        else:
            return self.tree_right.calc_predict_value(dataset)

    def describe_tree(self):
        """Return the tree structure as a JSON-like string for inspection."""
        # A node without children is a leaf.
        if not self.tree_left and not self.tree_right:
            leaf_info = "{leaf_value:" + str(self.leaf_value) + "}"
            return leaf_info
        left_info = self.tree_left.describe_tree()
        right_info = self.tree_right.describe_tree()
        tree_structure = "{split_feature:" + str(self.split_feature) + \
                         ",split_value:" + str(self.split_value) + \
                         ",left_tree:" + left_info + \
                         ",right_tree:" + right_info + "}"
        return tree_structure
class RandomForestClassifier(object):
def __init__(self, n_estimators=10, max_depth=-1, min_samples_split=2, min_samples_leaf=1,
             min_split_gain=0.0, colsample_bytree="sqrt", subsample=1.0, random_state=None,
             Cost=None, iteration=False):
    """Store the forest hyper-parameters and reset the fitted state.

    Args:
        n_estimators: number of decision trees.
        max_depth: maximum tree depth; -1 means unlimited.
        min_samples_split: stored as given.
        min_samples_leaf: stored as given.
        min_split_gain: stored as given.
        colsample_bytree: column-sampling setting, stored as given.
        subsample: row-sampling ratio, stored as given.
        random_state: stored as given.
        Cost: per-feature cost vector. The list passed in is stored as-is
            (not copied); a fresh empty list is used when omitted, so
            instances never share one default list.
        iteration: flag stored as given.
    """
    self.n_estimators = n_estimators  # number of decision trees
    # -1 is the "no limit" marker and is mapped to infinity.
    self.max_depth = max_depth if max_depth != -1 else float('inf')
    self.min_samples_split = min_samples_split
    self.min_samples_leaf = min_samples_leaf
    self.min_split_gain = min_split_gain
    # NOTE(review): this attribute and `trees` below are absent from the
    # garbled source but are read by other methods of the class — confirm.
    self.colsample_bytree = colsample_bytree  # column sampling
    self.subsample = subsample  # row sampling
    self.random_state = random_state
    self.trees = dict()
    self.feature_importances_ = dict()
    self.Cost = [] if Cost is None else Cost
    self.AC_Treee_List = []
    self.P = []
    # Per-tree feature lists, recorded for trees above an accuracy threshold.
    self.features = []
    self.iteration = iteration
# 修改代价向量
def Clu_CostMatrix(self, tree_position):
    """Increase by 1 the cost of each feature recorded for one tree.

    Args:
        tree_position: index into ``self.features`` selecting the tree's
            recorded feature list.

    The first ``self.colsample_bytree`` entries of that list are converted
    with ``int()`` and used as indices into ``self.Cost``, which is updated
    in place.
    """
    tree_features = self.features[tree_position]
    for i in range(0, self.colsample_bytree):
        self.Cost[int(tree_features[i])] += 1
#初始化得到代价向量
def Get_Cost(self, dataset, label, train_index):
    """Raise the feature costs of every tree that classifies the data well.

    Each tree in ``self.trees`` predicts every row of ``dataset``. A tree
    whose accuracy exceeds 0.99 has ``self.Clu_CostMatrix`` called with its
    position in the iteration order of ``self.trees``.

    Args:
        dataset: object whose ``iterrows()`` yields ``(index, row)`` pairs.
        label: true labels, read as ``label[i + train_index]`` for the i-th
            row; ``len(label)`` is the number of rows scored.
        train_index: offset added to the row position when indexing ``label``.
    """
    count = len(label)
    if count == 0:
        # Nothing to score; avoids dividing by zero below.
        return
    rows = [row for _, row in dataset.iterrows()]
    for position, tree in enumerate(self.trees.values()):
        predictions = [tree.calc_predict_value(row) for row in rows]
        hits = sum(1 for i in range(count)
                   if predictions[i] == int(label[i + train_index]))
        # hits * 1.0 keeps true division under Python 2, where int / int
        # would truncate the accuracy to 0 or 1.
        if hits * 1.0 / count > 0.99:
            self.Clu_CostMatrix(position)
# 初始化得到代价向量
def Get_Cost_Test(self, tree, dataset, label, train_index, features):
    """Raise the cost of ``features`` by 5 when ``tree`` is accurate enough.

    ``tree`` predicts every row of ``dataset``; if its accuracy exceeds 0.95,
    ``self.Cost[int(f)]`` is increased by 5 for every ``f`` in ``features``.

    Args:
        tree: object exposing ``calc_predict_value(row)``.
        dataset: object whose ``iterrows()`` yields ``(index, row)`` pairs.
        label: true labels, read as ``label[j + train_index]`` for the j-th
            row.
        train_index: offset added to the row position when indexing ``label``.
        features: iterable of feature identifiers convertible with ``int()``.
    """
    count = len(label)
    if count == 0:
        # Nothing to score; avoids dividing by zero below.
        return
    predictions = [tree.calc_predict_value(row) for _, row in dataset.iterrows()]
    hits = sum(1 for j, predicted in enumerate(predictions)
               if predicted == int(label[j + train_index]))
    # hits * 1.0 keeps true division under Python 2.
    if hits * 1.0 / count > 0.95:
        for feature in features:
            self.Cost[int(feature)] += 5
def fit(self, dataset, targets, traindata, trainlabel, train_index):
    """Train ``self.n_estimators`` trees on random row/column subsets.

    Args:
        dataset: feature DataFrame; rows are selected with ``.loc`` using
            positions drawn from ``range(len(dataset))``.
        targets: label Series with exactly two distinct values.
        traindata: data passed to ``Get_Cost_Test`` when ``self.iteration``
            is true.
        trainlabel: labels passed to ``Get_Cost_Test``.
        train_index: label offset passed to ``Get_Cost_Test``.

    Raises:
        AssertionError: if ``targets`` does not hold exactly two classes.
    """
    assert targets.unique().__len__() == 2, "There must be two class for targets!"
    targets = targets.to_frame(name='label')

    if self.random_state:
        random.seed(self.random_state)
    # One distinct seed per tree so that each stage is reproducible.
    random_state_stages = random.sample(range(self.n_estimators), self.n_estimators)

    # Two column-sampling modes.
    # NOTE(review): the assignments under these three branches are missing
    # from the garbled source; reconstructed as sqrt / log2 / all columns —
    # confirm against the original code.
    n_columns = len(dataset.columns)
    if self.colsample_bytree == "sqrt":
        self.colsample_bytree = max(1, int(n_columns ** 0.5))
    elif self.colsample_bytree == "log2":
        self.colsample_bytree = max(1, int(math.log(n_columns, 2)))
    else:
        self.colsample_bytree = n_columns

    for stage in range(self.n_estimators):
        # Recompute the inverse-cost feature weights from the current costs.
        self.P = Cost_feature(self.Cost)

        # Draw rows and columns for this tree (random.sample: no repeats).
        random.seed(random_state_stages[stage])
        subset_index = random.sample(range(len(dataset)), int(self.subsample * len(dataset)))
        subcol_index = random.sample(dataset.columns.tolist(), self.colsample_bytree)
        dataset_copy = dataset.loc[subset_index, subcol_index].reset_index(drop=True)
        targets_copy = targets.loc[subset_index, :].reset_index(drop=True)

        tree = self._fit(dataset_copy, targets_copy, depth=0)
        # NOTE(review): storing the tree is not visible in the garbled
        # source, but the prediction code iterates the stored trees — confirm.
        self.trees[stage] = tree

        # When iterating, feed this tree's accuracy back into the costs;
        # otherwise just remember which features the tree was given.
        if self.iteration:
            self.Get_Cost_Test(tree, traindata, trainlabel, train_index, subcol_index)
        else:
            self.features.append(subcol_index)
# 递归建⽴决策树
def _fit(self, dataset, targets, depth):
    """Recursively grow one decision tree and return its root ``Tree``.

    Args:
        dataset: feature DataFrame for the samples reaching this node.
        targets: DataFrame with a ``'label'`` column for the same samples.
        depth: depth of this node (the root is called with 0).

    Returns:
        Tree: a leaf holding the majority label, or a split node whose
        children were grown recursively.
    """
    labels = targets['label']

    def make_leaf():
        leaf = Tree()
        leaf.leaf_value = self.calc_leaf_value(labels)
        return leaf

    # Stop when the node is pure or has too few samples to split.
    if len(labels.unique()) <= 1 or len(dataset) <= self.min_samples_split:
        return make_leaf()
    # Stop when the depth limit is reached.
    if depth >= self.max_depth:
        return make_leaf()

    best_split_feature, best_split_value, best_split_gain = \
        self.choo_best_feature(dataset, targets)
    # No candidate split was found; splitting on None would fail.
    if best_split_feature is None:
        return make_leaf()

    left_dataset, right_dataset, left_targets, right_targets = \
        self.split_datat(dataset, targets, best_split_feature, best_split_value)

    # Stop when a child would be too small or the split score is too low.
    # NOTE(review): with the default min_split_gain of 0.0 this turns a
    # split whose score is exactly 0 into a leaf — confirm that is intended.
    if len(left_dataset) <= self.min_samples_leaf or \
            len(right_dataset) <= self.min_samples_leaf or \
            best_split_gain <= self.min_split_gain:
        return make_leaf()

    # Each time a feature is used for a split its importance goes up by 1.
    self.feature_importances_[best_split_feature] = \
        self.feature_importances_.get(best_split_feature, 0) + 1

    tree = Tree()
    tree.split_feature = best_split_feature
    tree.split_value = best_split_value
    tree.tree_left = self._fit(left_dataset, left_targets, depth + 1)
    tree.tree_right = self._fit(right_dataset, right_targets, depth + 1)
    return tree
# 选择最好的数据集划分⽅式,找到最优分裂特征、分裂阈值、分裂增益
def choo_best_feature(self, dataset, targets):
    """Find the feature and threshold with the lowest cost-weighted Gini.

    NOTE(review): the method name looks truncated (probably
    ``choose_best_feature``); it is kept so existing call sites still resolve.

    Args:
        dataset: feature DataFrame; each column label must be convertible
            with ``int()`` because it indexes ``self.P``.
        targets: DataFrame with a ``'label'`` column, row-aligned with
            ``dataset``.

    Returns:
        tuple: ``(best_split_feature, best_split_value, best_split_gain)``.
        The first two are ``None`` and the last is 1 when no candidate
        scores below 1.
    """
    best_split_gain = 1
    best_split_feature = None
    best_split_value = None
    for feature in dataset.columns:
        column = dataset[feature]
        distinct = column.unique()
        if len(distinct) <= 100:
            candidates = sorted(distinct.tolist())
        else:
            # Too many distinct values: fall back to 100 percentile
            # points as candidate thresholds.
            candidates = np.unique([np.percentile(column, q)
                                    for q in np.linspace(0, 100, 100)])
        weight = self.P[int(feature)]
        # Score every candidate threshold and keep the smallest score.
        for split_value in candidates:
            left_targets = targets[column <= split_value]
            right_targets = targets[column > split_value]
            split_gain = weight * self.calc_gini(left_targets['label'],
                                                 right_targets['label'])
            if split_gain < best_split_gain:
                best_split_feature = feature
                best_split_value = split_value
                best_split_gain = split_gain
    return best_split_feature, best_split_value, best_split_gain
# 选择样本中出现次数最多的类别作为叶⼦节点取值
@staticmethod
def calc_leaf_value(targets):
    """Return the most frequent label in ``targets``.

    When several labels share the highest count, the largest label wins,
    because the ``(count, label)`` pairs are compared as tuples.

    Raises:
        ValueError: if ``targets`` is empty.
    """
    label_counts = collections.Counter(targets)
    major_label = max(zip(label_counts.values(), label_counts.keys()))
    return major_label[1]
# 分类树采⽤基尼指数来选择最优分裂点
@staticmethod
def calc_gini(left_targets, right_targets):
    """Return the size-weighted Gini impurity of a two-way split.

    Args:
        left_targets: labels of the samples sent to the left child.
        right_targets: labels of the samples sent to the right child.

    Returns:
        The sum over both sides of ``(side size / total size) * gini(side)``,
        where ``gini(side)`` is 1 minus the sum of squared class
        proportions. An empty side contributes 0.

    Raises:
        ZeroDivisionError: if both sides are empty.
    """
    total = len(left_targets) + len(right_targets)
    split_gain = 0
    for targets in [left_targets, right_targets]:
        gini = 1
        # Count the samples of each class, then subtract squared shares.
        label_counts = collections.Counter(targets)
        for key in label_counts:
            prob = label_counts[key] * 1.0 / len(targets)
            gini -= prob ** 2
        split_gain += len(targets) * 1.0 / total * gini
    return split_gain
# 根据特征和阈值将样本划分成左右两份,左边⼩于等于阈值,右边⼤于阈值
@staticmethod
def split_datat(dataset, targets, split_feature, split_value):
    """Split samples and labels in two around a threshold on one feature.

    NOTE(review): the method name looks truncated (probably
    ``split_dataset``); it is kept so existing call sites still resolve.

    Args:
        dataset: feature DataFrame.
        targets: labels, row-aligned with ``dataset``.
        split_feature: column of ``dataset`` to compare.
        split_value: threshold; ``<=`` goes left, ``>`` goes right.

    Returns:
        tuple: ``(left_dataset, right_dataset, left_targets, right_targets)``.
    """
    column = dataset[split_feature]
    goes_left = column <= split_value
    goes_right = column > split_value
    left_dataset = dataset[goes_left]
    left_targets = targets[goes_left]
    right_dataset = dataset[goes_right]
    right_targets = targets[goes_right]
    return left_dataset, right_dataset, left_targets, right_targets
# 输入样本,预测所属类别
def predict(self, dataset):
    """Predict a label for every row by majority vote over all trees.

    Args:
        dataset: object whose ``iterrows()`` yields ``(index, row)`` pairs;
            each row is passed to every tree's ``calc_predict_value``.

    Returns:
        numpy.ndarray: one predicted label per row. When several labels get
        the same number of votes, the largest label wins, because the
        ``(votes, label)`` pairs are compared as tuples.
    """
    res = []
    for _, row in dataset.iterrows():
        # Collect each tree's vote, then keep the most frequent label.
        pred_list = [tree.calc_predict_value(row) for tree in self.trees.values()]
        pred_label_counts = collections.Counter(pred_list)
        pred_label = max(zip(pred_label_counts.values(), pred_label_counts.keys()))
        res.append(pred_label[1])
    return np.array(res)
#得到新的决策树集合
def Clu_Cost(lf,datat,label,train_index,treeCount):
res =[]
count=len(label)
for stage,tree s.items():
pred_list =[]
for index, row in datat.iterrows():
pred_list.append(tree.calc_predict_value(row))
#⾥边装的是每棵树对所有数据的分类
res.append(pred_list)
#⽤来装新的树的列表
CostTreeList=[]
i=0