首页 > 英语园地

python实现随机森林randomforest的原理及方法

更新时间:2023-05-20 09:48:41 阅读：评论：0

python实现随机森林randomforest的原理及⽅法

引⾔

想通过随机森林来获取数据的主要特征

1、理论

随机森林是⼀个⾼度灵活的机器学习⽅法，拥有⼴泛的应⽤前景，从市场营销到医疗保健保险。既可以⽤来做市场营销模拟的建模，统计客户来源，保留和流失。也可⽤来预测疾病的风险和病患者的易感性。

根据个体学习器的⽣成⽅式，⽬前的集成学习⽅法⼤致可分为两⼤类，即个体学习器之间存在强依赖关系，必须串⾏⽣成的序列化⽅法，以及个体学习器间不存在强依赖关系，可同时⽣成的并⾏化⽅法；

前者的代表是Boosting，后者的代表是Bagging和“随机森林”（Random Forest）。

随机森林在以决策树为基学习器构建Bagging集成的基础上，进⼀步在决策树的训练过程中引⼊了随机属性选择（即引⼊随机特征选择）。

简单来说，随机森林就是对决策树的集成，但有两点不同：

（1）样本采样的差异性：每棵决策树的训练集都是从原始数据集中有放回地随机采样（bootstrap）得到的，因此各棵树的训练样本不完全相同。

（2）特征选取的差异性：每个决策树的n个分类特征是在所有特征中随机选择的（n是一个需要我们自己调整的参数）。

随机森林，简单理解，比如预测salary，就是分别基于job、age、house等特征构建多棵决策树，然后把待预测样本的各个特征值（teacher，39，suburb）分别代入对应的决策树，得到目标值的概率（salary<5000，salary>=5000），再综合各棵树的结果，确定预测量的发生概率（如，预测出P(salary<5000)=0.3）。

随机森林既可以做回归，也可以做分类。它具备处理大数据的特性，而且有助于估计哪些变量在基础数据建模中是非常重要的。

参数说明：

最主要的两个参数是n_estimators和max_features。

n_estimators：表示森林里树的个数。理论上树越多效果越好，但计算时间也会随之增长；而且超过一定数量后效果不会再明显提升，预测效果最好的情况会出现在合理的树个数上。

max_features：随机选择特征集合的子集合，并用来分割节点。子集合的个数越少，方差就会减少得越快，但同时偏差就会增加得越快。根据较好的实践经验，如果是回归问题则max_features=n_features，如果是分类问题则max_features=sqrt(n_features)。

如果想获取较好的结果，通常将max_depth=None，同时min_samples_split=2（即让每棵树完全生长）。

同时还要记得进行交叉验证（cross validation）。除此之外记得在random forest中，bootstrap=True；但在extra-trees中，bootstrap=False。

2、随机森林python实现

2.1Demo1

实现随机森林基本功能

# Random forest demo 1: fit a regressor on the iris data and predict two samples.
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn.datasets import load_iris

iris = load_iris()
# iris has four features (sepal and petal length/width); the label is the
# species of the flower: setosa, versicolour or virginica.
print(iris['target'].shape)

rf = RandomForestRegressor()  # default hyper-parameters
rf.fit(iris.data[:150], iris.target[:150])  # train the model on all 150 samples

# Pick two samples and predict them one at a time.
instance = iris.data[[100, 109]]
print(instance)
print('instance 0 prediction；', rf.predict(instance[[0]]))
print( 'instance 1 prediction；', rf.predict(instance[[1]]))
print(iris.target[100], iris.target[109])

运⾏结果

(150,)

[[ 6.3 3.3 6. 2.5]

[ 7.2 3.6 6.1 2.5]]

instance 0 prediction； [ 2.]

instance 1 prediction； [ 2.]

2 2

2.2 Demo2

3种⽅法的⽐较

# Random forest test: compare a single decision tree, a random forest and
# extra-trees by their mean cross-validation score on synthetic blobs.
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier

X, y = make_blobs(n_samples=10000, n_features=10, centers=100, random_state=0)

clf = DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=0)
scores = cross_val_score(clf, X, y)
print(scores.mean())

clf = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
scores = cross_val_score(clf, X, y)
print(scores.mean())

clf = ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
scores = cross_val_score(clf, X, y)
print(scores.mean())

运⾏结果：

0.979408793821

0.999607843137

0.999898989899

2.3 Demo3-实现特征选择

# Random forest demo 3: rank the iris features by how well each one alone
# predicts the target (mean cross-validated R^2 of a single-feature forest).
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score, ShuffleSplit

iris = load_iris()
X = iris["data"]
Y = iris["target"]
names = iris["feature_names"]

rf = RandomForestRegressor()
scores = []
for i in range(X.shape[1]):
    # X[:, i:i+1] keeps the 2-D shape expected by the estimator.
    # model_selection.ShuffleSplit takes the number of splits and the test
    # fraction as keywords; the dataset length is no longer an argument.
    score = cross_val_score(rf, X[:, i:i+1], Y, scoring="r2",
                            cv=ShuffleSplit(n_splits=3, test_size=.3))
    scores.append((round(np.mean(score), 3), names[i]))

print(sorted(scores, reverse=True))

运⾏结果：

[(0.89300000000000002, 'petal width (cm)'), (0.82099999999999995, 'petal length (cm)'), (0.13, 'sepal length (cm)'), (-0.79100000000000004, 'sepal width (cm)')]

2.4 demo4-随机森林

本来想利用以下代码来构建随机森林决策树，但是遇到的问题是：程序一直在运行，无法响应，还需要调试。

#随机森林4

#coding:utf-8

import csv

from math import sqrt

from random import randrange

from random import seed

def loadCSV(filename):
    """Load a CSV file and return its rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per CSV row; each entry is a list of the
        row's fields as strings.
    """
    # newline='' lets the csv module do its own line-ending handling.
    with open(filename, 'r', newline='') as file:
        return list(csv.reader(file))

# 除了标签列，其他列都转换为float类型

def column_to_float(dataSet):
    """Convert every column except the last (label) one to float, in place.

    Args:
        dataSet: List of rows; each row is a list whose leading fields are
            numeric strings and whose last field is the label.

    Returns:
        None. The rows of ``dataSet`` are modified in place.
    """
    if not dataSet:
        return  # nothing to convert; avoids indexing an empty list
    featLen = len(dataSet[0]) - 1
    for data in dataSet:
        for column in range(featLen):
            data[column] = float(data[column].strip())

# 将数据集随机分成N块，⽅便交叉验证，其中⼀块是测试集，其他四块是训练集

def spiltDataSet(dataSet, n_folds):
    """Randomly partition ``dataSet`` into ``n_folds`` equally sized folds.

    Rows are drawn without replacement from a shallow copy, so ``dataSet``
    itself is left untouched. Each fold holds ``len(dataSet) // n_folds``
    rows; leftover rows are not assigned to any fold.

    Args:
        dataSet: List of rows.
        n_folds: Number of folds to create.

    Returns:
        A list of ``n_folds`` lists of rows.
    """
    fold_size = len(dataSet) // n_folds
    dataSet_copy = list(dataSet)
    dataSet_spilt = []
    for _ in range(n_folds):
        fold = []
        # Keep drawing until this fold is full; pop() removes the chosen row
        # from the pool so no row lands in two folds.
        while len(fold) < fold_size:
            index = randrange(len(dataSet_copy))
            fold.append(dataSet_copy.pop(index))
        dataSet_spilt.append(fold)
    return dataSet_spilt

# 构造数据⼦集

def get_subsample(dataSet, ratio):
    """Draw a bootstrap sample (with replacement) from ``dataSet``.

    Args:
        dataSet: List of rows to sample from.
        ratio: Size of the sample as a fraction of ``len(dataSet)``.

    Returns:
        A list of ``round(len(dataSet) * ratio)`` rows; the same row may
        appear several times.
    """
    lenSubdata = round(len(dataSet) * ratio)
    subdataSet = []
    while len(subdataSet) < lenSubdata:
        # randrange(n) already excludes n, so every row including the last
        # one can be chosen, and a one-row dataset does not raise.
        index = randrange(len(dataSet))
        subdataSet.append(dataSet[index])
    return subdataSet

# 分割数据集

def data_spilt(dataSet, index, value):
    """Split rows into two groups by comparing one column against ``value``.

    Args:
        dataSet: List of rows.
        index: Column to compare.
        value: Threshold.

    Returns:
        A ``(left, right)`` tuple: ``left`` holds the rows with
        ``row[index] < value``, ``right`` holds all other rows.
    """
    left = []
    right = []
    for row in dataSet:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right

# 计算分割代价

def spilt_loss(left, right, class_values):
    """Compute the cost of a split as a sum of ``p * (1 - p)`` terms.

    For every class value and for each non-empty side, ``p`` is the fraction
    of rows on that side whose label (last column) equals the class value.

    Args:
        left: Rows on the left side of the split.
        right: Rows on the right side of the split.
        class_values: Distinct labels to account for.

    Returns:
        The accumulated loss as a float; 0.0 means both sides are pure.
    """
    loss = 0.0
    for class_value in class_values:
        for group in (left, right):
            if not group:  # skip empty sides to avoid dividing by zero
                continue
            labels = [row[-1] for row in group]
            prop = labels.count(class_value) / float(len(group))
            loss += prop * (1.0 - prop)
    return loss

# 选取任意的n个特征，在这n个特征中，选取分割时的最优特征

def get_best_spilt(dataSet, n_features):
    """Pick ``n_features`` random columns and return the best split among them.

    Every value of every chosen column is tried as a threshold; the split
    with the smallest ``spilt_loss`` wins.

    Args:
        dataSet: Non-empty list of rows; the last column is the label.
        n_features: Number of distinct feature columns to consider. Values
            larger than the number of feature columns are capped, otherwise
            the sampling loop below could never finish.

    Returns:
        A dict with keys ``'index'`` (column), ``'value'`` (threshold),
        ``'left'`` and ``'right'`` (the two row groups).
    """
    class_values = list(set(row[-1] for row in dataSet))
    b_index, b_value, b_loss, b_left, b_right = 999, 999, 999, None, None

    n_columns = len(dataSet[0]) - 1
    n_features = min(n_features, n_columns)

    # Draw distinct column indices until enough have been collected.
    features = []
    while len(features) < n_features:
        index = randrange(n_columns)
        if index not in features:
            features.append(index)

    # Find the (column, threshold) pair with the smallest split cost.
    for index in features:
        for row in dataSet:
            left, right = data_spilt(dataSet, index, row[index])
            loss = spilt_loss(left, right, class_values)
            if loss < b_loss:
                b_index, b_value, b_loss, b_left, b_right = index, row[index], loss, left, right

    return {'index': b_index, 'value': b_value, 'left': b_left, 'right': b_right}

# 决定输出标签

def decide_label(data):
    """Return the most frequent label (last column) among ``data`` rows.

    Args:
        data: Non-empty list of rows.

    Returns:
        The label that occurs most often. When several labels tie, which one
        is returned depends on set iteration order.
    """
    output = [row[-1] for row in data]
    return max(set(output), key=output.count)

# ⼦分割，不断地构建叶节点的过程对对对

def sub_spilt(root, n_features, max_depth, min_size, depth):
    """Recursively grow the tree below ``root``, modifying it in place.

    ``root`` arrives holding the raw row groups under ``'left'`` and
    ``'right'``; each is replaced either by a leaf label or by a child node
    dict that is then split further.

    Args:
        root: Node dict as produced by ``get_best_spilt``.
        n_features: Number of random columns to consider at each split.
        max_depth: Nodes deeper than this become leaves.
        min_size: A side with fewer rows than this becomes a leaf.
        depth: Depth of ``root`` (the top node is depth 1).
    """
    left = root.pop('left')
    right = root.pop('right')

    # One side is empty, so the split separated nothing: both children
    # become the same leaf built from all rows.
    if not left or not right:
        root['left'] = root['right'] = decide_label(left + right)
        return

    # Depth limit exceeded: stop growing and turn both sides into leaves.
    if depth > max_depth:
        root['left'] = decide_label(left)
        root['right'] = decide_label(right)
        return

    if len(left) < min_size:
        root['left'] = decide_label(left)
    else:
        root['left'] = get_best_spilt(left, n_features)
        sub_spilt(root['left'], n_features, max_depth, min_size, depth + 1)

    if len(right) < min_size:
        root['right'] = decide_label(right)
    else:
        root['right'] = get_best_spilt(right, n_features)
        sub_spilt(root['right'], n_features, max_depth, min_size, depth + 1)

# 构造决策树

def build_tree(dataSet, n_features, max_depth, min_size):
    """Build one decision tree from ``dataSet``.

    Args:
        dataSet: Training rows; the last column is the label.
        n_features: Number of random columns considered at each split.
        max_depth: Depth limit passed on to ``sub_spilt``.
        min_size: Minimum side size passed on to ``sub_spilt``.

    Returns:
        The root node dict of the finished tree.
    """
    root = get_best_spilt(dataSet, n_features)
    sub_spilt(root, n_features, max_depth, min_size, 1)
    return root

# 预测测试集结果

def predict(tree, row):
    """Predict the label of ``row`` by walking down ``tree``.

    Args:
        tree: Node dict with keys ``'index'``, ``'value'``, ``'left'`` and
            ``'right'``; a child is either another node dict or a leaf label.
        row: Sequence of feature values.

    Returns:
        The label stored in the leaf that ``row`` ends up in.
    """
    # Values below the threshold go left, everything else goes right.
    branch = 'left' if row[tree['index']] < tree['value'] else 'right'
    child = tree[branch]
    if isinstance(child, dict):
        return predict(child, row)
    return child

def bagging_predict(trees, row):
    """Predict ``row`` with every tree and return the majority vote.

    Args:
        trees: Non-empty list of tree root dicts.
        row: Sequence of feature values.

    Returns:
        The label predicted by the largest number of trees.
    """
    predictions = [predict(tree, row) for tree in trees]
    return max(set(predictions), key=predictions.count)

# 创建随机森林

def random_forest(train, test, ratio, n_feature, max_depth, min_size, n_trees):
    """Train a random forest on ``train`` and predict every row of ``test``.

    Args:
        train: Training rows; the last column is the label.
        test: Rows to predict.
        ratio: Bootstrap sample size as a fraction of ``len(train)``.
        n_feature: Number of random columns considered at each split.
        max_depth: Depth limit of each tree.
        min_size: Minimum side size before a node becomes a leaf.
        n_trees: Number of trees to grow.

    Returns:
        A list with one predicted label per row of ``test``.
    """
    trees = []
    for _ in range(n_trees):
        # Each tree gets its own bootstrap sample of the full training set;
        # `train` is not overwritten, so later trees do not sample from an
        # earlier tree's sample.
        sample = get_subsample(train, ratio)
        trees.append(build_tree(sample, n_feature, max_depth, min_size))
    predict_values = [bagging_predict(trees, row) for row in test]
    return predict_values

def accuracy(predict_values, actual):
    """Return the fraction of predictions that match the actual labels.

    Args:
        predict_values: Predicted labels, indexable in step with ``actual``.
        actual: Non-empty sequence of true labels.

    Returns:
        A float between 0.0 and 1.0.
    """
    correct = 0
    for i, truth in enumerate(actual):
        if truth == predict_values[i]:
            correct += 1
    return correct / float(len(actual))

if __name__ == '__main__':
    # Cross-validate the hand-written random forest on a local CSV file.
    seed(1)
    dataSet = loadCSV(r'G:\0研究⽣\tianchiCompetition\训练⼩样本2.csv')
    column_to_float(dataSet)

    n_folds = 5
    max_depth = 15
    min_size = 1
    ratio = 1.0
    # NOTE: the square root of the number of feature columns is a common
    # alternative to this fixed value.
    n_features = 15
    n_trees = 10

    folds = spiltDataSet(dataSet, n_folds)  # split the data into folds first
    scores = []
    for fold in folds:
        # Work on a copy of the fold list (folds[:]) so that removing the
        # held-out fold does not change `folds` itself.
        train_t = folds[:]
        train_t.remove(fold)
        train_t = sum(train_t, [])  # flatten the remaining folds into one list

        # Hide the label of every test row from the model.
        test_t = []
        for row in fold:
            row_copy = list(row)
            row_copy[-1] = None
            test_t.append(row_copy)

        actual = [row[-1] for row in fold]
        predict_values = random_forest(train_t, test_t, ratio, n_features, max_depth, min_size, n_trees)
        accur = accuracy(predict_values, actual)
        scores.append(accur)

    print ('Trees is %d' % n_trees)
    print ('scores:%s' % scores)
    print ('mean score:%s' % (sum(scores) / float(len(scores))))

2.5 随机森林分类sonar data

# CART on the Bank Note datat

from random import seed

from random import randrange

from csv import reader

# Load a CSV file

def load_csv(filename):
    """Load a CSV file and return its rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of rows, each a list of field strings.
    """
    # The with-block closes the file even if reading fails.
    with open(filename, "r") as file:
        return list(reader(file))

# Convert string column to float

def str_column_to_float(datat, column):
    """Convert one column of every row from string to float, in place.

    Args:
        datat: List of rows (lists).
        column: Index of the column to convert.
    """
    for row in datat:
        row[column] = float(row[column].strip())

# Split a datat into k folds

def cross_validation_split(datat, n_folds):
    """Randomly split the rows into ``n_folds`` folds of equal size.

    Rows are drawn without replacement from a shallow copy, so the input list
    is not modified. Each fold has ``len(datat) // n_folds`` rows; leftover
    rows are not placed in any fold.

    Args:
        datat: List of rows.
        n_folds: Number of folds.

    Returns:
        A list of ``n_folds`` lists of rows.
    """
    datat_split = []
    datat_copy = list(datat)
    fold_size = len(datat) // n_folds
    for _ in range(n_folds):
        fold = []
        while len(fold) < fold_size:
            index = randrange(len(datat_copy))
            fold.append(datat_copy.pop(index))
        datat_split.append(fold)
    return datat_split

# Calculate accuracy percentage

def accuracy_metric(actual, predicted):
    """Return the percentage of predictions that equal the actual labels.

    Args:
        actual: Non-empty sequence of true labels.
        predicted: Predicted labels, indexable in step with ``actual``.

    Returns:
        A float between 0.0 and 100.0.
    """
    correct = 0
    for i, truth in enumerate(actual):
        if truth == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0

# Evaluate an algorithm using a cross validation split

def evaluate_algorithm(datat, algorithm, n_folds, *args):
    """Evaluate ``algorithm`` with k-fold cross validation.

    Each fold is held out once as the test set while the remaining folds are
    merged into the training set.

    Args:
        datat: List of rows; the last column is the label.
        algorithm: Callable ``algorithm(train, test, *args)`` returning one
            predicted label per test row.
        n_folds: Number of folds.
        *args: Extra positional arguments forwarded to ``algorithm``.

    Returns:
        A list with one accuracy percentage per fold.
    """
    folds = cross_validation_split(datat, n_folds)
    scores = list()
    for fold in folds:
        # Copy the fold list before removing the held-out fold so `folds`
        # stays intact for the next iteration.
        train_t = list(folds)
        train_t.remove(fold)
        train_t = sum(train_t, [])

        # Copy the test rows and blank out their labels.
        test_t = list()
        for row in fold:
            row_copy = list(row)
            row_copy[-1] = None
            test_t.append(row_copy)

        predicted = algorithm(train_t, test_t, *args)
        actual = [row[-1] for row in fold]
        accuracy = accuracy_metric(actual, predicted)
        scores.append(accuracy)
    return scores

def test_split(index, value, datat):

left, right = list(), list()

for row in datat:

if row[index] < value:

left.append(row)

el:

right.append(row)

return left, right

# Calculate the Gini index for a split datat

def gini_index(groups, class_values):
    """Calculate the Gini score of a split.

    Adds ``p * (1 - p)`` for every class value and every non-empty group,
    where ``p`` is the fraction of rows in the group carrying that label
    (last column).

    Args:
        groups: Iterable of row groups, e.g. ``(left, right)``.
        class_values: Distinct labels to account for.

    Returns:
        The score as a float; 0.0 means every group is pure.
    """
    gini = 0.0
    for class_value in class_values:
        for group in groups:
            size = len(group)
            if size == 0:  # an empty group contributes nothing
                continue
            proportion = [row[-1] for row in group].count(class_value) / float(size)
            gini += (proportion * (1.0 - proportion))
    return gini

# Select the best split point for a datat

def get_split(datat):
    """Select the best split point for a set of rows.

    Every value of every feature column is tried as a threshold; the split
    with the lowest ``gini_index`` is kept. The chosen column and threshold
    are printed before returning.

    Args:
        datat: Non-empty list of rows; the last column is the label.

    Returns:
        A dict with keys ``'index'``, ``'value'`` and ``'groups'`` (the
        ``(left, right)`` tuple of the winning split).
    """
    class_values = list(set(row[-1] for row in datat))
    b_index, b_value, b_score, b_groups = 999, 999, 999, None
    for index in range(len(datat[0])-1):
        for row in datat:
            groups = test_split(index, row[index], datat)
            gini = gini_index(groups, class_values)
            if gini < b_score:
                b_index, b_value, b_score, b_groups = index, row[index], gini, groups
    # NOTE(review): the source had this print fused onto the assignment line;
    # it is placed after the search here — confirm the intended position.
    print ({'index':b_index, 'value':b_value})
    return {'index':b_index, 'value':b_value, 'groups':b_groups}

# Create a terminal node value

def to_terminal(group):
    """Create a terminal node value: the most common label in ``group``.

    Args:
        group: Non-empty list of rows; the last column is the label.

    Returns:
        The most frequent label. Ties are resolved by set iteration order.
    """
    outcomes = [row[-1] for row in group]
    return max(set(outcomes), key=outcomes.count)

# Create child splits for a node or make terminal

def split(node, max_depth, min_size, depth):
    """Create child splits for ``node`` or turn its sides into leaves.

    ``node`` is modified in place: its ``'groups'`` entry is removed and
    ``'left'`` / ``'right'`` are set to either a leaf label or a child node.

    Args:
        node: Node dict as returned by ``get_split``.
        max_depth: Depth at which both sides become leaves.
        min_size: A side with at most this many rows becomes a leaf.
        depth: Depth of ``node`` (the root is depth 1).
    """
    left, right = node.pop('groups')

    # No real split: one side is empty, so both children share one leaf.
    if not left or not right:
        node['left'] = node['right'] = to_terminal(left + right)
        return

    # Maximum depth reached.
    if depth >= max_depth:
        node['left'], node['right'] = to_terminal(left), to_terminal(right)
        return

    # Left child.
    if len(left) <= min_size:
        node['left'] = to_terminal(left)
    else:
        node['left'] = get_split(left)
        split(node['left'], max_depth, min_size, depth+1)

    # Right child.
    if len(right) <= min_size:
        node['right'] = to_terminal(right)
    else:
        node['right'] = get_split(right)
        split(node['right'], max_depth, min_size, depth+1)

# Build a decision tree

def build_tree(train, max_depth, min_size):
    """Build a decision tree from the training rows.

    Args:
        train: Training rows; the last column is the label.
        max_depth: Depth limit passed on to ``split``.
        min_size: Minimum side size passed on to ``split``.

    Returns:
        The root node dict of the finished tree.
    """
    root = get_split(train)
    split(root, max_depth, min_size, 1)
    return root

# Make a prediction with a decision tree

def predict(node, row):
    """Make a prediction for ``row`` with a decision tree.

    Args:
        node: Node dict with keys ``'index'``, ``'value'``, ``'left'`` and
            ``'right'``; a child is either another node dict or a leaf label.
        row: Sequence of feature values.

    Returns:
        The label of the leaf that ``row`` reaches.
    """
    # Values below the threshold follow the left branch.
    side = 'left' if row[node['index']] < node['value'] else 'right'
    child = node[side]
    if isinstance(child, dict):
        return predict(child, row)
    return child

# Classification and Regression Tree Algorithm

def decision_tree(train, test, max_depth, min_size):
    """Train a single decision tree on ``train`` and predict ``test``.

    Args:
        train: Training rows; the last column is the label.
        test: Rows to predict.
        max_depth: Depth limit of the tree.
        min_size: Minimum side size before a node becomes a leaf.

    Returns:
        A list with one predicted label per row of ``test``.
    """
    tree = build_tree(train, max_depth, min_size)
    return [predict(tree, row) for row in test]

# Test CART on Bank Note datat

seed(1)  # fix the random seed so the fold split is reproducible

# load and prepare data
filename = r'G:\0pythonstudy\决策树\sonar.all-data.csv'

datat = load_csv(filename)

# convert string attributes to integers

for i in range(len(datat[0])-1):

本文发布于:2023-05-20 09:48:41，感谢您对本站的认可！

本文链接：https://www.wtabcd.cn/fanwen/fan/78/705948.html

上一篇：天池离线赛-移动推荐算法（四）：基于LR,RF,GBDT等模型的预测

下一篇：融资约束、政府补贴与新能源企业投资效率--基于异质性双边随机前沿模型

标签：决策树森林预测数据

留言与评论（共有 0 条评论）