XGBoost Algorithm Examples and Parameter Tuning
XGBoost
XGBoost is an optimized distributed gradient boosting library, designed to be highly efficient, flexible, and portable. It implements machine learning algorithms under the Gradient Boosting framework.
It provides parallel tree boosting (also known as GBDT or GBM), which solves many data science problems quickly and accurately. The same code runs on the major distributed environments (Hadoop, SGE, MPI) and can handle problems with billions of examples.
It also exploits out-of-core computation, enabling data scientists to process hundreds of millions of samples on a single machine. Combining these techniques yields an end-to-end system that scales to very large datasets with minimal cluster resources.
XGBoost uses CART decision trees as its base learners and ensembles multiple CART trees through Gradient Tree Boosting to obtain the final model.
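In formula form, the final prediction for a sample x_i is the sum of the outputs of all K trees: ŷ_i = Σ_{k=1}^{K} f_k(x_i), where each f_k is a single CART tree.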
When a new tree is grown, the most basic operation is node splitting. The key step in node splitting is finding the optimal feature and the optimal split point; the leaf node is then split according to them.
Finding the Optimal Feature and Split Point in XGBoost
The exact greedy algorithm first enumerates all candidate features and all candidate split points, then selects the feature and split point with the largest split gain as the optimal feature and optimal split point.
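To make this concrete, here is a minimal sketch of exact greedy split finding on a single feature (a hypothetical helper, not XGBoost's internal code), using the standard structure-score gain with L2 regularization lam and split penalty gamma:
import numpy as np
def exact_greedy_split(x, g, h, lam=1.0, gamma=0.0):
    # x: values of one feature; g, h: first- and second-order gradients per sample
    order = np.argsort(x)
    xs, gs, hs = x[order], g[order], h[order]
    G, H = gs.sum(), hs.sum()
    GL, HL = np.cumsum(gs)[:-1], np.cumsum(hs)[:-1]  # left-child stats for each split
    gain = (GL**2 / (HL + lam)
            + (G - GL)**2 / (H - HL + lam)
            - G**2 / (H + lam)) / 2 - gamma
    best = int(np.argmax(gain))
    return (xs[best] + xs[best + 1]) / 2, gain[best]  # threshold and its gain
Every distinct value is a candidate split, so the cost is dominated by the sort; this is exactly what the histogram-based method below avoids.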
When the histogram-based approximate algorithm searches for the optimal split point of a feature, it first buckets all split points of that feature by quantiles (e.g., percentiles), producing a set of candidate split points.
Every value of the feature falls into a corresponding bucket. The algorithm then accumulates gradient statistics per bucket to build a histogram, and finally selects, among all candidate features and candidate split points, the bucket boundary with the largest statistical gain as the optimal feature and optimal split point.
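A rough sketch of the same search with histogram binning (again a hypothetical helper, for illustration only): bucket the feature by quantiles, accumulate per-bucket gradient sums, and scan only the bucket boundaries:
import numpy as np
def histogram_split(x, g, h, n_bins=16, lam=1.0):
    # Interior quantiles of the feature serve as candidate thresholds
    edges = np.quantile(x, np.linspace(0, 1, n_bins + 1)[1:-1])
    bins = np.searchsorted(edges, x)                    # bucket index per sample
    G = np.bincount(bins, weights=g, minlength=n_bins)  # per-bucket gradient sums
    H = np.bincount(bins, weights=h, minlength=n_bins)
    GL, HL = np.cumsum(G)[:-1], np.cumsum(H)[:-1]
    GT, HT = G.sum(), H.sum()
    gain = (GL**2 / (HL + lam) + (GT - GL)**2 / (HT - HL + lam)
            - GT**2 / (HT + lam)) / 2
    best = int(np.argmax(gain))
    return edges[best], gain[best]
Only n_bins - 1 candidates are evaluated instead of one per distinct value, which is the source of the speedup.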
XGBoost Classification Example
from sklearn.datasets import load_iris
import xgboost as xgb
from xgboost import plot_importance
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score  # accuracy
# Load the sample dataset
iris = load_iris()
X,y = iris.data,iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234565)  # split the dataset
# Algorithm parameters
params ={
'booster':'gbtree',
'objective':'multi:softmax',
'num_class': 3,
'gamma': 0.1,
'max_depth': 6,
'lambda': 2,
'subsample': 0.7,
'colsample_bytree': 0.75,
'min_child_weight': 3,
'silent': 0,
'eta': 0.1,
'seed': 1,
'nthread': 4,
}
plst = list(params.items())
dtrain = xgb.DMatrix(X_train, y_train)  # construct the DMatrix data format
num_rounds = 500
model = xgb.train(plst, dtrain, num_rounds)  # train the XGBoost model
# Predict on the test set
dtest = xgb.DMatrix(X_test)
y_pred = model.predict(dtest)
# Compute accuracy
accuracy = accuracy_score(y_test,y_pred)
print("accuarcy: %.2f%%" % (accuracy*100.0))
# Plot feature importance
plot_importance(model)
plt.show()
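The same model can also be trained through xgboost's scikit-learn wrapper; a minimal sketch mirroring the hyperparameters of the dict above:
clf = xgb.XGBClassifier(objective='multi:softmax', max_depth=6, learning_rate=0.1,
                        gamma=0.1, reg_lambda=2, subsample=0.7,
                        colsample_bytree=0.75, min_child_weight=3, n_estimators=500)
clf.fit(X_train, y_train)
print("accuracy: %.2f%%" % (accuracy_score(y_test, clf.predict(X_test)) * 100.0))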
XGBoost Regression Example
import xgboost as xgb
from xgboost import plot_importance
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
# Load the dataset
boston = load_boston()
X,y = boston.data,boston.target
# XGBoost training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
params ={
'booster':'gbtree',
'objective':'reg:squarederror',
'gamma': 0.1,
'max_depth': 5,
'lambda': 3,
'subsample': 0.7,
'colsample_bytree': 0.7,
'min_child_weight': 3,
'silent': 1,
'eta': 0.1,
'seed': 1000,
'nthread': 4,
}
dtrain = xgb.DMatrix(X_train, y_train)
num_rounds = 300
plst = params.items()
model = xgb.train(plst, dtrain, num_rounds)
# Predict on the test set
dtest = xgb.DMatrix(X_test)
ans = model.predict(dtest)
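# Evaluate the predictions with mean squared error (imported above)
mse = mean_squared_error(y_test, ans)
print("MSE: %.2f" % mse)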
# Plot feature importance
plot_importance(model)
plt.show()
Tuning XGBoost with sklearn Grid Search
import pandas as pd
import xgboost as xgb
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
iris = load_iris()
X,y = iris.data,iris.target
col = iris.target_names
train_x, valid_x, train_y, valid_y = train_test_split(X, y, test_size=0.3, random_state=1)  # split into training and validation sets
parameters = {
'max_depth':[5, 10, 15, 20, 25],
'learning_rate':[0.01, 0.02, 0.05, 0.1, 0.15],
'n_estimators':[500, 1000, 2000, 3000, 5000],
'min_child_weight':[0, 2, 5, 10, 20],
'max_delta_step':[0, 0.2, 0.6, 1, 2],
'subsample':[0.6, 0.7, 0.8, 0.85, 0.95],
'colsample_bytree':[0.5, 0.6, 0.7, 0.8, 0.9],
'reg_alpha':[0, 0.25, 0.5, 0.75, 1],
'reg_lambda':[0.2, 0.4, 0.6, 0.8, 1],
'scale_pos_weight':[0.2, 0.4, 0.6, 0.8, 1]
}
xlf = xgb.XGBClassifier(max_depth=10,
learning_rate=0.01,
n_estimators=2000,
silent=True,
objective='multi:softmax',
num_class=3 ,
nthread=-1,
gamma=0,
min_child_weight=1,
max_delta_step=0,
subsample=0.85,
colsample_bytree=0.7,
colsample_bylevel=1,
reg_alpha=0,
reg_lambda=1,
scale_pos_weight=1,
seed=0,
missing=None)
gs = GridSearchCV(xlf, param_grid=parameters, scoring='accuracy', cv=3)
gs.fit(train_x, train_y)
print("Best score: %0.3f" % gs.best_score_)
print("Best parameters t: %s" % gs.best_params_ )
Tuning LightGBM with Grid Search
import lightgbm as lgb
from sklearn import metrics
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import pandas as pd  # used below to summarize the cv results
canceData=load_breast_cancer()
X=canceData.data
y=canceData.target
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0,test_size=0.2)
### Convert the data
print('Converting data')
lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, free_raw_data=False)
### Set the initial parameters (excluding the parameters to be cross-validated)
print('Setting parameters')
params ={
'boosting_type':'gbdt',
'objective':'binary',
'metric':'auc',
'nthread':4,
'learning_rate':0.1
}
### Cross-validation (parameter tuning)
print('Cross-validating')
max_auc = 0.0
best_params = {}
# Accuracy
print("Tuning step 1: improve accuracy")
for num_leaves in range(5, 100, 5):
    for max_depth in range(3, 8, 1):
        params['num_leaves'] = num_leaves
        params['max_depth'] = max_depth
        cv_results = lgb.cv(
            params,
            lgb_train,
            seed=1,
            nfold=5,
            metrics=['auc'],
            early_stopping_rounds=10,
            verbose_eval=True
        )
        mean_auc = pd.Series(cv_results['auc-mean']).max()
        boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
        if mean_auc >= max_auc:
            max_auc = mean_auc
            best_params['num_leaves'] = num_leaves
            best_params['max_depth'] = max_depth
if 'num_leaves' in best_params and 'max_depth' in best_params:
    params['num_leaves'] = best_params['num_leaves']
    params['max_depth'] = best_params['max_depth']
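A practical note (from LightGBM's general guidance, not this tutorial): since LightGBM grows trees leaf-wise, num_leaves should usually stay below 2**max_depth, otherwise the depth limit no longer constrains model complexity.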
# Overfitting
print("Tuning step 2: reduce overfitting")
for max_bin in range(5, 256, 10):
    for min_data_in_leaf in range(1, 102, 10):
        params['max_bin'] = max_bin
        params['min_data_in_leaf'] = min_data_in_leaf
        cv_results = lgb.cv(
            params,
            lgb_train,
            seed=1,
            nfold=5,
            metrics=['auc'],
            early_stopping_rounds=10,
            verbose_eval=True
        )
        mean_auc = pd.Series(cv_results['auc-mean']).max()
        boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
        if mean_auc >= max_auc: