【AI】随机森林、lgb与xgb的代码实践
Monday, August 21, 2023
本文共542字
2分钟阅读时长
⚠️本文是作者P3troL1er原创,首发于https://peterliuzhi.top/posts/ai%E9%9A%8F%E6%9C%BA%E6%A3%AE%E6%9E%97lgb%E4%B8%8Exgb%E7%9A%84%E4%BB%A3%E7%A0%81%E5%AE%9E%E8%B7%B5/。商业转载请联系作者获得授权,非商业转载请注明出处!
The exercise of an extraordinary gift is the supremest pleasure in life.
— Mark Twain
scikit-learn实现随机森林
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Load the breast-cancer dataset that ships with scikit-learn and show
# which class names the integer labels stand for.
data = load_breast_cancer()
print(data.target_names)

# Features go into a DataFrame with named columns; the labels become a
# Series by taking the only column of a one-column DataFrame.
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = pd.DataFrame(data.target).iloc[:, 0]

# Hold out 20% of the rows for testing; the split is seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
['malignant' 'benign']
# Train the random forest.
# n_jobs=-1 lets scikit-learn use every available CPU core.
# random_state pins the forest's internal randomness so repeated runs build
# the same model; without it the metrics printed below change on every run.
clf = RandomForestClassifier(n_jobs=-1, random_state=42)
clf.fit(X_train, y_train)

features = X_train.columns
importances = clf.feature_importances_
# Sort ascending: barh draws the first entry at the bottom, so the most
# important feature ends up at the top of the chart.
indices = np.argsort(importances)
# Plot the feature importances as a horizontal bar chart
plt.figure(figsize=(10, 10))
plt.title("Feature Importances")
plt.barh(np.array(features)[indices], importances[indices], align="center")
plt.xlabel('Importance')
plt.ylabel('Features')
plt.show()

# clf.predict() returns hard class labels, so thresholding its output does
# nothing. Take the predicted probability of class 1 instead and apply the
# 0.5 decision threshold to that, mirroring the LightGBM/XGBoost sections.
y_prob = clf.predict_proba(X_test)[:, 1]
y_pred = [1 if prob > 0.5 else 0 for prob in y_prob]

# Score the thresholded predictions against the held-out labels
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"{accuracy=}, {precision=}, {recall=}, {f1=}")
accuracy=0.9649122807017544, precision=0.958904109589041, recall=0.9859154929577465, f1=0.9722222222222222
lightgbm
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Load the breast-cancer dataset and show the class names behind the labels
data = load_breast_cancer()
print(data.target_names)

# Named feature columns in a DataFrame; labels as a Series taken from the
# only column of a one-column DataFrame.
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = pd.DataFrame(data.target).iloc[:, 0]

# Seeded 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Wrap the training split in LightGBM's own dataset container
d_train = lgb.Dataset(X_train, label=y_train)
['malignant' 'benign']
# Booster configuration for a binary-classification GBDT model
params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'subsample': 0.623,
    'colsample_bytree': 0.7,
    'num_leaves': 127,
    'max_depth': 8,
    'seed': 42,
    'bagging_freq': 1,
    'n_jobs': -1
}

# Train for 100 boosting rounds on the training dataset
clf = lgb.train(params, d_train, num_boost_round=100)

# Plot feature importances measured by gain
lgb.plot_importance(
    clf,
    height=0.8,
    title="Feature Importances",
    xlabel="Importance",
    figsize=(10, 10),
    grid=False,
    importance_type="gain",
)
<Axes: title={'center': 'Feature Importances'}, xlabel='Importance', ylabel='Features'>
# Turn the booster's per-row scores into hard 0/1 labels;
# 0.5 is the decision threshold (scores above it count as class 1).
y_pred = [int(score > 0.5) for score in clf.predict(X_test)]

# Compute the four classification metrics against the held-out labels
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
print(f"{accuracy=}, {precision=}, {recall=}, {f1=}")
accuracy=0.9736842105263158, precision=0.9722222222222222, recall=0.9859154929577465, f1=0.979020979020979
xgboost
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import xgboost as xgb
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Load the breast-cancer dataset and show the class names behind the labels
data = load_breast_cancer()
print(data.target_names)

# Named feature columns in a DataFrame; labels as a Series taken from the
# only column of a one-column DataFrame.
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = pd.DataFrame(data.target).iloc[:, 0]

# Seeded 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# XGBoost's native API works on DMatrix objects; the test matrix carries
# features only because its labels are used solely for scoring later.
d_train = xgb.DMatrix(X_train, label=y_train)
d_test = xgb.DMatrix(X_test)
['malignant' 'benign']
# Booster configuration for a binary-classification tree model
params = {
    'objective': 'binary:logistic',
    'booster': 'gbtree',
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'eta': 1,
    'max_depth': 6,
    'seed': 42,
    'n_jobs': -1
}

# Train the model for 100 boosting rounds
clf = xgb.train(params, d_train, num_boost_round=100)

# Plot feature importances measured by gain
xgb.plot_importance(
    clf,
    height=0.8,
    title="Feature Importances",
    xlabel="Importance",
    grid=False,
    importance_type="gain",
)
<Axes: title={'center': 'Feature Importances'}, xlabel='Importance', ylabel='Features'>
# Turn the booster's per-row scores into hard 0/1 labels;
# 0.5 is the decision threshold (scores above it count as class 1).
y_pred = [int(score > 0.5) for score in clf.predict(d_test)]

# Compute the four classification metrics against the held-out labels
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
print(f"{accuracy=}, {precision=}, {recall=}, {f1=}")
accuracy=0.9736842105263158, precision=0.9722222222222222, recall=0.9859154929577465, f1=0.979020979020979
扫码阅读此文章
点击按钮复制分享信息
点击订阅