【AI】随机森林、lgb与xgb的代码实践

Monday, August 21, 2023
本文共542字
2分钟阅读时长
posts , AI

⚠️本文是作者P3troL1er原创,首发于https://peterliuzhi.top/posts/ai%E9%9A%8F%E6%9C%BA%E6%A3%AE%E6%9E%97lgb%E4%B8%8Exgb%E7%9A%84%E4%BB%A3%E7%A0%81%E5%AE%9E%E8%B7%B5/。商业转载请联系作者获得授权,非商业转载请注明出处!

The exercise of an extraordinary gift is the supremest pleasure in life. — Mark Twain

scikit-learn实现随机森林

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Load the breast-cancer dataset that ships with scikit-learn.
data = load_breast_cancer()
# Show the class names so the meaning of labels 0 and 1 is visible.
print(data.target_names)
X = pd.DataFrame(data=data.data, columns=data.feature_names)
# Labels as a 1-D Series (named 0, exactly as squeezing a one-column frame would).
y = pd.Series(data.target, name=0)
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

['malignant' 'benign']

# n_jobs=-1 lets the forest use every available CPU core.
# random_state pins the bootstrap sampling and per-split feature sampling so
# that the feature importances and metrics below are reproducible from run to
# run (the train/test split is seeded with 42 as well).
clf = RandomForestClassifier(n_jobs=-1, random_state=42)
clf.fit(X_train, y_train)
# Impurity-based importance of every feature, as computed by the fitted forest.
importances = clf.feature_importances_
features = X_train.columns
# Indices that order the features from least to most important.
indices = np.argsort(importances)

# Horizontal bar chart; barh draws bottom-up, so the most important
# feature ends up at the top of the figure.
plt.figure(figsize=(10, 10))
plt.barh(
    features.to_numpy()[indices],
    importances[indices],
    align="center",
)
plt.title("Feature Importances")
plt.xlabel('Importance')
plt.ylabel('Features')
plt.show()

png

# RandomForestClassifier.predict() already returns hard class labels, so
# comparing its output with 0.5 would be a no-op. Take the predicted
# probability of class 1 ('benign', the second entry of target_names) instead,
# so that 0.5 below is a genuine, tunable decision threshold.
y_proba = clf.predict_proba(X_test)[:, 1]
# Convert probabilities into 0/1 labels; 0.5 is the confidence threshold.
y_pred = [1 if proba > 0.5 else 0 for proba in y_proba]
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"{accuracy=}, {precision=}, {recall=}, {f1=}")

accuracy=0.9649122807017544, precision=0.958904109589041, recall=0.9859154929577465, f1=0.9722222222222222

lightgbm

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Load the breast-cancer dataset that ships with scikit-learn.
data = load_breast_cancer()
# Show the class names so the meaning of labels 0 and 1 is visible.
print(data.target_names)
X = pd.DataFrame(data=data.data, columns=data.feature_names)
# Labels as a 1-D Series (named 0, exactly as squeezing a one-column frame would).
y = pd.Series(data.target, name=0)
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Wrap the training split in LightGBM's native Dataset container.
d_train = lgb.Dataset(X_train, label=y_train)

['malignant' 'benign']

# Training parameters for the native LightGBM API.
params = dict(
    objective='binary',       # binary classification
    boosting_type='gbdt',     # classic gradient-boosted decision trees
    subsample=0.623,          # row-sampling ratio (alias of bagging_fraction)
    colsample_bytree=0.7,     # column-sampling ratio (alias of feature_fraction)
    num_leaves=127,           # upper bound on leaves per tree
    max_depth=8,              # upper bound on tree depth
    seed=42,                  # fixed seed for reproducibility
    bagging_freq=1,           # re-sample rows at every iteration
    n_jobs=-1,                # thread-count setting (alias of num_threads)
)
# Train for 100 boosting rounds.
clf = lgb.train(params, d_train, num_boost_round=100)
# Plot gain-based feature importances.
lgb.plot_importance(
    clf,
    height=0.8,
    title="Feature Importances",
    xlabel="Importance",
    figsize=(10, 10),
    grid=False,
    importance_type="gain",
)

<Axes: title={'center': 'Feature Importances'}, xlabel='Importance', ylabel='Features'>

png

# With the 'binary' objective the booster predicts a score per row; turn each
# score into a 0/1 label using 0.5 as the confidence threshold.
y_pred = [int(score > 0.5) for score in clf.predict(X_test)]

# Evaluate all four metrics against the held-out labels in one pass.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"{accuracy=}, {precision=}, {recall=}, {f1=}")

accuracy=0.9736842105263158, precision=0.9722222222222222, recall=0.9859154929577465, f1=0.979020979020979

xgboost

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import xgboost as xgb
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Load the breast-cancer dataset that ships with scikit-learn.
data = load_breast_cancer()
# Show the class names so the meaning of labels 0 and 1 is visible.
print(data.target_names)
X = pd.DataFrame(data=data.data, columns=data.feature_names)
# Labels as a 1-D Series (named 0, exactly as squeezing a one-column frame would).
y = pd.Series(data.target, name=0)
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# XGBoost's native API works on DMatrix objects; the test matrix carries no labels.
d_train = xgb.DMatrix(X_train, label=y_train)
d_test = xgb.DMatrix(X_test)

['malignant' 'benign']

# Training parameters for the native XGBoost API.
params = dict(
    [
        ('objective', 'binary:logistic'),  # binary classification, outputs probabilities
        ('booster', 'gbtree'),             # tree-based booster
        ('subsample', 0.8),                # row-sampling ratio per tree
        ('colsample_bytree', 0.8),         # column-sampling ratio per tree
        ('eta', 1),                        # learning rate; 1 means no shrinkage
        ('max_depth', 6),                  # upper bound on tree depth
        ('seed', 42),                      # fixed seed for reproducibility
        ('n_jobs', -1),                    # thread-count setting
    ]
)
# Train the model for 100 boosting rounds.
clf = xgb.train(params, d_train, num_boost_round=100)
# Plot gain-based feature importances.
xgb.plot_importance(
    clf,
    height=0.8,
    title="Feature Importances",
    xlabel="Importance",
    grid=False,
    importance_type="gain",
)

<Axes: title={'center': 'Feature Importances'}, xlabel='Importance', ylabel='Features'>

png

# With 'binary:logistic' the booster predicts a probability per row; turn each
# one into a 0/1 label using 0.5 as the confidence threshold.
y_pred = [int(score > 0.5) for score in clf.predict(d_test)]

# Evaluate all four metrics against the held-out labels in one pass.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"{accuracy=}, {precision=}, {recall=}, {f1=}")

accuracy=0.9736842105263158, precision=0.9722222222222222, recall=0.9859154929577465, f1=0.979020979020979