A Guide to Machine Learning Model Optimization
Learning Objectives
After completing this chapter, you will be able to:
- Understand the concepts of overfitting and underfitting
- Apply common regularization techniques
- Use hyperparameter optimization methods effectively
- Put model ensembling techniques into practice
1. Fundamentals of Model Optimization
1.1 Overfitting and Underfitting
A model underfits when it is too simple to capture the underlying pattern, and overfits when it fits the noise in the training data instead of the signal. The example below makes both failure modes visible by fitting polynomials of increasing degree to the same noisy samples.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

def plot_model_complexity():
    """Visualize how models of different complexity fit the same data."""
    # Generate noisy samples of a cosine curve
    np.random.seed(0)
    X = np.sort(np.random.rand(30))
    y = np.cos(1.5 * np.pi * X) + np.random.normal(0, 0.1, 30)
    X = X.reshape(-1, 1)
    # Polynomial degrees for underfitting, a good fit, and overfitting
    degrees = [1, 4, 15]
    plt.figure(figsize=(15, 5))
    for i, degree in enumerate(degrees):
        plt.subplot(1, 3, i + 1)
        polynomial_features = PolynomialFeatures(degree=degree, include_bias=False)
        linear_regression = LinearRegression()
        pipeline = Pipeline([
            ("polynomial_features", polynomial_features),
            ("linear_regression", linear_regression)
        ])
        pipeline.fit(X, y)
        # Plot the fitted curve against the training points
        X_test = np.linspace(0, 1, 100).reshape(-1, 1)
        plt.scatter(X, y, label='Training points')
        plt.plot(X_test, pipeline.predict(X_test), label=f'Degree {degree}')
        plt.legend()
        plt.title(f'Polynomial Degree {degree}')
    plt.tight_layout()
    plt.show()
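Visual inspection works for toy data, but on real problems it is better to quantify the gap between training and validation performance. The sketch below is an illustration rather than part of the original example (the helper name is ours): it uses scikit-learn's validation_curve to score both sets as the polynomial degree grows, and a widening gap signals overfitting.

from sklearn.model_selection import validation_curve

def plot_degree_validation_curve(X, y):
    """Compare training and cross-validated R2 across polynomial degrees."""
    degrees = range(1, 16)
    pipeline = Pipeline([
        ("polynomial_features", PolynomialFeatures(include_bias=False)),
        ("linear_regression", LinearRegression())
    ])
    # validation_curve refits the pipeline once per candidate degree
    train_scores, val_scores = validation_curve(
        pipeline, X, y,
        param_name="polynomial_features__degree",
        param_range=degrees, cv=5
    )
    plt.plot(degrees, train_scores.mean(axis=1), label='Training score')
    plt.plot(degrees, val_scores.mean(axis=1), label='Cross-validation score')
    plt.xlabel('Polynomial degree')
    plt.ylabel('R2 score')
    plt.legend()
    plt.show()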
2. Regularization Techniques
2.1 L1 Regularization (Lasso)
L1 regularization penalizes the sum of the absolute coefficient values, which drives many coefficients exactly to zero and therefore doubles as a feature selector.
from sklearn.linear_model import Lasso
import numpy as np

def lasso_regression_example():
    """Example of L1 regularization."""
    # Generate data where only the first 5 of 20 features matter
    np.random.seed(42)
    X = np.random.randn(100, 20)
    true_coef = np.zeros(20)
    true_coef[:5] = np.random.randn(5)
    y = np.dot(X, true_coef) + np.random.randn(100) * 0.1
    # Larger alpha means stronger regularization and sparser coefficients
    alphas = [0.0001, 0.001, 0.01, 0.1, 1]
    for alpha in alphas:
        lasso = Lasso(alpha=alpha)
        lasso.fit(X, y)
        print(f'Alpha: {alpha}')
        print(f'Number of non-zero coefficients: {np.sum(lasso.coef_ != 0)}')
        print(f'R2 score (training set): {lasso.score(X, y)}\n')
2.2 L2 Regularization (Ridge)
L2 regularization penalizes the squared magnitude of the coefficients, shrinking them toward zero without eliminating them; this stabilizes the fit when features are correlated.
from sklearn.linear_model import Ridge

def ridge_regression_example(X, y):
    """Example of L2 regularization (pass in X and y, e.g. the data from the Lasso example)."""
    # Ridge shrinks coefficients smoothly; none become exactly zero
    alphas = [0.0001, 0.001, 0.01, 0.1, 1]
    for alpha in alphas:
        ridge = Ridge(alpha=alpha)
        ridge.fit(X, y)
        print(f'Alpha: {alpha}')
        print(f'Coefficient norm: {np.linalg.norm(ridge.coef_)}')
        print(f'R2 score (training set): {ridge.score(X, y)}\n')
2.3 Elastic Net
Elastic net mixes the L1 and L2 penalties; l1_ratio controls the balance, with 1.0 giving pure Lasso and 0.0 pure Ridge.
from sklearn.linear_model import ElasticNet

def elastic_net_example(X, y):
    """Elastic net example (pass in X and y, e.g. the data from the Lasso example)."""
    # l1_ratio=0.5 is an equal mix of the L1 and L2 penalties
    elastic = ElasticNet(alpha=0.1, l1_ratio=0.5)
    elastic.fit(X, y)
    print('Number of non-zero coefficients:', np.sum(elastic.coef_ != 0))
    print('R2 score (training set):', elastic.score(X, y))
2.4 Dropout (Deep Learning)
Dropout randomly zeroes a fraction of activations during training so that units cannot co-adapt; at inference time the full network is used.
import tensorflow as tf

def create_model_with_dropout():
    """Create a neural network with Dropout layers."""
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.3),  # drop 30% of activations while training
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(1)
    ])
    return model
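A minimal usage sketch (the data here is random and purely illustrative): Keras applies dropout only during fit(); predict() and evaluate() run the full network.

import numpy as np

# Random data standing in for a real dataset
X_train = np.random.randn(256, 32).astype('float32')
y_train = np.random.randn(256, 1).astype('float32')

model = create_model_with_dropout()
model.compile(optimizer='adam', loss='mse')
# Dropout is active during fit(), disabled during predict()/evaluate()
model.fit(X_train, y_train, epochs=5, validation_split=0.2, verbose=0)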
3. Hyperparameter Optimization
3.1 Grid Search
Grid search evaluates every combination in the parameter grid with cross-validation. It is exhaustive, but its cost multiplies with each added parameter.
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

def grid_search_example(X, y):
    """Grid search example."""
    # 3 * 4 * 3 = 36 combinations, each scored with 5-fold CV
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, 30, None],
        'min_samples_split': [2, 5, 10]
    }
    rf = RandomForestClassifier()
    grid_search = GridSearchCV(
        rf, param_grid, cv=5,
        scoring='accuracy',
        n_jobs=-1
    )
    grid_search.fit(X, y)
    print("Best parameters:", grid_search.best_params_)
    print("Best score:", grid_search.best_score_)
    return grid_search
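Because refit=True is the default, the returned search object has already retrained the best configuration on all of the supplied data. A short usage note with synthetic data (the dataset and variable names are illustrative):

from sklearn.datasets import make_classification

# Synthetic data standing in for a real classification problem
X_demo, y_demo = make_classification(n_samples=300, n_features=20, random_state=0)
search = grid_search_example(X_demo, y_demo)
best_model = search.best_estimator_  # the refit RandomForestClassifier
print(best_model.predict(X_demo[:5]))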
3.2 Random Search
Random search draws a fixed number of configurations from the given distributions; when only a few parameters really matter, it usually finds good settings much faster than an exhaustive grid.
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

def random_search_example(X, y):
    """Random search example."""
    # Note: scipy's uniform(loc, scale) samples from [loc, loc + scale],
    # so max_features is drawn from [0.1, 1.0]
    param_distributions = {
        'n_estimators': randint(100, 500),
        'max_depth': randint(10, 50),
        'min_samples_split': randint(2, 20),
        'max_features': uniform(0.1, 0.9)
    }
    rf = RandomForestClassifier()
    random_search = RandomizedSearchCV(
        rf, param_distributions,
        n_iter=100, cv=5,
        scoring='accuracy',
        n_jobs=-1
    )
    random_search.fit(X, y)
    print("Best parameters:", random_search.best_params_)
    print("Best score:", random_search.best_score_)
    return random_search
3.3 Bayesian Optimization
Bayesian optimization fits a surrogate model to the scores observed so far and uses it to choose the next promising configuration, typically needing fewer evaluations than random search. BayesSearchCV below comes from the scikit-optimize package.
from skopt import BayesSearchCV
from skopt.space import Real, Integer

def bayes_search_example(X, y):
    """Bayesian optimization example."""
    search_spaces = {
        'n_estimators': Integer(100, 500),
        'max_depth': Integer(10, 50),
        'min_samples_split': Integer(2, 20),
        'max_features': Real(0.1, 0.9)
    }
    rf = RandomForestClassifier()
    bayes_search = BayesSearchCV(
        rf, search_spaces, n_iter=50,
        cv=5, scoring='accuracy',
        n_jobs=-1
    )
    bayes_search.fit(X, y)
    print("Best parameters:", bayes_search.best_params_)
    print("Best score:", bayes_search.best_score_)
    return bayes_search
4. Model Ensembling
4.1 Voting Ensembles
A voting classifier combines several heterogeneous models. With voting='soft' it averages predicted class probabilities, which is why the SVC below is created with probability=True.
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

def voting_ensemble_example(X, y):
    """Voting ensemble example."""
    # Base models; SVC needs probability=True to take part in soft voting
    clf1 = LogisticRegression()
    clf2 = DecisionTreeClassifier()
    clf3 = SVC(probability=True)
    # 'soft' voting averages the predicted class probabilities
    voting_clf = VotingClassifier(
        estimators=[
            ('lr', clf1),
            ('dt', clf2),
            ('svc', clf3)
        ],
        voting='soft'
    )
    voting_clf.fit(X, y)
    return voting_clf
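To check that the ensemble actually helps, compare it against its members with cross-validation. A sketch on synthetic data (the dataset and variable names are illustrative):

from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=500, n_features=20, random_state=0)
candidates = {
    'LogisticRegression': LogisticRegression(),
    'DecisionTree': DecisionTreeClassifier(),
    'SVC': SVC(probability=True),
    'VotingEnsemble': voting_ensemble_example(X_demo, y_demo)
}
for name, clf in candidates.items():
    # cross_val_score clones each model, so prior fitting is irrelevant
    scores = cross_val_score(clf, X_demo, y_demo, cv=5)
    print(f'{name}: {scores.mean():.3f} (+/- {scores.std():.3f})')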
4.2 Bagging
Bagging trains many copies of one base model on bootstrap samples (here, also on random feature subsets) and aggregates their votes, which mainly reduces variance.
from sklearn.ensemble import BaggingClassifier

def bagging_example(X, y):
    """Bagging example."""
    # Note: scikit-learn < 1.2 calls this keyword 'base_estimator'
    bagging = BaggingClassifier(
        estimator=DecisionTreeClassifier(),
        n_estimators=100,
        max_samples=0.8,   # each tree sees 80% of the samples
        max_features=0.8   # ...and 80% of the features
    )
    bagging.fit(X, y)
    return bagging
4.3 Boosting
Boosting builds models sequentially, each one focusing on the errors the ensemble still makes; it mainly reduces bias.
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

def boosting_comparison(X, y):
    """Compare different boosting implementations."""
    # GBDT (scikit-learn)
    gbdt = GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=0.1
    )
    # XGBoost
    xgb = XGBClassifier(
        n_estimators=100,
        learning_rate=0.1
    )
    # LightGBM
    lgb = LGBMClassifier(
        n_estimators=100,
        learning_rate=0.1
    )
    models = {
        'GBDT': gbdt,
        'XGBoost': xgb,
        'LightGBM': lgb
    }
    results = {}
    for name, model in models.items():
        model.fit(X, y)
        # Note: this is accuracy on the training data; use a held-out set
        # or cross-validation for a fair comparison
        score = model.score(X, y)
        results[name] = score
    return results
4.4 Stacking
Stacking trains a meta-model (here, logistic regression) on the cross-validated predictions of the base models instead of simply averaging them.
from sklearn.ensemble import StackingClassifier

def stacking_example(X, y):
    """Stacking ensemble example."""
    estimators = [
        ('rf', RandomForestClassifier()),
        ('gbdt', GradientBoostingClassifier()),
        ('xgb', XGBClassifier())
    ]
    # The final estimator learns from cross-validated base-model predictions
    stacking = StackingClassifier(
        estimators=estimators,
        final_estimator=LogisticRegression()
    )
    stacking.fit(X, y)
    return stacking
5. Case Study: Credit Card Fraud Detection
5.1 A Complete Optimization Workflow
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

def credit_card_fraud_detection():
    """End-to-end credit card fraud detection example."""
    # 1. Load the data
    data = pd.read_csv('creditcard.csv')
    X = data.drop('Class', axis=1)
    y = data['Class']
    # 2. Split first; stratify preserves the heavily skewed class ratio
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    # 3. Scale, fitting on the training set only to avoid leaking
    # test-set statistics into preprocessing
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    # 4. Build and tune the model; F1 suits the imbalanced classes
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [10, 20],
        'learning_rate': [0.01, 0.1]
    }
    xgb = XGBClassifier()
    grid_search = GridSearchCV(
        xgb, param_grid, cv=3,
        scoring='f1', n_jobs=-1
    )
    # 5. Train
    grid_search.fit(X_train, y_train)
    # 6. Evaluate on the held-out test set
    y_pred = grid_search.predict(X_test)
    print(classification_report(y_test, y_pred))
    return grid_search
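In this dataset the fraud class is typically well under 1% of the rows, so even a tuned model can benefit from reweighting the positive class. A minimal sketch using XGBoost's scale_pos_weight, assuming the train/test variables from the function above are in scope:

import numpy as np

# Weight the positive (fraud) class by the negative-to-positive ratio
neg, pos = np.bincount(y_train)
xgb_weighted = XGBClassifier(scale_pos_weight=neg / pos)
xgb_weighted.fit(X_train, y_train)
print(classification_report(y_test, xgb_weighted.predict(X_test)))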
Frequently Asked Questions
Q: How do I choose a regularization method? A: Consider the following factors:
- L1 regularization (Lasso) is well suited to feature selection
- L2 regularization (Ridge) handles multicollinearity well
- Elastic net combines the strengths of both
- The number of samples and the number of features also matter
Q: How do I avoid overfitting? A: Common methods:
- Collect more training data
- Apply regularization
- Simplify the model
- Use cross-validation
- Perform feature selection
- Stop training early (see the sketch after this list)
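A minimal sketch of the last item, reusing the Keras model from section 2.4 on random, purely illustrative data: training stops once the validation loss stops improving, and the best weights seen so far are restored.

import numpy as np
import tensorflow as tf

X = np.random.randn(512, 32).astype('float32')
y = np.random.randn(512, 1).astype('float32')

model = create_model_with_dropout()
model.compile(optimizer='adam', loss='mse')
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',        # watch the validation loss
    patience=5,                # stop after 5 epochs without improvement
    restore_best_weights=True
)
model.fit(X, y, epochs=100, validation_split=0.2,
          callbacks=[early_stop], verbose=0)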
Q: What are the pros and cons of ensemble learning? A: Pros:
- Better predictive performance
- Less overfitting
- More stable predictions
Cons:
- Higher computational cost
- Greater model complexity
- Reduced interpretability