Advanced Regression: master advanced regression methods to tackle complex prediction problems

Advanced Regression Methods

Learning Objectives

After completing this module, you will be able to:

  • Understand and implement a range of advanced regression algorithms
  • Apply regularization techniques to prevent overfitting
  • Handle nonlinear regression problems
  • Use ensemble methods to improve regression performance

Prerequisites

  • Basic Python programming
  • Fundamentals of linear algebra
  • Basic statistical concepts
  • Machine learning fundamentals

1. Implementing Linear Regression from Scratch

1.1 Basic Theory

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression

class LinearRegression:
    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.lr = learning_rate
        self.n_iterations = n_iterations
        self.weights = None
        self.bias = None
        
    def fit(self, X, y):
        n_samples, n_features = X.shape
        
        # Initialize parameters
        self.weights = np.zeros(n_features)
        self.bias = 0
        
        # Gradient descent
        for _ in range(self.n_iterations):
            y_predicted = np.dot(X, self.weights) + self.bias
            
            # Compute gradients
            dw = (1/n_samples) * np.dot(X.T, (y_predicted - y))
            db = (1/n_samples) * np.sum(y_predicted - y)
            
            # Update parameters
            self.weights -= self.lr * dw
            self.bias -= self.lr * db
            
    def predict(self, X):
        return np.dot(X, self.weights) + self.bias
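
A quick sanity check of the from-scratch class on synthetic data (a minimal sketch using the make_regression import above; the hyperparameter values are illustrative):

# Generate a simple 1-D regression problem and fit the from-scratch model
X_demo, y_demo = make_regression(n_samples=200, n_features=1, noise=10, random_state=42)

demo_model = LinearRegression(learning_rate=0.05, n_iterations=2000)
demo_model.fit(X_demo, y_demo)
print("Learned weight:", demo_model.weights, "bias:", demo_model.bias)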

1.2 Visualization and Analysis

def plot_regression_line(X, y, model):
    """Plot the data points and the fitted regression line (for 1-D X)."""
    plt.scatter(X, y, color='blue')
    plt.plot(X, model.predict(X), color='red')
    plt.title('Linear Regression')
    plt.xlabel('X')
    plt.ylabel('y')
    plt.show()
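
Continuing the demo above, the helper can be used to eyeball the fit (this assumes X_demo, y_demo, and demo_model from the previous snippet):

# Report training error and plot the fitted line for the 1-D demo data
mse = np.mean((demo_model.predict(X_demo) - y_demo) ** 2)
print(f"Training MSE: {mse:.2f}")
plot_regression_line(X_demo, y_demo, demo_model)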

2. Advanced Regression Methods

2.1 Ridge Regression (L2 Regularization)

from sklearn.linear_model import Ridge

class RidgeRegression:
    """Thin wrapper around scikit-learn's Ridge estimator (L2-penalized least squares)."""
    def __init__(self, alpha=1.0):
        self.alpha = alpha
        self.model = Ridge(alpha=self.alpha)
        
    def fit(self, X, y):
        self.model.fit(X, y)
        
    def predict(self, X):
        return self.model.predict(X)
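
For intuition, ridge regression also has a closed-form solution; below is a minimal from-scratch sketch (separate from the wrapper above, and assuming centered data so that no intercept term is needed):

# Closed-form ridge solution: w = (XᵀX + αI)⁻¹ Xᵀy
# Assumes X and y are centered, so the intercept term is omitted
def ridge_closed_form(X, y, alpha=1.0):
    n_features = X.shape[1]
    A = X.T @ X + alpha * np.eye(n_features)
    return np.linalg.solve(A, X.T @ y)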

2.2 Lasso Regression (L1 Regularization)

from sklearn.linear_model import Lasso

class LassoRegression:
    """Thin wrapper around scikit-learn's Lasso estimator (L1-penalized least squares)."""
    def __init__(self, alpha=1.0):
        self.alpha = alpha
        self.model = Lasso(alpha=self.alpha)
        
    def fit(self, X, y):
        self.model.fit(X, y)
        
    def predict(self, X):
        return self.model.predict(X)
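
A short illustration of the sparsity produced by the L1 penalty (a sketch on synthetic data; the dataset sizes and alpha are arbitrary):

# With only 5 informative features out of 20, Lasso drives most coefficients to zero
X_sp, y_sp = make_regression(n_samples=200, n_features=20, n_informative=5,
                             noise=5, random_state=0)
lasso_demo = LassoRegression(alpha=1.0)
lasso_demo.fit(X_sp, y_sp)
print("Non-zero coefficients:", int(np.sum(lasso_demo.model.coef_ != 0)), "of", X_sp.shape[1])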

2.3 Elastic Net

from sklearn.linear_model import ElasticNet

class ElasticNetRegression:
    """Thin wrapper around scikit-learn's ElasticNet estimator (mixed L1/L2 penalty)."""
    def __init__(self, alpha=1.0, l1_ratio=0.5):
        self.alpha = alpha
        self.l1_ratio = l1_ratio
        self.model = ElasticNet(alpha=self.alpha, l1_ratio=self.l1_ratio)
        
    def fit(self, X, y):
        self.model.fit(X, y)
        
    def predict(self, X):
        return self.model.predict(X)
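
Because the three wrappers share the same fit/predict interface, their shrinkage behaviour can be compared side by side (a sketch; the data and alpha values are arbitrary):

# Compare how strongly each penalty shrinks the coefficients on the same data
X_cmp, y_cmp = make_regression(n_samples=200, n_features=10, noise=5, random_state=1)
for reg in (RidgeRegression(alpha=1.0), LassoRegression(alpha=1.0),
            ElasticNetRegression(alpha=1.0, l1_ratio=0.5)):
    reg.fit(X_cmp, y_cmp)
    print(type(reg).__name__, "| coefficient L1 norm:",
          round(float(np.sum(np.abs(reg.model.coef_))), 2))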

3. Regularization Techniques

3.1 L1 Regularization (Lasso)

  • Property: produces sparse solutions (many coefficients become exactly zero)
  • Use case: feature selection
  • Mathematical expression: penalty = α · Σ|wᵢ|, implemented below:

def l1_penalty(weights, alpha):
    """Compute the L1 regularization term."""
    return alpha * np.sum(np.abs(weights))

3.2 L2 Regularization (Ridge)

  • Property: shrinks coefficients smoothly, which helps prevent overfitting
  • Use case: handling multicollinearity
  • Mathematical expression: penalty = α · Σwᵢ², implemented below:

def l2_penalty(weights, alpha):
    """Compute the L2 regularization term."""
    return alpha * np.sum(weights ** 2)
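
Either penalty is simply added to the data-fitting loss; here is a minimal sketch combining the two helpers above with mean squared error (the function name is illustrative):

# Regularized training objective = mean squared error + penalty term
def regularized_mse(X, y, weights, bias, alpha, penalty="l2"):
    residuals = X @ weights + bias - y
    mse = np.mean(residuals ** 2)
    if penalty == "l1":
        return mse + l1_penalty(weights, alpha)
    return mse + l2_penalty(weights, alpha)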

3.3 Choosing the Regularization Parameter

from sklearn.model_selection import GridSearchCV

def find_best_alpha(X, y, model_class):
    """网格搜索找最佳正则化参数"""
    param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
    grid_search = GridSearchCV(model_class(), param_grid, cv=5)
    grid_search.fit(X, y)
    return grid_search.best_params_['alpha']
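
Example usage (a sketch on synthetic data; the best values depend entirely on the data at hand):

# Tune Ridge and Lasso separately on the same synthetic data
X_tune, y_tune = make_regression(n_samples=300, n_features=10, noise=10, random_state=2)
print("Best Ridge alpha:", find_best_alpha(X_tune, y_tune, Ridge))
print("Best Lasso alpha:", find_best_alpha(X_tune, y_tune, Lasso))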

4. Nonlinear Regression

4.1 Polynomial Regression

from sklearn.preprocessing import PolynomialFeatures
# Use scikit-learn's closed-form linear solver here; the from-scratch gradient-descent
# class from Section 1 would need feature scaling to converge well for higher degrees
from sklearn.linear_model import LinearRegression as SKLinearRegression

class PolynomialRegression:
    def __init__(self, degree=2):
        self.degree = degree
        self.poly_features = PolynomialFeatures(degree=degree)
        self.linear_regression = SKLinearRegression()
        
    def fit(self, X, y):
        X_poly = self.poly_features.fit_transform(X)
        self.linear_regression.fit(X_poly, y)
        
    def predict(self, X):
        X_poly = self.poly_features.transform(X)
        return self.linear_regression.predict(X_poly)
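
A quick check on data with a quadratic trend that a straight line cannot capture (a sketch; the synthetic data are arbitrary):

# Fit a degree-2 polynomial to noisy quadratic data
rng = np.random.default_rng(0)
X_quad = np.linspace(-3, 3, 100).reshape(-1, 1)
y_quad = 0.5 * X_quad.ravel() ** 2 - X_quad.ravel() + rng.normal(0, 0.5, 100)

poly_model = PolynomialRegression(degree=2)
poly_model.fit(X_quad, y_quad)
print("Predictions at x = -3, 0, 3:",
      np.round(poly_model.predict(np.array([[-3.0], [0.0], [3.0]])), 2))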

4.2 Spline Regression

from scipy.interpolate import make_interp_spline

def spline_regression(X, y, k=3):
    """Fit a degree-k interpolating spline; for noisy data, a smoothing spline
    (e.g. scipy.interpolate.UnivariateSpline) is closer to true regression."""
    # make_interp_spline requires strictly increasing x values, so sort the data first
    order = np.argsort(X.flatten())
    spl = make_interp_spline(X.flatten()[order], np.asarray(y)[order], k=k)
    X_new = np.linspace(X.min(), X.max(), 200)
    y_new = spl(X_new)
    return X_new, y_new
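
Example on lightly noisy sine data (a sketch; note that the interpolating spline passes through every sample, so heavy noise would be reproduced as wiggles):

# Fit and plot the spline helper on noisy sine data
rng = np.random.default_rng(1)
X_sin = np.sort(rng.uniform(0, 2 * np.pi, 40)).reshape(-1, 1)
y_sin = np.sin(X_sin).ravel() + rng.normal(0, 0.05, 40)

X_smooth, y_smooth = spline_regression(X_sin, y_sin)
plt.scatter(X_sin, y_sin, color='blue')
plt.plot(X_smooth, y_smooth, color='red')
plt.title('Spline Fit')
plt.show()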

5. Case Study: House Price Prediction

5.1 Data Preparation

from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler

def prepare_housing_data():
    """准备加州房价数据集"""
    housing = fetch_california_housing()
    X = housing.data
    y = housing.target
    
    # Standardize features (zero mean, unit variance)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    return X_scaled, y
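
A quick look at the prepared data (a sketch; fetch_california_housing downloads the dataset on first use):

# Inspect the shapes and split off a held-out test set
from sklearn.model_selection import train_test_split

X_housing, y_housing = prepare_housing_data()
print("Features:", X_housing.shape, "Target:", y_housing.shape)
X_train, X_test, y_train, y_test = train_test_split(
    X_housing, y_housing, test_size=0.2, random_state=42)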

5.2 Model Comparison

from sklearn.linear_model import LinearRegression as SKLinearRegression
from sklearn.model_selection import cross_val_score

def compare_regression_models(X, y):
    """Compare the cross-validated performance of several regression models."""
    # cross_val_score needs scikit-learn estimators, so use sklearn's
    # LinearRegression here rather than the from-scratch class from Section 1
    models = {
        'Linear': SKLinearRegression(),
        'Ridge': Ridge(alpha=1.0),
        'Lasso': Lasso(alpha=1.0),
        'ElasticNet': ElasticNet(alpha=1.0, l1_ratio=0.5)
    }
    
    results = {}
    for name, model in models.items():
        # Cross-validated score (negative MSE, converted to RMSE below)
        scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
        rmse = np.sqrt(-scores.mean())
        results[name] = rmse
        
    return results
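
Running the comparison on the prepared housing data (a sketch; exact RMSE values depend on the data split and library version):

# Report cross-validated RMSE (the target is in units of $100,000) for each model
X_housing, y_housing = prepare_housing_data()
for name, rmse in compare_regression_models(X_housing, y_housing).items():
    print(f"{name}: RMSE = {rmse:.3f}")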

Frequently Asked Questions

Q: How do I choose an appropriate regularization method?
A: It depends on the characteristics of the problem:

  • L1 regularization: when feature selection is needed
  • L2 regularization: when dealing with multicollinearity
  • Elastic net: when both effects are needed

Q: How do I handle nonlinear relationships?
A: The following approaches can be used:

  • Polynomial regression
  • Spline regression
  • Kernel methods
  • Neural networks

Q: How do I avoid overfitting?
A: Common strategies include:

  • Apply regularization
  • Reduce model complexity
  • Add more training data
  • Use cross-validation

Further Reading