特征工程实践指南
学习目标
完成本章学习后,你将能够:
- 理解特征工程在机器学习中的重要性
- 掌握常用的特征选择和特征提取方法
- 熟练运用特征变换和特征组合技术
- 构建高效的特征工程pipeline
1. 特征工程概述
1.1 什么是特征工程
特征工程是将原始数据转换为更好的特征表示的过程,使机器学习算法能够更好地工作。它是机器学习中最重要的步骤之一,往往对模型性能有决定性影响。
1.2 特征工程的重要性
# Example: the impact of feature engineering on model performance
# NOTE: load_boston was removed in scikit-learn 1.2 (ethical concerns with the
# dataset); the California housing dataset is the recommended replacement.
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import numpy as np
# Load data
housing = fetch_california_housing()
X, y = housing.data, housing.target
# Use one fixed split so the two scores below are actually comparable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Without feature engineering
model = LinearRegression()
model.fit(X_train, y_train)
print("原始特征得分:", model.score(X_test, y_test))
# With standardization — fit the scaler on the TRAINING data only,
# then apply it to the test data, to avoid data leakage
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
model.fit(X_train_scaled, y_train)
print("标准化后得分:", model.score(X_test_scaled, y_test))
2. 特征选择
2.1 过滤法(Filter Methods)
基于统计指标选择特征。
from sklearn.feature_selection import SelectKBest, f_classif
def filter_features(X, y, k=5):
    """Select the k most informative features using the ANOVA F-test.

    Returns the reduced feature matrix together with a boolean mask
    marking which original columns were kept.
    """
    chooser = SelectKBest(score_func=f_classif, k=k)
    reduced = chooser.fit_transform(X, y)
    mask = chooser.get_support()
    return reduced, mask
# Example usage
X_selected, mask = filter_features(X, y)
selected_features = X.columns[mask]  # assumes X is a pandas DataFrame — a plain ndarray has no .columns
2.2 包装法(Wrapper Methods)
使用目标算法的性能来评估特征子集。
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
def wrapper_feature_selection(X, y, n_features):
    """Choose a feature subset with recursive feature elimination (RFE).

    A 100-tree random forest serves as the scoring estimator. Returns
    the reduced matrix and the boolean support mask.
    """
    forest = RandomForestClassifier(n_estimators=100)
    eliminator = RFE(forest, n_features_to_select=n_features)
    reduced = eliminator.fit_transform(X, y)
    return reduced, eliminator.support_
2.3 嵌入法(Embedded Methods)
在模型训练过程中进行特征选择。
from sklearn.linear_model import Lasso
def embedded_feature_selection(X, y, alpha=1.0):
    """Select features via L1 regularization (Lasso).

    Returns a boolean mask: True where the fitted coefficient is
    non-zero, i.e. the feature survived the L1 penalty.
    """
    model = Lasso(alpha=alpha)
    model.fit(X, y)
    return model.coef_ != 0
3. 特征提取
3.1 主成分分析(PCA)
from sklearn.decomposition import PCA
def apply_pca(X, n_components):
    """Project X onto its first n_components principal components.

    Returns the projected data and the fraction of total variance each
    retained component explains.
    """
    reducer = PCA(n_components=n_components)
    projected = reducer.fit_transform(X)
    return projected, reducer.explained_variance_ratio_
# 可视化PCA结果
def plot_pca_components(pca, feature_names):
    """Visualize PCA loadings as a heatmap (components x original features).

    `pca` is a fitted sklearn PCA instance; `feature_names` labels the
    heatmap columns.
    """
    import matplotlib.pyplot as plt
    # pd and sns were referenced but never imported in the original snippet
    import pandas as pd
    import seaborn as sns
    plt.figure(figsize=(10, 6))
    components = pd.DataFrame(
        pca.components_,
        columns=feature_names
    )
    sns.heatmap(components, cmap='coolwarm')
    plt.title('PCA Components')
    plt.show()
3.2 t-SNE
from sklearn.manifold import TSNE
def apply_tsne(X, n_components=2):
    """Embed X in a low-dimensional space with t-SNE (for visualization)."""
    embedder = TSNE(n_components=n_components)
    return embedder.fit_transform(X)
3.3 自编码器
import tensorflow as tf
def create_autoencoder(input_dim, encoding_dim):
    """Build a single-hidden-layer autoencoder.

    Returns a pair (autoencoder, encoder): the full reconstruction model
    and a model exposing only the bottleneck representation.
    """
    layers = tf.keras.layers
    inputs = layers.Input(shape=(input_dim,))
    # Bottleneck: compress the input down to encoding_dim with ReLU
    bottleneck = layers.Dense(encoding_dim, activation='relu')(inputs)
    # Reconstruction: expand back to the input dimension with sigmoid
    reconstruction = layers.Dense(input_dim, activation='sigmoid')(bottleneck)
    autoencoder = tf.keras.Model(inputs, reconstruction)
    encoder = tf.keras.Model(inputs, bottleneck)
    return autoencoder, encoder
4. 特征变换
4.1 数值特征变换
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
class FeatureTransformer:
    """Static helpers for common numeric feature transformations."""

    @staticmethod
    def standard_scale(X):
        """Zero-mean / unit-variance scaling."""
        scaler = StandardScaler()
        return scaler.fit_transform(X)

    @staticmethod
    def minmax_scale(X):
        """Rescale each feature into the [0, 1] range."""
        scaler = MinMaxScaler()
        return scaler.fit_transform(X)

    @staticmethod
    def robust_scale(X):
        """Median/IQR-based scaling, less sensitive to outliers."""
        scaler = RobustScaler()
        return scaler.fit_transform(X)

    @staticmethod
    def log_transform(X):
        """log(1 + x) transform for right-skewed non-negative features."""
        return np.log1p(X)

    @staticmethod
    def power_transform(X, power):
        """Raise features element-wise to the given power."""
        return np.power(X, power)
4.2 类别特征编码
class CategoryEncoder:
    """Utility methods for encoding categorical features."""

    @staticmethod
    def one_hot_encode(X, columns):
        """One-hot encode the given columns of DataFrame X."""
        return pd.get_dummies(X, columns=columns)

    @staticmethod
    def label_encode(X):
        """Encode a 1-D array/Series of labels as integers 0..n_classes-1."""
        from sklearn.preprocessing import LabelEncoder
        le = LabelEncoder()
        return le.fit_transform(X)

    @staticmethod
    def target_encode(X, y, column):
        """Replace each category in `column` with the mean of the target.

        `y` may be either the NAME of the target column inside X (the
        original behavior) or a separate Series aligned with X's index.
        NOTE: to avoid target leakage, fit the category means on training
        folds only when used inside a validation loop.
        """
        if isinstance(y, str):
            # y names a column of X — original code path
            means = X.groupby(column)[y].mean()
        else:
            # y is an external target aligned with X
            target = pd.Series(y, index=X.index)
            means = target.groupby(X[column]).mean()
        return X[column].map(means)
5. 特征组合
5.1 多项式特征
from sklearn.preprocessing import PolynomialFeatures
def create_polynomial_features(X, degree=2):
    """Expand X with all polynomial and interaction terms up to `degree`."""
    expander = PolynomialFeatures(degree=degree)
    return expander.fit_transform(X)
5.2 交叉特征
def create_interaction_features(df, feature1, feature2, operation='multiply'):
    """Combine two DataFrame columns into one interaction feature.

    operation: one of 'multiply', 'add', 'subtract', 'divide'.
    Raises ValueError for an unknown operation (the original version
    silently fell through and returned None).
    """
    if operation == 'multiply':
        return df[feature1] * df[feature2]
    if operation == 'add':
        return df[feature1] + df[feature2]
    if operation == 'subtract':
        return df[feature1] - df[feature2]
    if operation == 'divide':
        # small epsilon guards against division by zero
        return df[feature1] / (df[feature2] + 1e-7)
    raise ValueError(f"unknown operation: {operation!r}")
6. 特征工程Pipeline
6.1 构建Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
def create_preprocessing_pipeline(numeric_features, categorical_features):
    """Build a ColumnTransformer: scale numeric columns, one-hot encode
    categorical ones.

    Parameters are the lists (or pandas Index objects) of column names
    for each feature type; returns the unfitted preprocessor.
    """
    # OneHotEncoder (and StandardScaler, in this snippet's own scope) were
    # used without being imported in the original code
    from sklearn.preprocessing import OneHotEncoder, StandardScaler
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])
    categorical_transformer = Pipeline(steps=[
        # drop='first' avoids the dummy-variable trap for linear models
        ('onehot', OneHotEncoder(drop='first'))
    ])
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])
    return preprocessor
6.2 自定义Transformer
from sklearn.base import BaseEstimator, TransformerMixin
class CustomFeatureTransformer(BaseEstimator, TransformerMixin):
    """Skeleton for a custom sklearn-compatible transformer.

    Stores a `power` hyper-parameter; `transform` currently returns an
    untouched copy — fill in your own logic where indicated.
    """

    def __init__(self, power=2):
        self.power = power

    def fit(self, X, y=None):
        # Stateless: nothing is learned from the data.
        return self

    def transform(self, X):
        result = X.copy()
        # Add custom transformation logic here
        return result
7. 实战案例:房价预测
7.1 特征工程完整流程
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
def housing_price_prediction():
    """End-to-end feature-engineering + modeling pipeline for house prices.

    Loads housing.csv, preprocesses numeric/categorical columns, trains a
    random forest, and returns (fitted_pipeline, R^2 score on a held-out
    20% test split).
    """
    # 1-2. Load the raw data and split off the target column
    df = pd.read_csv('housing.csv')
    y = df['price']
    X = df.drop('price', axis=1)

    # 3. Infer which columns are numeric vs. categorical
    numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = X.select_dtypes(include=['object']).columns

    # 4-5. Assemble preprocessing and the model into one pipeline
    preprocessor = create_preprocessing_pipeline(numeric_cols, categorical_cols)
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor())
    ])

    # 6. Train on 80% of the data, score on the remaining 20%
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    pipeline.fit(X_train, y_train)
    return pipeline, pipeline.score(X_test, y_test)
常见问题解答
Q: 如何处理高基数类别特征? A: 可以采用以下方法:
- 频率编码
- 目标编码
- 分箱
- 选择top-k类别,其他归为"其他"
Q: 如何处理时间特征? A: 常用方法包括:
- 提取年、月、日、星期等
- 创建周期性特征
- 计算时间差
- 提取时间窗口统计特征
Q: 特征选择和降维有什么区别? A: 主要区别:
- 特征选择保留原始特征的子集
- 降维创建新的特征组合
- 特征选择更易解释
- 降维可能获得更好的表示