关联分析在购物篮分析、商品推荐等领域有重要应用
关联分析详解
学习目标
完成本节后,你将能够:
- 理解关联规则挖掘的基本概念
- 掌握主要的关联分析算法
- 实现和评估关联规则
- 处理实际的关联分析问题
- 应用关联规则进行推荐
先修知识
学习本节内容需要:
- Python编程基础
- 数据结构基础
- 基本的统计学知识
- 数据预处理技能
关联分析基础
基本概念
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
# 生成示例数据
def generate_transaction_data(n_transactions=1000):
"""
生成购物交易数据
"""
# 定义商品列表
items = ['面包', '牛奶', '黄油', '啤酒', '尿布']
# 生成交易
transactions = []
for _ in range(n_transactions):
# 随机选择2-4件商品
n_items = np.random.randint(2, 5)
transaction = np.random.choice(items, n_items,
replace=False)
transactions.append(list(transaction))
return pd.DataFrame(transactions)
# 转换为one-hot编码
def convert_to_binary(df):
"""
将交易数据转换为二进制矩阵
"""
# 获取所有唯一商品
all_items = []
for items in df.values:
all_items.extend(items)
unique_items = list(set([x for x in all_items if pd.notna(x)]))
# 创建二进制矩阵
binary_data = pd.DataFrame(index=range(len(df)),
columns=unique_items)
for i, transaction in enumerate(df.values):
for item in transaction:
if pd.notna(item):
binary_data.loc[i, item] = 1
return binary_data.fillna(0)
Apriori算法
算法实现
def apriori_algorithm(transactions, min_support=0.1):
"""
实现Apriori算法
参数:
transactions: 交易数据
min_support: 最小支持度
"""
# 转换为二进制矩阵
binary_data = convert_to_binary(transactions)
# 使用apriori算法找出频繁项集
frequent_itemsets = apriori(binary_data,
min_support=min_support,
use_colnames=True)
# 生成关联规则
rules = association_rules(frequent_itemsets,
metric="confidence",
min_threshold=0.5)
return frequent_itemsets, rules
# 使用示例
transactions = generate_transaction_data()
itemsets, rules = apriori_algorithm(transactions)
print("\n频繁项集:")
print(itemsets)
print("\n关联规则:")
print(rules)
评估指标
def evaluate_rules(rules):
"""
评估关联规则的质量
"""
# 计算提升度
rules['lift'] = rules.apply(lambda x:
x['confidence'] / (
support_dict[frozenset(x['consequents'])]
), axis=1)
# 计算杠杆率
rules['leverage'] = rules.apply(lambda x:
x['support'] - (
support_dict[frozenset(x['antecedents'])] *
support_dict[frozenset(x['consequents'])]
), axis=1)
# 计算确信度
rules['conviction'] = rules.apply(lambda x:
(1 - support_dict[frozenset(x['consequents'])]) /
(1 - x['confidence'])
if x['confidence'] < 1 else float('inf'), axis=1)
return rules
FP-Growth算法
算法实现
from mlxtend.frequent_patterns import fpgrowth
def fp_growth_algorithm(transactions, min_support=0.1):
"""
实现FP-Growth算法
"""
# 转换为二进制矩阵
binary_data = convert_to_binary(transactions)
# 使用FP-Growth算法找出频繁项集
frequent_itemsets = fpgrowth(binary_data,
min_support=min_support,
use_colnames=True)
# 生成关联规则
rules = association_rules(frequent_itemsets,
metric="confidence",
min_threshold=0.5)
return frequent_itemsets, rules
# 比较Apriori和FP-Growth的性能
def compare_algorithms(transactions):
"""
比较不同算法的性能
"""
import time
# Apriori算法
start_time = time.time()
apriori_itemsets, _ = apriori_algorithm(transactions)
apriori_time = time.time() - start_time
# FP-Growth算法
start_time = time.time()
fpgrowth_itemsets, _ = fp_growth_algorithm(transactions)
fpgrowth_time = time.time() - start_time
print(f"Apriori耗时: {apriori_time:.2f}秒")
print(f"FP-Growth耗时: {fpgrowth_time:.2f}秒")
实战项目:商品推荐系统
数据准备
def prepare_retail_data():
"""
准备零售数据集
"""
# 生成更复杂的交易数据
n_transactions = 5000
items = [
'面包', '牛奶', '黄油', '啤酒', '尿布',
'薯片', '可乐', '巧克力', '饼干', '果汁'
]
transactions = []
for _ in range(n_transactions):
# 添加一些购买模式
if np.random.random() < 0.3:
# 婴儿用品套装
transaction = ['尿布', '牛奶']
if np.random.random() < 0.7:
transaction.append('面包')
elif np.random.random() < 0.5:
# 零食套装
transaction = ['薯片', '可乐']
if np.random.random() < 0.6:
transaction.append('巧克力')
else:
# 随机购买
n_items = np.random.randint(2, 5)
transaction = list(np.random.choice(items,
n_items,
replace=False))
transactions.append(transaction)
return pd.DataFrame(transactions)
商品推荐
def recommend_products(rules, items, top_n=5):
"""
基于购买的商品推荐其他商品
参数:
rules: 关联规则
items: 已购买的商品列表
top_n: 推荐商品数量
"""
# 找出符合条件的规则
relevant_rules = rules[rules['antecedents'].apply(
lambda x: x.issubset(items))]
# 按照提升度排序
relevant_rules = relevant_rules.sort_values('lift',
ascending=False)
# 获取推荐商品
recommendations = []
for _, rule in relevant_rules.iterrows():
consequents = list(rule['consequents'])
for item in consequents:
if item not in items and item not in recommendations:
recommendations.append(item)
if len(recommendations) >= top_n:
break
if len(recommendations) >= top_n:
break
return recommendations
# 使用示例
retail_data = prepare_retail_data()
itemsets, rules = apriori_algorithm(retail_data,
min_support=0.01)
# 推荐商品
basket = ['面包', '牛奶']
recommendations = recommend_products(rules, basket)
print(f"\n基于{basket}的推荐商品:")
print(recommendations)
练习与作业
基础练习:
- 实现简单的Apriori算法
- 计算支持度和置信度
- 生成关联规则
进阶练习:
- 实现FP-Growth算法
- 优化算法性能
- 处理大规模数据集
项目实践:
- 构建商品推荐系统
- 分析用户购买模式
- 评估推荐效果
常见问题
Q1: 如何选择最小支持度和置信度? A1: 需要考虑以下因素:
- 数据集大小
- 项集的稀疏程度
- 业务需求
- 计算资源限制
Q2: 如何处理大规模数据集? A2: 可以采用以下方法:
- 使用FP-Growth算法
- 数据采样
- 并行计算
- 增量式更新
扩展阅读
下一步学习
- 序列模式挖掘
- 时序关联分析
- 多层关联规则
- 可视化关联规则