检索增强生成:构建智能问答系统
RAG 是什么?🤔
检索增强生成(Retrieval-Augmented Generation,RAG)是一种将检索系统与生成式AI模型结合的技术。它通过检索相关信息来增强AI模型的回答能力,使回答更准确、更可靠。
工作原理
from langchain import OpenAI, VectorDBQA
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import TextLoader
class RAGSystem:
    """Minimal RAG pipeline built on LangChain primitives.

    Wires an OpenAI embedding model, an OpenAI LLM and a Chroma vector
    store together to answer questions over loaded documents.
    """

    def __init__(self, api_key):
        """Create the embedding model and LLM; no documents loaded yet."""
        self.embeddings = OpenAIEmbeddings(openai_api_key=api_key)
        self.llm = OpenAI(openai_api_key=api_key)
        self.vector_store = None

    def load_documents(self, file_path):
        """Load a text file, chunk it and index the chunks in Chroma."""
        raw_docs = TextLoader(file_path).load()
        # Overlapping chunks preserve context across chunk boundaries.
        splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        chunks = splitter.split_documents(raw_docs)
        self.vector_store = Chroma.from_documents(chunks, self.embeddings)

    def query(self, question):
        """Answer *question* against the indexed documents.

        Raises:
            ValueError: if no documents have been loaded yet.
        """
        if not self.vector_store:
            raise ValueError("请先加载文档")
        chain = VectorDBQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            vectorstore=self.vector_store,
        )
        return chain.run(question)
核心组件 🔧
1. 文档处理
from datetime import datetime
from typing import List, Dict

import numpy as np
class DocumentProcessor:
    """Split raw text into overlapping chunks and attach per-chunk metadata."""

    def __init__(self, chunk_size=1000, overlap=200):
        # Target chunk length (characters) and how many characters
        # consecutive chunks share.
        self.chunk_size = chunk_size
        self.overlap = overlap

    def process_document(self, content: str) -> List[Dict]:
        """Turn *content* into a list of {'content', 'metadata'} dicts."""
        chunks = self._split_text(content)
        cleaned_chunks = [self._clean_text(chunk) for chunk in chunks]
        return [
            {
                'content': chunk,
                'metadata': self._extract_metadata(chunk),
            }
            for chunk in cleaned_chunks
        ]

    def _split_text(self, text: str) -> List[str]:
        """Split *text* into chunks, preferring sentence boundaries.

        Each chunk is at most ``chunk_size`` characters long and
        consecutive chunks overlap by up to ``overlap`` characters.
        """
        chunks = []
        start = 0
        while start < len(text):
            end = start + self.chunk_size
            if end < len(text):
                # Walk back to a sentence terminator so chunks do not cut
                # sentences in half.
                while end > start and text[end] not in '.!?':
                    end -= 1
                if end == start:
                    # No terminator inside the window: hard cut instead.
                    end = start + self.chunk_size
            chunks.append(text[start:end])
            # BUG FIX: the original `start = end - self.overlap` could move
            # `start` backwards (even negative) when a sentence boundary was
            # found close to `start`, producing duplicate/empty chunks or an
            # infinite loop. Always advance by at least one character.
            start = max(end - self.overlap, start + 1)
        return chunks

    def _clean_text(self, text: str) -> str:
        """Collapse runs of whitespace and normalise full-width punctuation."""
        text = ' '.join(text.split())
        # Map full-width Chinese punctuation to ASCII equivalents.
        text = text.replace(',', ',').replace('。', '.')
        return text

    def _extract_metadata(self, chunk: str) -> Dict:
        """Return simple statistics about *chunk*."""
        return {
            'length': len(chunk),
            'sentences': len([s for s in chunk.split('.') if s]),
            # NOTE: needs `from datetime import datetime` at module level;
            # the original file called `datetime.now()` without importing it.
            'created_at': datetime.now().isoformat(),
        }
2. 向量索引
class VectorIndex:
    """In-memory approximate nearest-neighbour index over document embeddings."""

    def __init__(self, embedding_dim=768):
        self.embedding_dim = embedding_dim  # dimensionality of stored vectors
        self.index = None      # hnswlib.Index, created lazily in build_index
        self.documents = []    # documents aligned with index labels 0..n-1

    def build_index(self, documents: List[Dict], embeddings: List[np.ndarray]):
        """Build an HNSW index over *embeddings*; label i maps to documents[i]."""
        # BUG FIX: the original file used hnswlib without importing it
        # anywhere, raising NameError at runtime. Imported locally so the
        # third-party dependency is only required when an index is built.
        import hnswlib

        self.index = hnswlib.Index(space='cosine', dim=self.embedding_dim)
        self.index.init_index(
            max_elements=len(embeddings),
            ef_construction=200,  # build-time quality/speed trade-off
            M=16,                 # graph connectivity
        )
        self.index.add_items(embeddings, np.arange(len(embeddings)))
        self.documents = documents

    def search(self, query_vector: np.ndarray, k: int = 5) -> List[Dict]:
        """Return up to *k* documents most similar to *query_vector*.

        Raises:
            ValueError: if build_index has not been called yet.
        """
        if self.index is None:
            raise ValueError("索引未构建")
        # ROBUSTNESS: hnswlib fails when k exceeds the number of stored
        # items; clamp instead of erroring on small corpora.
        k = min(k, len(self.documents))
        labels, distances = self.index.knn_query(query_vector, k)
        return [
            {
                'document': self.documents[idx],
                'score': 1 - dist,  # cosine distance -> similarity score
            }
            for idx, dist in zip(labels[0], distances[0])
        ]
3. 上下文构建
class ContextBuilder:
    """Assemble retrieved document texts into a single prompt context."""

    def __init__(self, max_tokens=2000):
        # Rough token budget for the assembled context.
        self.max_tokens = max_tokens

    def build_context(self, retrieved_docs: List[Dict]) -> str:
        """Concatenate document contents until the token budget is reached."""
        selected = []
        used = 0
        for entry in retrieved_docs:
            text = entry['document']['content']
            # Crude token estimate: ~1.3 tokens per whitespace-separated word.
            cost = len(text.split()) * 1.3
            if used + cost > self.max_tokens:
                break
            selected.append(text)
            used += cost
        return self._format_context(selected)

    def _format_context(self, context_parts: List[str]) -> str:
        """Join the selected parts under a header, separated by dividers."""
        return "\n\n相关信息:\n" + "\n---\n".join(context_parts)
4. 响应生成
from typing import Optional
class ResponseGenerator:
    """Generate context-grounded answers with an OpenAI chat model."""

    def __init__(self, model_name="gpt-3.5-turbo"):
        self.model_name = model_name
        # NOTE(review): this usage matches the modern `openai.OpenAI` client,
        # but the file only imports `OpenAI` from langchain — confirm the
        # intended import before running.
        self.client = OpenAI()

    def generate_response(
        self,
        question: str,
        context: str,
        max_tokens: int = 500
    ) -> Dict:
        """Ask the chat model *question* grounded in *context*.

        Returns a dict with the answer text, the model name and the total
        token usage reported by the API.
        """
        user_prompt = self._build_prompt(question, context)
        completion = self.client.chat.completions.create(
            model=self.model_name,
            messages=[
                {"role": "system", "content": "你是一个专业的AI助手,请基于提供的上下文信息回答问题。如果无法从上下文中找到答案,请明确说明。"},
                {"role": "user", "content": user_prompt},
            ],
            max_tokens=max_tokens,
            temperature=0.7,
        )
        choice = completion.choices[0]
        return {
            'answer': choice.message.content,
            'model': self.model_name,
            'tokens_used': completion.usage.total_tokens,
        }

    def _build_prompt(self, question: str, context: str) -> str:
        """Embed *context* and *question* into the QA prompt template."""
        return f"""请基于以下信息回答问题:
{context}
问题:{question}
请提供准确、完整的回答。如果上下文信息不足以回答问题,请明确指出。"""
实践案例 💡
1. 构建文档问答系统
class DocumentQA:
    """End-to-end document QA: load and chunk files, index them, retrieve,
    and generate grounded answers."""

    def __init__(self, api_key: str):
        self.processor = DocumentProcessor()
        # CONSISTENCY FIX: langchain's OpenAIEmbeddings takes
        # `openai_api_key` (as RAGSystem above already does); `api_key`
        # was the wrong keyword.
        self.embedder = OpenAIEmbeddings(openai_api_key=api_key)
        self.vector_index = VectorIndex()
        self.context_builder = ContextBuilder()
        self.generator = ResponseGenerator()

    def load_documents(self, file_paths: List[str]):
        """Read, chunk, embed and index every plain-text file in *file_paths*."""
        all_documents = []
        all_embeddings = []
        for path in file_paths:
            # NOTE(review): assumes UTF-8 plain text; binary formats such as
            # PDF will not decode this way — confirm the intended loader.
            with open(path, 'r', encoding='utf-8') as f:
                content = f.read()
            documents = self.processor.process_document(content)
            embeddings = [
                self.embedder.embed_query(doc['content'])
                for doc in documents
            ]
            all_documents.extend(documents)
            all_embeddings.extend(embeddings)
        # One index over the chunks of all files.
        self.vector_index.build_index(all_documents, all_embeddings)

    def answer_question(self, question: str) -> Dict:
        """Retrieve relevant chunks for *question* and generate an answer.

        Returns a dict with the answer text, 200-character source previews
        and model usage info.
        """
        query_vector = self.embedder.embed_query(question)
        relevant_docs = self.vector_index.search(query_vector)
        context = self.context_builder.build_context(relevant_docs)
        response = self.generator.generate_response(question, context)
        return {
            'answer': response['answer'],
            'sources': [doc['document']['content'][:200] + "..."
                        for doc in relevant_docs],
            'model_info': {
                'model': response['model'],
                'tokens': response['tokens_used']
            }
        }
2. 使用示例
# Example usage of the DocumentQA system.

# Initialise the system (replace with a real OpenAI API key).
qa_system = DocumentQA(api_key="your-api-key")

# Load the knowledge-base documents.
# NOTE(review): load_documents reads files as UTF-8 text; a .pdf will not
# decode this way — confirm the intended loader for binary formats.
qa_system.load_documents([
    "company_policies.txt",
    "product_manual.pdf",
    "faq.md"
])

# Ask a question against the indexed documents.
question = "我们的退货政策是什么?"
response = qa_system.answer_question(question)

# Show the answer, the source snippets and the model usage.
print("回答:", response['answer'])
print("\n来源文档:")
for source in response['sources']:
    print("-", source)
print("\n使用模型:", response['model_info']['model'])
print("消耗Token:", response['model_info']['tokens'])
优化技巧 🚀
1. 提高检索质量
- 合理设置文档分块大小
- 使用重叠来保持上下文
- 优化向量索引参数
2. 增强回答准确性
- 优化提示词工程
- 添加相关性过滤
- 使用多轮对话
3. 性能优化
- 实现结果缓存
- 批量处理文档
- 异步处理请求
注意事项 ⚠️
数据安全
- 保护敏感信息
- 实现访问控制
- 定期更新文档
成本控制
- 优化Token使用
- 实现缓存机制
- 监控API调用
系统维护
- 定期更新索引
- 监控系统性能
- 处理异常情况
小结 📝
本章我们深入学习了RAG技术:
基础概念
- RAG原理
- 核心组件
- 工作流程
实现细节
- 文档处理
- 向量索引
- 上下文构建
- 响应生成
实践应用
- 系统构建
- 优化技巧
- 注意事项
下一章,我们将探讨向量数据库的更多实际应用场景!