使用假设文档嵌入 (HyDE) 改进检索
最后更新:2025年4月30日
📚 本食谱附带一篇完整的教程文章 《使用 HyDE 优化检索》
在本食谱中,我们将构建 Haystack 组件,使我们能够轻松地将 HyDE 整合到我们的 RAG 管道中,以优化检索。
要了解更多关于 HyDE 以及何时使用它的信息,请查看我们的 假设文档嵌入(HyDE)指南
安装要求
!pip install haystack-ai sentence-transformers datasets
在接下来的部分中,我们将使用 OpenAIGenerator,因此我们需要提供我们的 API 密钥 👇
from getpass import getpass
import os
# Prompt interactively for the OpenAI API key and expose it through the
# environment variable that OpenAIGenerator reads by default.
os.environ["OPENAI_API_KEY"] = getpass("Enter your openAI key:")
构建用于假设文档嵌入的管道
我们将构建一个 Haystack 管道来生成“虚假”文档。对于这部分,我们将使用带有一个 PromptBuilder 的 OpenAIGenerator,该 PromptBuilder 指导模型生成段落。
from haystack.components.generators.openai import OpenAIGenerator
from haystack.components.builders import PromptBuilder

# LLM that drafts the hypothetical answers. n=5 requests five completions per
# prompt, so one generator call yields five candidate paragraphs.
_generation_kwargs = {"n": 5, "temperature": 0.75, "max_tokens": 400}
generator = OpenAIGenerator(model="gpt-4o-mini", generation_kwargs=_generation_kwargs)

# Prompt instructing the model to answer the question as a single paragraph.
template = """Given a question, generate a paragraph of text that answers the question.
Question: {{question}}
Paragraph:"""
prompt_builder = PromptBuilder(template=template)
接下来,我们使用 OutputAdapter 将生成的段落转换为 Document 列表。这样,我们就可以使用 SentenceTransformersDocumentEmbedder 来创建嵌入,因为该组件期望 List[Document]。
from haystack import Document
from haystack.components.converters import OutputAdapter
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from typing import List


def _to_documents(replies):
    """Wrap each generated paragraph string in a Haystack Document."""
    return [Document(content=reply) for reply in replies]


# The adapter turns the generator's string replies into List[Document],
# which is the input type SentenceTransformersDocumentEmbedder expects.
adapter = OutputAdapter(
    template="{{answers | build_doc}}",
    output_type=List[Document],
    custom_filters={"build_doc": _to_documents},
)

embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
embedder.warm_up()
最后,我们创建一个自定义组件 HypotheticalDocumentEmbedder,它期望 documents 并返回 hypothetical_embedding,即“假设”(虚假)文档嵌入的平均值。要了解更多关于此技术及其用途的信息,请查看我们的 HyDE 指南
from numpy import array, mean
from haystack import component


@component
class HypotheticalDocumentEmbedder:
    """Average the embeddings of the "hypothetical" documents into one query vector."""

    @component.output_types(hypothetical_embedding=List[float])
    def run(self, documents: List[Document]):
        """Return the element-wise mean of the document embeddings.

        :param documents: embedded hypothetical documents; each must carry a
            populated ``.embedding`` (produced by the upstream embedder).
        :returns: dict with key ``hypothetical_embedding`` holding the averaged vector.
        :raises ValueError: if ``documents`` is empty (previously this produced a
            silent NaN result from ``mean`` over an empty array).
        """
        if not documents:
            raise ValueError("HypotheticalDocumentEmbedder received no documents to average.")
        stacked_embeddings = array([doc.embedding for doc in documents])
        avg_embedding = mean(stacked_embeddings, axis=0)
        # mean(axis=0) already yields a 1-D vector; the original
        # reshape((1, n)) followed by [0] was a redundant round-trip.
        return {"hypothetical_embedding": avg_embedding.tolist()}
我们将所有组件添加到管道中,以生成一个假设文档嵌入 🚀👇
from haystack import Pipeline

hyde = HypotheticalDocumentEmbedder()

# Assemble: prompt -> LLM replies -> Documents -> embeddings -> averaged HyDE vector.
pipeline = Pipeline()
for component_name, instance in (
    ("prompt_builder", prompt_builder),
    ("generator", generator),
    ("adapter", adapter),
    ("embedder", embedder),
    ("hyde", hyde),
):
    pipeline.add_component(name=component_name, instance=instance)

for sender, receiver in (
    ("prompt_builder", "generator"),
    ("generator.replies", "adapter.answers"),
    ("adapter.output", "embedder.documents"),
    ("embedder.documents", "hyde.documents"),
):
    pipeline.connect(sender, receiver)

query = "What should I do if I have a fever?"
result = pipeline.run(data={"prompt_builder": {"question": query}})
print(result["hyde"])
构建一个封装了整个逻辑的 HyDE 组件
本节将向您展示如何创建一个 HypotheticalDocumentEmbedder,该组件封装了整个逻辑,并允许我们提供嵌入模型作为可选参数。
这个“超级”组件做了几件事情:
- 允许用户选择生成假设文档的 LLM
- 允许用户使用 nr_completions 定义应创建多少个文档
- 允许用户定义他们想要用于生成 HyDE 嵌入的嵌入模型。
from typing import Dict, Any, List

from numpy import array, mean

from haystack import Pipeline, component, Document, default_to_dict, default_from_dict
from haystack.components.converters import OutputAdapter
from haystack.components.embedders.sentence_transformers_document_embedder import SentenceTransformersDocumentEmbedder
from haystack.components.generators.openai import OpenAIGenerator
from haystack.components.builders import PromptBuilder
from haystack.utils import Secret
from haystack.utils import deserialize_secrets_inplace
@component
class HypotheticalDocumentEmbedder:
    """Self-contained HyDE embedder.

    Wraps prompt building, LLM generation, reply-to-Document conversion, and
    embedding into one component: given a query string, it returns a single
    vector that is the average of the embeddings of ``nr_completions``
    LLM-generated ("hypothetical") answer paragraphs.
    """

    def __init__(
        self,
        instruct_llm: str = "gpt-4o-mini",
        instruct_llm_api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"),
        nr_completions: int = 5,
        embedder_model: str = "sentence-transformers/all-MiniLM-L6-v2",
    ):
        """
        :param instruct_llm: OpenAI model used to write the hypothetical answers.
        :param instruct_llm_api_key: Secret holding the OpenAI API key.
        :param nr_completions: number of hypothetical documents to generate
            (passed as the ``n`` generation kwarg).
        :param embedder_model: SentenceTransformers model used to embed them.
        """
        self.instruct_llm = instruct_llm
        self.instruct_llm_api_key = instruct_llm_api_key
        self.nr_completions = nr_completions
        self.embedder_model = embedder_model

        self.generator = OpenAIGenerator(
            api_key=self.instruct_llm_api_key,
            model=self.instruct_llm,
            generation_kwargs={"n": self.nr_completions, "temperature": 0.75, "max_tokens": 400},
        )
        self.prompt_builder = PromptBuilder(
            template="""Given a question, generate a paragraph of text that answers the question.
Question: {{question}}
Paragraph:
"""
        )
        self.adapter = OutputAdapter(
            template="{{answers | build_doc}}",
            output_type=List[Document],
            custom_filters={"build_doc": lambda data: [Document(content=d) for d in data]},
        )
        self.embedder = SentenceTransformersDocumentEmbedder(model=embedder_model, progress_bar=False)
        self.embedder.warm_up()

        # Internal pipeline: prompt -> LLM replies -> Documents -> embeddings.
        self.pipeline = Pipeline()
        self.pipeline.add_component(name="prompt_builder", instance=self.prompt_builder)
        self.pipeline.add_component(name="generator", instance=self.generator)
        self.pipeline.add_component(name="adapter", instance=self.adapter)
        self.pipeline.add_component(name="embedder", instance=self.embedder)
        self.pipeline.connect("prompt_builder", "generator")
        self.pipeline.connect("generator.replies", "adapter.answers")
        self.pipeline.connect("adapter.output", "embedder.documents")

    def to_dict(self) -> Dict[str, Any]:
        """Serialize this component to a dictionary.

        The API-key ``Secret`` is serialized via ``Secret.to_dict()`` (the
        Haystack convention): the original code passed the raw ``Secret``
        object to ``default_to_dict``, which made the output
        non-JSON-serializable and non-round-trippable.
        """
        data = default_to_dict(
            self,
            instruct_llm=self.instruct_llm,
            instruct_llm_api_key=self.instruct_llm_api_key.to_dict(),
            nr_completions=self.nr_completions,
            embedder_model=self.embedder_model,
        )
        data["pipeline"] = self.pipeline.to_dict()
        return data

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "HypotheticalDocumentEmbedder":
        """Rebuild the component from ``to_dict()`` output.

        Restores the ``Secret`` in the init parameters before delegating to
        ``default_from_dict``, then swaps in the serialized internal pipeline.
        """
        deserialize_secrets_inplace(data["init_parameters"], keys=["instruct_llm_api_key"])
        hyde_obj = default_from_dict(cls, data)
        hyde_obj.pipeline = Pipeline.from_dict(data["pipeline"])
        return hyde_obj

    @component.output_types(hypothetical_embedding=List[float])
    def run(self, query: str):
        """Generate hypothetical answers for ``query`` and return the average
        of their embeddings as a single query vector.

        :param query: the user question to expand via HyDE.
        :returns: dict with key ``hypothetical_embedding`` (List[float]).
        """
        result = self.pipeline.run(data={"prompt_builder": {"question": query}})
        documents = result["embedder"]["documents"]
        stacked_embeddings = array([doc.embedding for doc in documents])
        avg_embedding = mean(stacked_embeddings, axis=0)
        # mean(axis=0) is already 1-D; the original reshape((1, n)) + [0]
        # round-trip was redundant.
        return {"hypothetical_embedding": avg_embedding.tolist()}
使用 HyDE 进行检索
让我们看看如何在完整的管道中使用这个组件。首先,我们将一些文档索引到 InMemoryDocumentStore 中。
from datasets import load_dataset, Dataset
from haystack import Pipeline, Document
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore
# Embedding model shared by the indexing pipeline and the HyDE query embedder
# (both sides must use the same model for the vectors to be comparable).
embedder_model = "sentence-transformers/all-MiniLM-L6-v2"
def index_docs(data: Dataset):
    """Clean, split, embed, and write the dataset's ``train`` split into an
    InMemoryDocumentStore; return the populated store."""
    store = InMemoryDocumentStore()

    # Indexing pipeline: clean -> split into 10-sentence chunks -> embed -> write.
    indexing = Pipeline()
    indexing.add_component("cleaner", DocumentCleaner())
    indexing.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=10))
    indexing.add_component("embedder", SentenceTransformersDocumentEmbedder(model=embedder_model))
    indexing.add_component("writer", DocumentWriter(document_store=store, policy="skip"))

    indexing.connect("cleaner", "splitter")
    indexing.connect("splitter", "embedder")
    indexing.connect("embedder", "writer")

    documents = [Document.from_dict(raw) for raw in data["train"]]
    indexing.run({"cleaner": {"documents": documents}})
    return store
# Download the Game of Thrones dataset from the Hugging Face Hub and index it
# (this runs the embedding model over every split document, so it takes a while).
data = load_dataset("Tuana/game-of-thrones")
doc_store = index_docs(data)
现在我们可以运行一个检索管道,它不仅仅基于查询嵌入进行检索,而是使用 HypotheticalDocumentEmbedder 根据我们的 query 创建假设文档嵌入,并使用这些新的嵌入来检索文档。
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
def retriever_with_hyde(doc_store):
    """Build a retrieval pipeline whose query embedding is produced by HyDE
    (averaged hypothetical-document embeddings) instead of embedding the raw query."""
    pipe = Pipeline()
    pipe.add_component(
        instance=HypotheticalDocumentEmbedder(instruct_llm="gpt-4o-mini", nr_completions=5),
        name="query_embedder",
    )
    pipe.add_component(
        instance=InMemoryEmbeddingRetriever(document_store=doc_store),
        name="retriever",
    )
    pipe.connect("query_embedder.hypothetical_embedding", "retriever.query_embedding")
    return pipe
# Run HyDE retrieval: the query is expanded into hypothetical answer paragraphs,
# their embeddings are averaged, and that vector retrieves the top 5 documents.
retrieval_pipeline = retriever_with_hyde(doc_store)
query = "Who is Araya Stark?"
retrieval_pipeline.run(data={"query_embedder": {"query": query}, "retriever": {"top_k": 5}})
