Evaluating AI with Haystack
Last updated: July 8, 2025
In this cookbook, we walk through the Evaluators in Haystack, create an evaluation pipeline, and try out different evaluation frameworks such as FlowJudge.
📚 Useful resources:
📺 Watch the livestream
!pip install haystack-ai "sentence-transformers>=3.0.0" pypdf "flow-judge[hf]"
1. Build your pipeline
ARAGOG
This dataset is based on the paper Advanced Retrieval Augmented Generation Output Grading (ARAGOG). It is a collection of papers from ArXiv covering topics around Transformers and Large Language Models, all in PDF format.
The dataset contains:
- 13 PDF papers.
- 107 questions and answers, generated with the help of GPT-4 and validated/corrected by humans.
We have:
- Ground-truth answers
- Questions
Get the dataset here
# Run this to download the dataset
!mkdir -p ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/DetectGPT.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/MMLU_measure.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/PAL.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/bert.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/codenet.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/distilbert.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/glm_130b.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/hellaswag.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/llama.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/llm_long_tail.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/meaning_of_prompt.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/megatron.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/red_teaming.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/roberta.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/superglue.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/task2vec.pdf -P ARAGOG/papers_for_questions
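If you would rather not run 16 separate wget commands, the same files can be fetched in a short loop. This is just a convenience sketch; the file names mirror the commands above.
# Alternative to the wget commands above: download the same papers in a loop
import os
import urllib.request

base_url = "https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions"
papers = [
    "DetectGPT.pdf", "MMLU_measure.pdf", "PAL.pdf", "bert.pdf", "codenet.pdf",
    "distilbert.pdf", "glm_130b.pdf", "hellaswag.pdf", "llama.pdf", "llm_long_tail.pdf",
    "meaning_of_prompt.pdf", "megatron.pdf", "red_teaming.pdf", "roberta.pdf",
    "superglue.pdf", "task2vec.pdf",
]
os.makedirs("ARAGOG/papers_for_questions", exist_ok=True)
for paper in papers:
    urllib.request.urlretrieve(f"{base_url}/{paper}", f"ARAGOG/papers_for_questions/{paper}")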
Indexing pipeline
import os
from haystack import Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.converters import PyPDFToDocument
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.writers import DocumentWriter
from haystack.document_stores.types import DuplicatePolicy
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
document_store = InMemoryDocumentStore()
files_path = "./ARAGOG/papers_for_questions"
pipeline = Pipeline()
pipeline.add_component("converter", PyPDFToDocument())
pipeline.add_component("cleaner", DocumentCleaner())
pipeline.add_component("splitter", DocumentSplitter(split_length=250, split_by="word")) # default splitting by word
pipeline.add_component("writer", DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP))
pipeline.add_component("embedder", SentenceTransformersDocumentEmbedder(model=embedding_model))
pipeline.connect("converter", "cleaner")
pipeline.connect("cleaner", "splitter")
pipeline.connect("splitter", "embedder")
pipeline.connect("embedder", "writer")
pdf_files = [files_path + "/" + f_name for f_name in os.listdir(files_path)]
pipeline.run({"converter": {"sources": pdf_files}})
document_store.count_documents()
691
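As a quick sanity check after indexing, you can peek at one of the stored chunks; with InMemoryDocumentStore, filter_documents() without filters returns all documents.
# Inspect one indexed chunk to verify that splitting and metadata look reasonable
sample_doc = document_store.filter_documents()[0]
print(sample_doc.meta.get("file_path"))
print(sample_doc.content[:200])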
RAG
import os
from getpass import getpass
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass('OPENAI_API_KEY: ')
from haystack import Pipeline
from haystack.components.builders import ChatPromptBuilder
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.generators.chat import OpenAIChatGenerator
from haystack.components.retrievers import InMemoryEmbeddingRetriever
from haystack.dataclasses import ChatMessage
chat_message = ChatMessage.from_user(
    text="""You have to answer the following question based on the given context information only.
If the context is empty or just a '\\n' answer with None, example: "None".
Context:
{% for document in documents %}
{{ document.content }}
{% endfor %}
Question: {{question}}
Answer:
"""
)
basic_rag = Pipeline()
basic_rag.add_component("query_embedder", SentenceTransformersTextEmbedder(
    model=embedding_model, progress_bar=False
))
basic_rag.add_component("retriever", InMemoryEmbeddingRetriever(document_store))
basic_rag.add_component("chat_prompt_builder", ChatPromptBuilder(template=[chat_message], required_variables="*"))
basic_rag.add_component("chat_generator", OpenAIChatGenerator(model="gpt-4o-mini"))
basic_rag.connect("query_embedder", "retriever.query_embedding")
basic_rag.connect("retriever", "chat_prompt_builder.documents")
basic_rag.connect("chat_prompt_builder", "chat_generator")
<haystack.core.pipeline.pipeline.Pipeline object at 0x309fa13d0>
🚅 Components
- query_embedder: SentenceTransformersTextEmbedder
- retriever: InMemoryEmbeddingRetriever
- chat_prompt_builder: ChatPromptBuilder
- chat_generator: OpenAIChatGenerator
🛤️ Connections
- query_embedder.embedding -> retriever.query_embedding (List[float])
- retriever.documents -> chat_prompt_builder.documents (List[Document])
- chat_prompt_builder.prompt -> chat_generator.messages (List[ChatMessage])
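If you are working in a notebook, you can also render the pipeline graph. This step is optional; show() renders the graph via Mermaid and may require an internet connection.
# Optional: visualize the RAG pipeline graph in a notebook
basic_rag.show()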
2. Human evaluation
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/eval_questions.json -P ARAGOG
from typing import List, Tuple
import json
def read_question_answers() -> Tuple[List[str], List[str]]:
    with open("./ARAGOG/eval_questions.json", "r") as f:
        data = json.load(f)
    questions = data["questions"]
    answers = data["ground_truths"]
    return questions, answers
all_questions, all_answers = read_question_answers()
print(len(all_questions))
print(len(all_answers))
107
107
questions = all_questions[:15]
answers = all_answers[:15]
index = 5
print(questions[index])
print(answers[index])
question = questions[index]
How were the questions for the multitask test sourced, and what was the criteria for their inclusion?
Questions were manually collected by graduate and undergraduate students from freely available online sources, including practice questions for standardized tests and undergraduate courses, ensuring a wide representation of difficulty levels and subjects.
basic_rag.run({"query_embedder":{"text":question}, "chat_prompt_builder":{"question": question}})
{'chat_generator': {'replies': [ChatMessage(_role=<ChatRole.ASSISTANT: 'assistant'>, _content=[TextContent(text='The questions for the multitask test were manually collected by graduate and undergraduate students from freely available sources online. These sources included practice questions for tests such as the Graduate Record Examination and the United States Medical Licensing Examination, as well as questions designed for undergraduate courses and readers of Oxford University Press books. The criteria for inclusion involved ensuring that each subject contained a sufficient number of test examples, with each subject having a minimum of 100 test examples. Tasks that were either too challenging for humans without extensive training or too easy for the machine baselines were filtered out.')], _name=None, _meta={'model': 'gpt-4o-mini-2024-07-18', 'index': 0, 'finish_reason': 'stop', 'usage': {'completion_tokens': 110, 'prompt_tokens': 4550, 'total_tokens': 4660, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}})]}}
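For manual evaluation, it helps to see the question, the ground-truth answer, and the generated answer side by side. A minimal sketch:
# Side-by-side view for manual inspection
response = basic_rag.run({"query_embedder": {"text": question}, "chat_prompt_builder": {"question": question}})
print("Question:    ", question)
print("Ground truth:", answers[index])
print("Generated:   ", response["chat_generator"]["replies"][0].text)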
3. Decide on the metrics
- Semantic Answer Similarity: SASEvaluator compares the embedding of a generated answer against the embedding of the ground-truth answer, using a shared embedding model (a standalone usage sketch follows this list).
- ContextRelevanceEvaluator assesses how relevant the retrieved context is for answering the query.
- FaithfulnessEvaluator evaluates whether the generated answer can be derived from the context.
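Each of these evaluators can also be run on its own, outside a pipeline. Here is a minimal sketch with SASEvaluator; warm_up() loads the embedding model, and the example answers are made up for illustration:
from haystack.components.evaluators import SASEvaluator

sas = SASEvaluator(model="sentence-transformers/all-MiniLM-L6-v2")
sas.warm_up()
sas_result = sas.run(
    ground_truth_answers=["Berlin is the capital of Germany."],
    predicted_answers=["The capital of Germany is Berlin."],
)
print(sas_result["score"], sas_result["individual_scores"])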
4. Build the evaluation pipeline
from haystack import Pipeline
from haystack.components.evaluators import ContextRelevanceEvaluator, FaithfulnessEvaluator, SASEvaluator
eval_pipeline = Pipeline()
eval_pipeline.add_component("context_relevance", ContextRelevanceEvaluator(raise_on_failure=False))
eval_pipeline.add_component("faithfulness", FaithfulnessEvaluator(raise_on_failure=False))
eval_pipeline.add_component("sas", SASEvaluator(model=embedding_model))
5. Run the evaluation
Run the RAG pipeline
predicted_answers = []
retrieved_context = []
for question in questions:  # loop over the 15 questions
    result = basic_rag.run(
        {"query_embedder": {"text": question}, "chat_prompt_builder": {"question": question}},
        include_outputs_from={"retriever"},
    )
    predicted_answers.append(result["chat_generator"]["replies"][0].text)
    retrieved_context.append(result["retriever"]["documents"])
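Since generating the answers is the slow part, it can be worth caching the RAG outputs so the evaluation can be re-run cheaply. An optional sketch; the rag_predictions.json file name is made up:
# Optional: cache the RAG outputs so evaluation can be re-run without re-generating
import json

with open("rag_predictions.json", "w") as f:
    json.dump(
        {
            "questions": questions,
            "predicted_answers": predicted_answers,
            "contexts": [[doc.content for doc in docs] for docs in retrieved_context],
        },
        f,
    )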
Run the evaluation pipeline
eval_pipeline_results = eval_pipeline.run(
    {
        "context_relevance": {"questions": questions, "contexts": retrieved_context},
        "faithfulness": {"questions": questions, "contexts": retrieved_context, "predicted_answers": predicted_answers},
        "sas": {"predicted_answers": predicted_answers, "ground_truth_answers": answers},
    }
)
results = {
    "context_relevance": eval_pipeline_results["context_relevance"],
    "faithfulness": eval_pipeline_results["faithfulness"],
    "sas": eval_pipeline_results["sas"],
}
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:10<00:00, 1.43it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:33<00:00, 2.23s/it]
6. Analyze the results
from haystack.evaluation import EvaluationRunResult
inputs = {
    "questions": questions,
    "contexts": retrieved_context,
    "true_answers": answers,
    "predicted_answers": predicted_answers,
}
run_name = "rag_eval"
eval_results = EvaluationRunResult(run_name=run_name, inputs=inputs, results=results)
eval_results.aggregated_report()
{'metrics': ['context_relevance', 'faithfulness', 'sas'],
'score': [0.26666666666666666, 0.7, 0.5344941093275944]}
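Beyond the aggregate scores, EvaluationRunResult can also produce a per-question breakdown; in recent Haystack releases, detailed_report accepts an output_format argument and can return a pandas DataFrame (requires pandas):
# Per-question scores as a DataFrame for easier inspection
detailed = eval_results.detailed_report(output_format="df")
print(detailed.head())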
index = 2
print(eval_pipeline_results['context_relevance']["individual_scores"][index], "\nQuestion:", questions[index],"\nTrue Answer:", answers[index], "\nAnswer:", predicted_answers[index])
print("".join([doc.content for doc in retrieved_context[index]]))
Evaluation frameworks
- For the RagasEvaluator, check out our cookbook RAG pipeline evaluation using RAGAS.
- Here, we'll show how to use FlowJudge.
from flow_judge.integrations.haystack import HaystackFlowJudge
from flow_judge.metrics.presets import RESPONSE_FAITHFULNESS_5POINT
from flow_judge import Hf
model = Hf(flash_attn=False)
flow_judge_evaluator = HaystackFlowJudge(
    metric=RESPONSE_FAITHFULNESS_5POINT,
    model=model,
    progress_bar=True,
    raise_on_failure=True,
    save_results=True,
    fail_on_parse_error=False,
)
str_fj_retrieved_context = []
for context in retrieved_context:
    str_context = [doc.content for doc in context]
    str_fj_retrieved_context.append(" ".join(str_context))  # one flattened context string per question
from haystack import Pipeline
integration_eval_pipeline = Pipeline()
integration_eval_pipeline.add_component("flow_judge_evaluator", flow_judge_evaluator)
eval_framework_pipeline_results = integration_eval_pipeline.run(
    {
        "flow_judge_evaluator": {"query": questions, "context": str_fj_retrieved_context, "response": predicted_answers},
    }
)
eval_framework_pipeline_results
