Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 469474d

Browse files
Merge pull request #122 from Azure-Samples/testeval6
Add seed parameter (optional) and custom evaluation metric for citations overlap
2 parents 02ed71a + da05c77 commit 469474d

File tree

6 files changed

+43
-3
lines changed

6 files changed

+43
-3
lines changed

‎evals/eval_config.json

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
{
22
"testdata_path": "ground_truth.jsonl",
33
"results_dir": "results/experiment<TIMESTAMP>",
4-
"requested_metrics": ["gpt_groundedness", "gpt_relevance", "answer_length", "latency", "citation_match"],
4+
"requested_metrics": ["gpt_groundedness", "gpt_relevance", "answer_length", "latency", "citations_matched"],
55
"target_url": "http://127.0.0.1:8000/chat",
66
"target_parameters": {
77
"overrides": {
88
"use_advanced_flow": true,
99
"top": 3,
1010
"retrieval_mode": "hybrid",
11-
"temperature": 0.3
11+
"temperature": 0.3,
12+
"seed": 42
1213
}
1314
},
1415
"target_response_answer_jmespath": "message.content",

‎evals/evaluate.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,44 @@
11
import argparse
22
import logging
33
import os
4+
import re
45
from pathlib import Path
56
from typing import Any
67

78
import azure.identity
89
from dotenv import load_dotenv
910
from evaltools.eval.evaluate import run_evaluate_from_config
11+
from evaltools.eval.evaluate_metrics import register_metric
12+
from evaltools.eval.evaluate_metrics.base_metric import BaseMetric
1013
from rich.logging import RichHandler
1114

1215
logger = logging.getLogger("ragapp")
1316

1417

18+
class CitationsMatchedMetric(BaseMetric):
    """Custom metric: fraction of ground-truth citations (markers like ``[3]``)
    that also appear in the model response.

    Returns a per-row dict ``{"citations_matched": float}`` in [0, 1],
    or ``-1`` as a sentinel when the response is missing.
    """

    METRIC_NAME = "citations_matched"

    @classmethod
    def evaluator_fn(cls, **kwargs):
        def citations_overlap(*, response, ground_truth, **kwargs):
            if response is None:
                # Use the registered metric name in the log so it matches eval_config.json
                logger.warning(
                    "Received response of None, can't compute citations_matched metric. Setting to -1."
                )
                return {cls.METRIC_NAME: -1}
            truth_citations = set(re.findall(r"\[(\d+)\]", ground_truth))
            response_citations = set(re.findall(r"\[(\d+)\]", response))
            num_citations = len(truth_citations)
            if num_citations == 0:
                # No citations expected: nothing can be missed, so treat as a full match
                # (avoids ZeroDivisionError on citation-free ground truth).
                return {cls.METRIC_NAME: 1.0}
            # Percentage of ground-truth citations present in the response
            num_matched_citations = len(truth_citations & response_citations)
            return {cls.METRIC_NAME: num_matched_citations / num_citations}

        return citations_overlap

    @classmethod
    def get_aggregate_stats(cls, df):
        # Drop rows where the metric could not be computed (the -1 sentinel)
        # before averaging, so failures don't drag down the mean.
        df = df[df[cls.METRIC_NAME] != -1]
        return {"mean": round(df[cls.METRIC_NAME].mean(), 2)}
40+
41+
1542
def get_openai_config() -> dict:
1643
openai_config: dict[str, Any]
1744
if os.environ.get("OPENAI_CHAT_HOST") == "azure":
@@ -60,6 +87,7 @@ def get_openai_config() -> dict:
6087

6188
openai_config = get_openai_config()
6289

90+
register_metric(CitationsMatchedMetric)
6391
run_evaluate_from_config(
6492
working_dir=Path(__file__).parent,
6593
config_path="eval_config.json",

‎src/backend/fastapi_app/api_models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ class ChatRequestOverrides(BaseModel):
2828
retrieval_mode: RetrievalMode = RetrievalMode.HYBRID
2929
use_advanced_flow: bool = True
3030
prompt_template: str | None = None
31+
seed: int | None = None
3132

3233

3334
class ChatRequestContext(BaseModel):

‎src/backend/fastapi_app/rag_advanced.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,11 @@ def __init__(
3535
self.chat_token_limit = get_token_limit(chat_model, default_to_minimum=True)
3636

3737
async def generate_search_query(
38-
self, original_user_query: str, past_messages: list[ChatCompletionMessageParam], query_response_token_limit: int
38+
self,
39+
original_user_query: str,
40+
past_messages: list[ChatCompletionMessageParam],
41+
query_response_token_limit: int,
42+
seed: int | None = None,
3943
) -> tuple[list[ChatCompletionMessageParam], Any | str | None, list]:
4044
"""Generate an optimized keyword search query based on the chat history and the last question"""
4145

@@ -63,6 +67,7 @@ async def generate_search_query(
6367
n=1,
6468
tools=tools,
6569
tool_choice=tool_choice,
70+
seed=seed,
6671
)
6772

6873
query_text, filters = extract_search_arguments(original_user_query, chat_completion)
@@ -76,6 +81,7 @@ async def prepare_context(
7681
original_user_query=chat_params.original_user_query,
7782
past_messages=chat_params.past_messages,
7883
query_response_token_limit=500,
84+
seed=chat_params.seed,
7985
)
8086

8187
# Retrieve relevant rows from the database with the GPT optimized query
@@ -142,6 +148,7 @@ async def answer(
142148
max_tokens=chat_params.response_token_limit,
143149
n=1,
144150
stream=False,
151+
seed=chat_params.seed,
145152
)
146153

147154
return RetrievalResponse(

‎src/backend/fastapi_app/rag_base.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ def get_params(self, messages: list[ChatCompletionMessageParam], overrides: Chat
3636
return ChatParams(
3737
top=overrides.top,
3838
temperature=overrides.temperature,
39+
seed=overrides.seed,
3940
retrieval_mode=overrides.retrieval_mode,
4041
use_advanced_flow=overrides.use_advanced_flow,
4142
response_token_limit=response_token_limit,

‎src/backend/fastapi_app/rag_simple.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ async def answer(
9090
max_tokens=chat_params.response_token_limit,
9191
n=1,
9292
stream=False,
93+
seed=chat_params.seed,
9394
)
9495

9596
return RetrievalResponse(
@@ -130,6 +131,7 @@ async def answer_stream(
130131
max_tokens=chat_params.response_token_limit,
131132
n=1,
132133
stream=True,
134+
seed=chat_params.seed,
133135
)
134136

135137
yield RetrievalResponseDelta(

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /