Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 469474d

Browse files
Merge pull request #122 from Azure-Samples/testeval6
Add seed parameter (optional) and custom evaluation metric for citations overlap
2 parents 02ed71a + da05c77 commit 469474d

File tree

6 files changed

+43
-3
lines changed

6 files changed

+43
-3
lines changed

‎evals/eval_config.json

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
{
22
"testdata_path": "ground_truth.jsonl",
33
"results_dir": "results/experiment<TIMESTAMP>",
4-
"requested_metrics": ["gpt_groundedness", "gpt_relevance", "answer_length", "latency", "citation_match"],
4+
"requested_metrics": ["gpt_groundedness", "gpt_relevance", "answer_length", "latency", "citations_matched"],
55
"target_url": "http://127.0.0.1:8000/chat",
66
"target_parameters": {
77
"overrides": {
88
"use_advanced_flow": true,
99
"top": 3,
1010
"retrieval_mode": "hybrid",
11-
"temperature": 0.3
11+
"temperature": 0.3,
12+
"seed": 42
1213
}
1314
},
1415
"target_response_answer_jmespath": "message.content",

‎evals/evaluate.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,44 @@
11
import argparse
22
import logging
33
import os
4+
import re
45
from pathlib import Path
56
from typing import Any
67

78
import azure.identity
89
from dotenv import load_dotenv
910
from evaltools.eval.evaluate import run_evaluate_from_config
11+
from evaltools.eval.evaluate_metrics import register_metric
12+
from evaltools.eval.evaluate_metrics.base_metric import BaseMetric
1013
from rich.logging import RichHandler
1114

1215
logger = logging.getLogger("ragapp")
1316

1417

18+
class CitationsMatchedMetric(BaseMetric):
    """Custom metric: fraction of ground-truth citations (markers like ``[3]``)
    that also appear in the model response.

    Returns a per-row dict ``{"citations_matched": float}`` in [0, 1],
    or ``-1`` as a sentinel when the response is missing.
    """

    METRIC_NAME = "citations_matched"

    @classmethod
    def evaluator_fn(cls, **kwargs):
        def citations_overlap(*, response, ground_truth, **kwargs):
            if response is None:
                # Use the registered metric name in the log so it matches eval_config.json
                logger.warning(
                    "Received response of None, can't compute citations_matched metric. Setting to -1."
                )
                return {cls.METRIC_NAME: -1}
            truth_citations = set(re.findall(r"\[(\d+)\]", ground_truth))
            response_citations = set(re.findall(r"\[(\d+)\]", response))
            num_citations = len(truth_citations)
            if num_citations == 0:
                # No citations expected: nothing can be missed, so treat as a full match
                # (avoids ZeroDivisionError on citation-free ground truth).
                return {cls.METRIC_NAME: 1.0}
            # Percentage of ground-truth citations present in the response
            num_matched_citations = len(truth_citations & response_citations)
            return {cls.METRIC_NAME: num_matched_citations / num_citations}

        return citations_overlap

    @classmethod
    def get_aggregate_stats(cls, df):
        # Drop rows where the metric could not be computed (the -1 sentinel)
        # before averaging, so failures don't drag down the mean.
        df = df[df[cls.METRIC_NAME] != -1]
        return {"mean": round(df[cls.METRIC_NAME].mean(), 2)}
40+
41+
1542
def get_openai_config() -> dict:
1643
openai_config: dict[str, Any]
1744
if os.environ.get("OPENAI_CHAT_HOST") == "azure":
@@ -60,6 +87,7 @@ def get_openai_config() -> dict:
6087

6188
openai_config = get_openai_config()
6289

90+
register_metric(CitationsMatchedMetric)
6391
run_evaluate_from_config(
6492
working_dir=Path(__file__).parent,
6593
config_path="eval_config.json",

‎src/backend/fastapi_app/api_models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ class ChatRequestOverrides(BaseModel):
2828
retrieval_mode: RetrievalMode = RetrievalMode.HYBRID
2929
use_advanced_flow: bool = True
3030
prompt_template: str | None = None
31+
seed: int | None = None
3132

3233

3334
class ChatRequestContext(BaseModel):

‎src/backend/fastapi_app/rag_advanced.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,11 @@ def __init__(
3535
self.chat_token_limit = get_token_limit(chat_model, default_to_minimum=True)
3636

3737
async def generate_search_query(
38-
self, original_user_query: str, past_messages: list[ChatCompletionMessageParam], query_response_token_limit: int
38+
self,
39+
original_user_query: str,
40+
past_messages: list[ChatCompletionMessageParam],
41+
query_response_token_limit: int,
42+
seed: int | None = None,
3943
) -> tuple[list[ChatCompletionMessageParam], Any | str | None, list]:
4044
"""Generate an optimized keyword search query based on the chat history and the last question"""
4145

@@ -63,6 +67,7 @@ async def generate_search_query(
6367
n=1,
6468
tools=tools,
6569
tool_choice=tool_choice,
70+
seed=seed,
6671
)
6772

6873
query_text, filters = extract_search_arguments(original_user_query, chat_completion)
@@ -76,6 +81,7 @@ async def prepare_context(
7681
original_user_query=chat_params.original_user_query,
7782
past_messages=chat_params.past_messages,
7883
query_response_token_limit=500,
84+
seed=chat_params.seed,
7985
)
8086

8187
# Retrieve relevant rows from the database with the GPT optimized query
@@ -142,6 +148,7 @@ async def answer(
142148
max_tokens=chat_params.response_token_limit,
143149
n=1,
144150
stream=False,
151+
seed=chat_params.seed,
145152
)
146153

147154
return RetrievalResponse(

‎src/backend/fastapi_app/rag_base.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ def get_params(self, messages: list[ChatCompletionMessageParam], overrides: Chat
3636
return ChatParams(
3737
top=overrides.top,
3838
temperature=overrides.temperature,
39+
seed=overrides.seed,
3940
retrieval_mode=overrides.retrieval_mode,
4041
use_advanced_flow=overrides.use_advanced_flow,
4142
response_token_limit=response_token_limit,

‎src/backend/fastapi_app/rag_simple.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ async def answer(
9090
max_tokens=chat_params.response_token_limit,
9191
n=1,
9292
stream=False,
93+
seed=chat_params.seed,
9394
)
9495

9596
return RetrievalResponse(
@@ -130,6 +131,7 @@ async def answer_stream(
130131
max_tokens=chat_params.response_token_limit,
131132
n=1,
132133
stream=True,
134+
seed=chat_params.seed,
133135
)
134136

135137
yield RetrievalResponseDelta(

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /