From b41f55964cd69363c0fe16aee2a1140172038a2a Mon Sep 17 00:00:00 2001 From: jayy-77 <1427jay@gmail.com> Date: Sat, 14 Feb 2026 19:36:56 +0530 Subject: [PATCH] Enhance evaluation functionality with support for multiple runs and dynamic config discovery --- src/google/adk/cli/cli_eval.py | 28 +++++++-- src/google/adk/cli/cli_tools_click.py | 62 ++++++++++++++++---- src/google/adk/evaluation/agent_evaluator.py | 16 ++++- src/google/adk/evaluation/eval_config.py | 11 ++++ 4 files changed, 99 insertions(+), 18 deletions(-) diff --git a/src/google/adk/cli/cli_eval.py b/src/google/adk/cli/cli_eval.py index 33c1693208..3da1eb0e04 100644 --- a/src/google/adk/cli/cli_eval.py +++ b/src/google/adk/cli/cli_eval.py @@ -14,6 +14,7 @@ from __future__ import annotations +import asyncio import importlib.util import logging import os @@ -24,7 +25,7 @@ import click from google.genai import types as genai_types -from ..agents.llm_agent import Agent +from ..agents.base_agent import BaseAgent from ..evaluation.base_eval_service import BaseEvalService from ..evaluation.base_eval_service import EvaluateConfig from ..evaluation.base_eval_service import EvaluateRequest @@ -86,11 +87,28 @@ def get_default_metric_info( ) -def get_root_agent(agent_module_file_path: str) -> Agent: - """Returns root agent given the agent module.""" +def get_root_agent(agent_module_file_path: str) -> BaseAgent: + """Returns root agent given the agent module. + + Supports modules exporting either `root_agent` or `get_agent_async`. + """ agent_module = _get_agent_module(agent_module_file_path) - root_agent = agent_module.agent.root_agent - return root_agent + agent_module_with_agent = ( + agent_module.agent if hasattr(agent_module, "agent") else agent_module + ) + if hasattr(agent_module_with_agent, "root_agent"): + return agent_module_with_agent.root_agent + + if hasattr(agent_module_with_agent, "get_agent_async"): + result = asyncio.run(agent_module_with_agent.get_agent_async()) + if isinstance(result, tuple): + root_agent, _ = result + return root_agent + return result + + raise ValueError( + "Module does not have a root_agent or get_agent_async method." + ) def try_get_reset_func(agent_module_file_path: str) -> Any: diff --git a/src/google/adk/cli/cli_tools_click.py b/src/google/adk/cli/cli_tools_click.py index 5b5d3e5c82..fd74e00821 100644 --- a/src/google/adk/cli/cli_tools_click.py +++ b/src/google/adk/cli/cli_tools_click.py @@ -709,6 +709,13 @@ def wrapper(*args, **kwargs): ) @click.argument("eval_set_file_path_or_id", nargs=-1) @click.option("--config_file_path", help="Optional. The path to config file.") +@click.option( + "--num_runs", + type=click.IntRange(min=1), + default=1, + show_default=True, + help="Optional. Number of times to run each eval case.", +) @click.option( "--print_detailed_results", is_flag=True, @@ -721,6 +728,7 @@ def cli_eval( agent_module_file_path: str, eval_set_file_path_or_id: list[str], config_file_path: str, + num_runs: int, print_detailed_results: bool, eval_storage_uri: Optional[str] = None, log_level: str = "INFO", @@ -789,6 +797,7 @@ def cli_eval( from ..evaluation.base_eval_service import InferenceRequest from ..evaluation.custom_metric_evaluator import _CustomMetricEvaluator from ..evaluation.eval_config import get_eval_metrics_from_config + from ..evaluation.eval_config import discover_eval_config_for_test_file from ..evaluation.eval_config import get_evaluation_criteria_or_default from ..evaluation.eval_result import EvalCaseResult from ..evaluation.evaluator import EvalStatus @@ -808,9 +817,12 @@ def cli_eval( except ModuleNotFoundError as mnf: raise click.ClickException(MISSING_EVAL_DEPENDENCIES_MESSAGE) from mnf - eval_config = get_evaluation_criteria_or_default(config_file_path) - print(f"Using evaluation criteria: {eval_config}") - eval_metrics = get_eval_metrics_from_config(eval_config) + eval_metrics_by_eval_set_id = {} + global_eval_metrics = None + if config_file_path: + eval_config = get_evaluation_criteria_or_default(config_file_path) + print(f"Using evaluation criteria: {eval_config}") + global_eval_metrics = get_eval_metrics_from_config(eval_config) root_agent = get_root_agent(agent_module_file_path) app_name = os.path.basename(agent_module_file_path) @@ -854,6 +866,18 @@ def cli_eval( f"`{eval_set_file_path}` should be a valid eval set file." ) from fne + eval_config_for_eval_set = ( + get_evaluation_criteria_or_default(config_file_path) + if config_file_path + else discover_eval_config_for_test_file(eval_set_file_path) + ) + print( + f"Using evaluation criteria for {eval_set_file_path}:" + f" {eval_config_for_eval_set}" + ) + eval_metrics_by_eval_set_id[eval_set.eval_set_id] = ( + get_eval_metrics_from_config(eval_config_for_eval_set) + ) eval_sets_manager.create_eval_set( app_name=app_name, eval_set_id=eval_set.eval_set_id ) @@ -873,6 +897,10 @@ def cli_eval( ) else: # We assume that what we have are eval set ids instead. + if global_eval_metrics is None: + eval_config = get_evaluation_criteria_or_default(config_file_path) + print(f"Using evaluation criteria: {eval_config}") + global_eval_metrics = get_eval_metrics_from_config(eval_config) eval_sets_manager = ( eval_sets_manager if eval_storage_uri @@ -888,6 +916,7 @@ def cli_eval( inference_config=InferenceConfig(), ) ) + eval_metrics_by_eval_set_id[eval_set_id_key] = global_eval_metrics user_simulator_provider = UserSimulatorProvider( user_simulator_config=eval_config.user_simulator_config @@ -920,18 +949,31 @@ def cli_eval( metric_evaluator_registry=metric_evaluator_registry, ) + repeated_inference_requests = inference_requests * num_runs inference_results = asyncio.run( _collect_inferences( - inference_requests=inference_requests, eval_service=eval_service - ) - ) - eval_results = asyncio.run( - _collect_eval_results( - inference_results=inference_results, + inference_requests=repeated_inference_requests, eval_service=eval_service, - eval_metrics=eval_metrics, ) ) + eval_results = [] + for eval_set_id, eval_metrics in eval_metrics_by_eval_set_id.items(): + inference_results_for_eval_set = [ + inference_result + for inference_result in inference_results + if inference_result.eval_set_id == eval_set_id + ] + if not inference_results_for_eval_set: + continue + eval_results.extend( + asyncio.run( + _collect_eval_results( + inference_results=inference_results_for_eval_set, + eval_service=eval_service, + eval_metrics=eval_metrics, + ) + ) + ) except ModuleNotFoundError as mnf: raise click.ClickException(MISSING_EVAL_DEPENDENCIES_MESSAGE) from mnf diff --git a/src/google/adk/evaluation/agent_evaluator.py b/src/google/adk/evaluation/agent_evaluator.py index c0fc736340..42597167b7 100644 --- a/src/google/adk/evaluation/agent_evaluator.py +++ b/src/google/adk/evaluation/agent_evaluator.py @@ -38,6 +38,7 @@ from .eval_case import IntermediateDataType from .eval_case import Invocation from .eval_config import EvalConfig +from .eval_config import discover_eval_config_for_test_file from .eval_config import get_eval_metrics_from_config from .eval_config import get_evaluation_criteria_or_default from .eval_metrics import BaseCriterion @@ -46,6 +47,7 @@ from .eval_metrics import PrebuiltMetrics from .eval_result import EvalCaseResult from .eval_set import EvalSet +from .eval_set_results_manager import EvalSetResultsManager from .eval_sets_manager import EvalSetsManager from .evaluator import EvalStatus from .in_memory_eval_sets_manager import InMemoryEvalSetsManager @@ -100,9 +102,7 @@ class AgentEvaluator: @staticmethod def find_config_for_test_file(test_file: str) -> EvalConfig: """Find the test_config.json file in the same folder as the test file.""" - test_folder = os.path.dirname(test_file) - config_path = os.path.join(test_folder, "test_config.json") - return get_evaluation_criteria_or_default(config_path) + return discover_eval_config_for_test_file(test_file) @staticmethod async def evaluate_eval_set( @@ -113,6 +113,7 @@ async def evaluate_eval_set( num_runs: int = NUM_RUNS, agent_name: Optional[str] = None, print_detailed_results: bool = True, + eval_set_results_manager: Optional[EvalSetResultsManager] = None, ): """Evaluates an agent using the given EvalSet. @@ -130,6 +131,8 @@ async def evaluate_eval_set( than root agent. If left empty or none, then root agent is evaluated. print_detailed_results: Whether to print detailed results for each metric evaluation. + eval_set_results_manager: Optional results manager for persisting eval + outputs. """ if criteria: logger.warning( @@ -161,6 +164,7 @@ async def evaluate_eval_set( eval_metrics=eval_metrics, num_runs=num_runs, user_simulator_provider=user_simulator_provider, + eval_set_results_manager=eval_set_results_manager, ) # Step 2: Post-process the results! @@ -200,6 +204,7 @@ async def evaluate( agent_name: Optional[str] = None, initial_session_file: Optional[str] = None, print_detailed_results: bool = True, + eval_set_results_manager: Optional[EvalSetResultsManager] = None, ): """Evaluates an Agent given eval data. @@ -218,6 +223,8 @@ async def evaluate( needed by all the evals in the eval dataset. print_detailed_results: Whether to print detailed results for each metric evaluation. + eval_set_results_manager: Optional results manager for persisting eval + outputs. """ test_files = [] if isinstance(eval_dataset_file_path_or_dir, str) and os.path.isdir( @@ -245,6 +252,7 @@ async def evaluate( num_runs=num_runs, agent_name=agent_name, print_detailed_results=print_detailed_results, + eval_set_results_manager=eval_set_results_manager, ) @staticmethod @@ -536,6 +544,7 @@ async def _get_eval_results_by_eval_id( eval_metrics: list[EvalMetric], num_runs: int, user_simulator_provider: UserSimulatorProvider, + eval_set_results_manager: Optional[EvalSetResultsManager] = None, ) -> dict[str, list[EvalCaseResult]]: """Returns EvalCaseResults grouped by eval case id. @@ -560,6 +569,7 @@ async def _get_eval_results_by_eval_id( app_name=app_name, eval_set=eval_set ), user_simulator_provider=user_simulator_provider, + eval_set_results_manager=eval_set_results_manager, ) inference_requests = [ diff --git a/src/google/adk/evaluation/eval_config.py b/src/google/adk/evaluation/eval_config.py index ead2303ceb..e953ffb30c 100644 --- a/src/google/adk/evaluation/eval_config.py +++ b/src/google/adk/evaluation/eval_config.py @@ -180,6 +180,17 @@ def get_evaluation_criteria_or_default( return _DEFAULT_EVAL_CONFIG +def discover_eval_config_for_test_file(test_file_path: str) -> EvalConfig: + """Returns EvalConfig for a test file via adjacent test_config.json lookup. + + The lookup checks for a `test_config.json` in the same directory as the test + file, and falls back to the default criteria if not found. + """ + test_folder = os.path.dirname(test_file_path) + config_path = os.path.join(test_folder, "test_config.json") + return get_evaluation_criteria_or_default(config_path) + + def get_eval_metrics_from_config(eval_config: EvalConfig) -> list[EvalMetric]: """Returns a list of EvalMetrics mapped from the EvalConfig.""" eval_metric_list = []