From 3bc784a94c5ffe5949c2f34d631c6a0be9da05b4 Mon Sep 17 00:00:00 2001 From: A Vertex SDK engineer Date: Sat, 28 Feb 2026 15:19:36 -0800 Subject: [PATCH] fix: Only include CandidateResponse if a response is present PiperOrigin-RevId: 876784701 --- .../replays/test_create_evaluation_run.py | 57 +++++++++++++++---- vertexai/_genai/_evals_common.py | 21 ++++--- 2 files changed, 56 insertions(+), 22 deletions(-) diff --git a/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py b/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py index cd97ab042c..18a05b642c 100644 --- a/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py +++ b/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py @@ -63,7 +63,9 @@ ) ), ) - +INFERENCE_CONFIG = types.EvaluationRunInferenceConfig( + model="projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash" +) def test_create_eval_run_data_source_evaluation_set(client): """Tests that create_evaluation_run() creates a correctly structured EvaluationRun.""" @@ -189,9 +191,6 @@ def test_create_eval_run_data_source_bigquery_request_set(client): def test_create_eval_run_with_inference_configs(client): """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with inference_configs.""" client._api_client._http_options.api_version = "v1beta1" - inference_config = types.EvaluationRunInferenceConfig( - model="projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash" - ) evaluation_run = client.evals.create_evaluation_run( name="test_inference_config", display_name="test_inference_config", @@ -200,7 +199,7 @@ def test_create_eval_run_with_inference_configs(client): ), dest=GCS_DEST, metrics=[GENERAL_QUALITY_METRIC], - inference_configs={"model_1": inference_config}, + inference_configs={"model_1": INFERENCE_CONFIG}, labels={"label1": "value1"}, ) assert isinstance(evaluation_run, types.EvaluationRun) @@ -216,7 +215,7 @@ def test_create_eval_run_with_inference_configs(client): ), metrics=[GENERAL_QUALITY_METRIC], ) - assert evaluation_run.inference_configs["model_1"] == inference_config + assert evaluation_run.inference_configs["model_1"] == INFERENCE_CONFIG assert evaluation_run.labels == { "label1": "value1", } @@ -318,6 +317,45 @@ def test_create_eval_run_with_inference_configs(client): # ) # assert evaluation_run.error is None +import pandas as pd + +def test_create_eval_run_data_source_evaluation_dataset_inference_config(client): + """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with EvaluationDataset.""" + input_df = pd.DataFrame( + { + "prompt": ["prompt1", "prompt2"], + "reference": ["reference1", "reference2"], + } + ) + evaluation_run = client.evals.create_evaluation_run( + name="test9", + display_name="test9", + dataset=types.EvaluationDataset( + candidate_name="candidate_1", + eval_dataset_df=input_df, + ), + dest=GCS_DEST, + metrics=[GENERAL_QUALITY_METRIC], + inference_configs={"candidate_1": INFERENCE_CONFIG}, + ) + assert isinstance(evaluation_run, types.EvaluationRun) + assert evaluation_run.display_name == "test9" + assert evaluation_run.state == types.EvaluationRunState.PENDING + assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource) + # Check evaluation set + assert evaluation_run.data_source.evaluation_set + eval_set = client.evals.get_evaluation_set( + name=evaluation_run.data_source.evaluation_set + ) + assert len(eval_set.evaluation_items) == 2 + assert evaluation_run.inference_configs["candidate_1"] == INFERENCE_CONFIG + # Check evaluation items + for i, eval_item_name in enumerate(eval_set.evaluation_items): + eval_item = client.evals.get_evaluation_item(name=eval_item_name) + assert eval_item.evaluation_item_type == types.EvaluationItemType.REQUEST + assert eval_item.evaluation_request.prompt.text == input_df.iloc[i]["prompt"] + assert eval_item.evaluation_request.candidate_responses == [] + assert evaluation_run.error is None pytest_plugins = ("pytest_asyncio",) @@ -370,9 +408,6 @@ async def test_create_eval_run_async(client): async def test_create_eval_run_async_with_inference_configs(client): """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with inference_configs asynchronously.""" client._api_client._http_options.api_version = "v1beta1" - inference_config = types.EvaluationRunInferenceConfig( - model="projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash" - ) evaluation_run = await client.aio.evals.create_evaluation_run( name="test_inference_config_async", display_name="test_inference_config_async", @@ -381,7 +416,7 @@ async def test_create_eval_run_async_with_inference_configs(client): ), dest=GCS_DEST, metrics=[GENERAL_QUALITY_METRIC], - inference_configs={"model_1": inference_config}, + inference_configs={"model_1": INFERENCE_CONFIG}, labels={"label1": "value1"}, ) assert isinstance(evaluation_run, types.EvaluationRun) @@ -397,7 +432,7 @@ async def test_create_eval_run_async_with_inference_configs(client): ), metrics=[GENERAL_QUALITY_METRIC], ) - assert evaluation_run.inference_configs["model_1"] == inference_config + assert evaluation_run.inference_configs["model_1"] == INFERENCE_CONFIG assert evaluation_run.labels == { "label1": "value1", } diff --git a/vertexai/_genai/_evals_common.py b/vertexai/_genai/_evals_common.py index 0bc28994ed..a0e986612b 100644 --- a/vertexai/_genai/_evals_common.py +++ b/vertexai/_genai/_evals_common.py @@ -1962,6 +1962,15 @@ def _create_evaluation_set_from_dataframe( for event in row[_evals_constant.INTERMEDIATE_EVENTS]: if CONTENT in event: intermediate_events.append(event[CONTENT]) + candidate_responses = [] + if _evals_constant.RESPONSE in row: + candidate_responses.append( + types.CandidateResponse( + candidate=candidate_name or "Candidate 1", + text=row[_evals_constant.RESPONSE], + events=intermediate_events or None, + ) + ) eval_item_requests.append( types.EvaluationItemRequest( prompt=( @@ -1974,17 +1983,7 @@ def _create_evaluation_set_from_dataframe( if _evals_constant.REFERENCE in row else None ), - candidate_responses=[ - types.CandidateResponse( - candidate=candidate_name or "Candidate 1", - text=row.get(_evals_constant.RESPONSE, None), - events=( - intermediate_events - if len(intermediate_events) > 0 - else None - ), - ) - ], + candidate_responses=candidate_responses, ) ) logger.info("Writing evaluation item requests to GCS.")