diff --git a/tests/unit/score/test_console_scorer_printer.py b/tests/unit/score/test_console_scorer_printer.py new file mode 100644 index 000000000..fc7d1e64f --- /dev/null +++ b/tests/unit/score/test_console_scorer_printer.py @@ -0,0 +1,366 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +from unittest.mock import MagicMock, patch + +import pytest +from colorama import Fore, Style + +from pyrit.identifiers import ComponentIdentifier +from pyrit.score.printer.console_scorer_printer import ConsoleScorerPrinter +from pyrit.score.scorer_evaluation.scorer_metrics import ( + HarmScorerMetrics, + ObjectiveScorerMetrics, +) + + +def _make_scorer_identifier( + *, + class_name: str = "TestScorer", + params: dict | None = None, + children: dict | None = None, +) -> ComponentIdentifier: + return ComponentIdentifier( + class_name=class_name, + class_module="pyrit.score.test_scorer", + params=params or {}, + children=children or {}, + ) + + +def _make_objective_metrics(**overrides) -> ObjectiveScorerMetrics: + defaults = { + "num_responses": 100, + "num_human_raters": 3, + "accuracy": 0.92, + "accuracy_standard_error": 0.02, + "f1_score": 0.91, + "precision": 0.93, + "recall": 0.90, + "average_score_time_seconds": 0.3, + } + defaults.update(overrides) + return ObjectiveScorerMetrics(**defaults) + + +def _make_harm_metrics(**overrides) -> HarmScorerMetrics: + defaults = { + "num_responses": 50, + "num_human_raters": 2, + "mean_absolute_error": 0.08, + "mae_standard_error": 0.01, + "t_statistic": 1.5, + "p_value": 0.13, + "krippendorff_alpha_combined": 0.85, + "krippendorff_alpha_model": 0.82, + "average_score_time_seconds": 0.8, + } + defaults.update(overrides) + return HarmScorerMetrics(**defaults) + + +# --- __init__ tests --- + + +def test_init_default_values(): + printer = ConsoleScorerPrinter() + assert printer._indent == " " + assert printer._enable_colors is True + + +def test_init_custom_indent(): + printer = ConsoleScorerPrinter(indent_size=4) + assert printer._indent == " " + + +def test_init_zero_indent(): + printer = ConsoleScorerPrinter(indent_size=0) + assert printer._indent == "" + + +def test_init_negative_indent_raises(): + with pytest.raises(ValueError, match="indent_size must be non-negative"): + ConsoleScorerPrinter(indent_size=-1) + + +def test_init_colors_disabled(): + printer = ConsoleScorerPrinter(enable_colors=False) + assert printer._enable_colors is False + + +# --- _print_colored tests --- + + +def test_print_colored_with_colors_enabled(capsys): + printer = ConsoleScorerPrinter(enable_colors=True) + printer._print_colored("hello", Fore.GREEN) + captured = capsys.readouterr() + assert "hello" in captured.out + assert Style.RESET_ALL in captured.out + + +def test_print_colored_with_colors_disabled(capsys): + printer = ConsoleScorerPrinter(enable_colors=False) + printer._print_colored("hello", Fore.GREEN) + captured = capsys.readouterr() + assert captured.out.strip() == "hello" + assert Style.RESET_ALL not in captured.out + + +def test_print_colored_no_colors_arg(capsys): + printer = ConsoleScorerPrinter(enable_colors=True) + printer._print_colored("plain text") + captured = capsys.readouterr() + assert captured.out.strip() == "plain text" + + +# --- _get_quality_color tests --- + + +def test_quality_color_higher_is_better_good(): + printer = ConsoleScorerPrinter() + color = printer._get_quality_color(0.95, higher_is_better=True, good_threshold=0.9, bad_threshold=0.7) + assert color == Fore.GREEN + + +def test_quality_color_higher_is_better_bad(): + printer = ConsoleScorerPrinter() + color = printer._get_quality_color(0.5, higher_is_better=True, good_threshold=0.9, bad_threshold=0.7) + assert color == Fore.RED + + +def test_quality_color_higher_is_better_middle(): + printer = ConsoleScorerPrinter() + color = printer._get_quality_color(0.8, higher_is_better=True, good_threshold=0.9, bad_threshold=0.7) + assert color == Fore.CYAN + + +def test_quality_color_lower_is_better_good(): + printer = ConsoleScorerPrinter() + color = printer._get_quality_color(0.05, higher_is_better=False, good_threshold=0.1, bad_threshold=0.25) + assert color == Fore.GREEN + + +def test_quality_color_lower_is_better_bad(): + printer = ConsoleScorerPrinter() + color = printer._get_quality_color(0.3, higher_is_better=False, good_threshold=0.1, bad_threshold=0.25) + assert color == Fore.RED + + +def test_quality_color_lower_is_better_middle(): + printer = ConsoleScorerPrinter() + color = printer._get_quality_color(0.15, higher_is_better=False, good_threshold=0.1, bad_threshold=0.25) + assert color == Fore.CYAN + + +# --- _print_scorer_info tests --- + + +def test_print_scorer_info_basic(capsys): + printer = ConsoleScorerPrinter(enable_colors=False) + identifier = _make_scorer_identifier(class_name="SelfAskScaleScorer") + printer._print_scorer_info(identifier, indent_level=2) + output = capsys.readouterr().out + assert "SelfAskScaleScorer" in output + + +def test_print_scorer_info_with_display_params(capsys): + printer = ConsoleScorerPrinter(enable_colors=False) + identifier = _make_scorer_identifier( + class_name="TestScorer", + params={"scorer_type": "likert", "score_aggregator": "mean", "hidden_param": "ignore"}, + ) + printer._print_scorer_info(identifier, indent_level=2) + output = capsys.readouterr().out + assert "scorer_type" in output + assert "score_aggregator" in output + assert "hidden_param" not in output + + +def test_print_scorer_info_with_prompt_target_child(capsys): + printer = ConsoleScorerPrinter(enable_colors=False) + target_id = ComponentIdentifier( + class_name="OpenAIChatTarget", + class_module="pyrit.prompt_target", + params={"model_name": "gpt-4", "temperature": "0.0", "extra": "skip"}, + ) + identifier = _make_scorer_identifier( + children={"prompt_target": target_id}, + ) + printer._print_scorer_info(identifier, indent_level=2) + output = capsys.readouterr().out + assert "gpt-4" in output + assert "extra" not in output + + +def test_print_scorer_info_with_sub_scorers(capsys): + printer = ConsoleScorerPrinter(enable_colors=False) + sub1 = _make_scorer_identifier(class_name="SubScorer1") + sub2 = _make_scorer_identifier(class_name="SubScorer2") + identifier = _make_scorer_identifier( + class_name="CompositeScorer", + children={"sub_scorers": [sub1, sub2]}, + ) + printer._print_scorer_info(identifier, indent_level=2) + output = capsys.readouterr().out + assert "Composite of 2 scorer(s)" in output + assert "SubScorer1" in output + assert "SubScorer2" in output + + +# --- _print_objective_metrics tests --- + + +def test_print_objective_metrics_none(capsys): + printer = ConsoleScorerPrinter(enable_colors=False) + printer._print_objective_metrics(None) + output = capsys.readouterr().out + assert "Official evaluation has not been run yet" in output + + +def test_print_objective_metrics_full(capsys): + printer = ConsoleScorerPrinter(enable_colors=False) + metrics = _make_objective_metrics() + printer._print_objective_metrics(metrics) + output = capsys.readouterr().out + assert "Accuracy" in output + assert "F1 Score" in output + assert "Precision" in output + assert "Recall" in output + assert "Average Score Time" in output + + +def test_print_objective_metrics_optional_fields_none(capsys): + printer = ConsoleScorerPrinter(enable_colors=False) + metrics = _make_objective_metrics( + accuracy_standard_error=None, + f1_score=None, + precision=None, + recall=None, + average_score_time_seconds=None, + ) + printer._print_objective_metrics(metrics) + output = capsys.readouterr().out + assert "Accuracy" in output + assert "F1 Score" not in output + assert "Precision" not in output + assert "Recall" not in output + assert "Average Score Time" not in output + + +# --- _print_harm_metrics tests --- + + +def test_print_harm_metrics_none(capsys): + printer = ConsoleScorerPrinter(enable_colors=False) + printer._print_harm_metrics(None) + output = capsys.readouterr().out + assert "Official evaluation has not been run yet" in output + + +def test_print_harm_metrics_full(capsys): + printer = ConsoleScorerPrinter(enable_colors=False) + metrics = _make_harm_metrics() + printer._print_harm_metrics(metrics) + output = capsys.readouterr().out + assert "Mean Absolute Error" in output + assert "Krippendorff Alpha (Combined)" in output + assert "Krippendorff Alpha (Model)" in output + assert "Average Score Time" in output + + +def test_print_harm_metrics_optional_fields_none(capsys): + printer = ConsoleScorerPrinter(enable_colors=False) + metrics = _make_harm_metrics( + mae_standard_error=None, + krippendorff_alpha_combined=None, + krippendorff_alpha_model=None, + average_score_time_seconds=None, + ) + printer._print_harm_metrics(metrics) + output = capsys.readouterr().out + assert "Mean Absolute Error" in output + assert "MAE Std Error" not in output + assert "Krippendorff Alpha (Combined)" not in output + assert "Krippendorff Alpha (Model)" not in output + assert "Average Score Time" not in output + + +# --- print_objective_scorer tests --- + + +@patch("pyrit.score.scorer_evaluation.scorer_metrics_io.find_objective_metrics_by_eval_hash") +@patch("pyrit.identifiers.evaluation_identifier.ScorerEvaluationIdentifier") +def test_print_objective_scorer_with_metrics(mock_eval_id_cls, mock_find, capsys): + printer = ConsoleScorerPrinter(enable_colors=False) + identifier = _make_scorer_identifier(class_name="MyScorer") + metrics = _make_objective_metrics() + + mock_eval_instance = MagicMock() + mock_eval_instance.eval_hash = "abc123" + mock_eval_id_cls.return_value = mock_eval_instance + mock_find.return_value = metrics + + printer.print_objective_scorer(scorer_identifier=identifier) + output = capsys.readouterr().out + + assert "Scorer Information" in output + assert "MyScorer" in output + assert "Accuracy" in output + mock_find.assert_called_once_with(eval_hash="abc123") + + +@patch("pyrit.score.scorer_evaluation.scorer_metrics_io.find_objective_metrics_by_eval_hash") +@patch("pyrit.identifiers.evaluation_identifier.ScorerEvaluationIdentifier") +def test_print_objective_scorer_no_metrics(mock_eval_id_cls, mock_find, capsys): + printer = ConsoleScorerPrinter(enable_colors=False) + identifier = _make_scorer_identifier() + + mock_eval_instance = MagicMock() + mock_eval_instance.eval_hash = "xyz" + mock_eval_id_cls.return_value = mock_eval_instance + mock_find.return_value = None + + printer.print_objective_scorer(scorer_identifier=identifier) + output = capsys.readouterr().out + assert "Official evaluation has not been run yet" in output + + +# --- print_harm_scorer tests --- + + +@patch("pyrit.score.scorer_evaluation.scorer_metrics_io.find_harm_metrics_by_eval_hash") +@patch("pyrit.identifiers.evaluation_identifier.ScorerEvaluationIdentifier") +def test_print_harm_scorer_with_metrics(mock_eval_id_cls, mock_find, capsys): + printer = ConsoleScorerPrinter(enable_colors=False) + identifier = _make_scorer_identifier(class_name="HarmScorer") + metrics = _make_harm_metrics() + + mock_eval_instance = MagicMock() + mock_eval_instance.eval_hash = "harm_hash" + mock_eval_id_cls.return_value = mock_eval_instance + mock_find.return_value = metrics + + printer.print_harm_scorer(identifier, harm_category="hate_speech") + output = capsys.readouterr().out + + assert "Scorer Information" in output + assert "HarmScorer" in output + assert "Mean Absolute Error" in output + mock_find.assert_called_once_with(eval_hash="harm_hash", harm_category="hate_speech") + + +@patch("pyrit.score.scorer_evaluation.scorer_metrics_io.find_harm_metrics_by_eval_hash") +@patch("pyrit.identifiers.evaluation_identifier.ScorerEvaluationIdentifier") +def test_print_harm_scorer_no_metrics(mock_eval_id_cls, mock_find, capsys): + printer = ConsoleScorerPrinter(enable_colors=False) + identifier = _make_scorer_identifier() + + mock_eval_instance = MagicMock() + mock_eval_instance.eval_hash = "no_data" + mock_eval_id_cls.return_value = mock_eval_instance + mock_find.return_value = None + + printer.print_harm_scorer(identifier, harm_category="violence") + output = capsys.readouterr().out + assert "Official evaluation has not been run yet" in output diff --git a/tests/unit/score/test_metrics_type.py b/tests/unit/score/test_metrics_type.py new file mode 100644 index 000000000..53276fc4a --- /dev/null +++ b/tests/unit/score/test_metrics_type.py @@ -0,0 +1,47 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +from pyrit.score.scorer_evaluation.metrics_type import MetricsType, RegistryUpdateBehavior + + +def test_metrics_type_harm_value(): + assert MetricsType.HARM.value == "harm" + + +def test_metrics_type_objective_value(): + assert MetricsType.OBJECTIVE.value == "objective" + + +def test_metrics_type_members(): + members = list(MetricsType) + assert len(members) == 2 + assert MetricsType.HARM in members + assert MetricsType.OBJECTIVE in members + + +def test_metrics_type_from_value(): + assert MetricsType("harm") is MetricsType.HARM + assert MetricsType("objective") is MetricsType.OBJECTIVE + + +def test_registry_update_behavior_skip_if_exists(): + assert RegistryUpdateBehavior.SKIP_IF_EXISTS.value == "skip_if_exists" + + +def test_registry_update_behavior_always_update(): + assert RegistryUpdateBehavior.ALWAYS_UPDATE.value == "always_update" + + +def test_registry_update_behavior_never_update(): + assert RegistryUpdateBehavior.NEVER_UPDATE.value == "never_update" + + +def test_registry_update_behavior_members(): + members = list(RegistryUpdateBehavior) + assert len(members) == 3 + + +def test_registry_update_behavior_from_value(): + assert RegistryUpdateBehavior("skip_if_exists") is RegistryUpdateBehavior.SKIP_IF_EXISTS + assert RegistryUpdateBehavior("always_update") is RegistryUpdateBehavior.ALWAYS_UPDATE + assert RegistryUpdateBehavior("never_update") is RegistryUpdateBehavior.NEVER_UPDATE diff --git a/tests/unit/score/test_score_aggregator_result.py b/tests/unit/score/test_score_aggregator_result.py new file mode 100644 index 000000000..6212fc42e --- /dev/null +++ b/tests/unit/score/test_score_aggregator_result.py @@ -0,0 +1,110 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +from dataclasses import FrozenInstanceError + +import pytest + +from pyrit.score.score_aggregator_result import ScoreAggregatorResult + + +def test_init_with_bool_value(): + result = ScoreAggregatorResult( + value=True, + description="All passed", + rationale="All scores were true", + category=["safety"], + metadata={"count": 3}, + ) + assert result.value is True + assert result.description == "All passed" + assert result.rationale == "All scores were true" + assert result.category == ["safety"] + assert result.metadata == {"count": 3} + + +def test_init_with_float_value(): + result = ScoreAggregatorResult( + value=0.75, + description="High score", + rationale="Average was above threshold", + category=["harm", "violence"], + metadata={"mean": 0.75, "std": 0.1}, + ) + assert result.value == 0.75 + assert result.description == "High score" + assert result.category == ["harm", "violence"] + assert result.metadata == {"mean": 0.75, "std": 0.1} + + +def test_init_with_empty_category_and_metadata(): + result = ScoreAggregatorResult( + value=False, + description="No matches", + rationale="", + category=[], + metadata={}, + ) + assert result.category == [] + assert result.metadata == {} + assert result.rationale == "" + + +def test_frozen_cannot_set_value(): + result = ScoreAggregatorResult( + value=True, + description="test", + rationale="test", + category=[], + metadata={}, + ) + with pytest.raises(FrozenInstanceError): + result.value = False # type: ignore[misc] + + +def test_frozen_cannot_set_description(): + result = ScoreAggregatorResult( + value=0.5, + description="original", + rationale="test", + category=[], + metadata={}, + ) + with pytest.raises(FrozenInstanceError): + result.description = "changed" # type: ignore[misc] + + +def test_equality_same_values(): + r1 = ScoreAggregatorResult(value=True, description="d", rationale="r", category=["c"], metadata={"k": 1}) + r2 = ScoreAggregatorResult(value=True, description="d", rationale="r", category=["c"], metadata={"k": 1}) + assert r1 == r2 + + +def test_inequality_different_values(): + r1 = ScoreAggregatorResult(value=True, description="d", rationale="r", category=[], metadata={}) + r2 = ScoreAggregatorResult(value=False, description="d", rationale="r", category=[], metadata={}) + assert r1 != r2 + + +def test_inequality_different_description(): + r1 = ScoreAggregatorResult(value=0.5, description="a", rationale="r", category=[], metadata={}) + r2 = ScoreAggregatorResult(value=0.5, description="b", rationale="r", category=[], metadata={}) + assert r1 != r2 + + +def test_slots_no_dict(): + result = ScoreAggregatorResult(value=True, description="d", rationale="r", category=[], metadata={}) + assert not hasattr(result, "__dict__") + + +def test_metadata_with_mixed_types(): + result = ScoreAggregatorResult( + value=0.9, + description="mixed", + rationale="test", + category=["a"], + metadata={"name": "scorer1", "count": 5, "threshold": 0.8}, + ) + assert result.metadata["name"] == "scorer1" + assert result.metadata["count"] == 5 + assert result.metadata["threshold"] == 0.8 diff --git a/tests/unit/score/test_scorer_metrics_io.py b/tests/unit/score/test_scorer_metrics_io.py new file mode 100644 index 000000000..9ed4e4138 --- /dev/null +++ b/tests/unit/score/test_scorer_metrics_io.py @@ -0,0 +1,414 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import json +from pathlib import Path +from unittest.mock import patch + +from pyrit.identifiers import ComponentIdentifier +from pyrit.score.scorer_evaluation.scorer_metrics import ( + HarmScorerMetrics, + ObjectiveScorerMetrics, + ScorerMetricsWithIdentity, +) +from pyrit.score.scorer_evaluation.scorer_metrics_io import ( + _append_jsonl_entry, + _load_jsonl, + _metrics_to_registry_dict, + add_evaluation_results, + find_harm_metrics_by_eval_hash, + find_objective_metrics_by_eval_hash, + get_all_harm_metrics, + get_all_objective_metrics, + replace_evaluation_results, +) + + +def _make_identifier(*, class_name: str = "TestScorer") -> ComponentIdentifier: + return ComponentIdentifier( + class_name=class_name, + class_module="pyrit.score.test", + params={"model_name": "gpt-4"}, + ) + + +def _make_objective_metrics(**overrides) -> ObjectiveScorerMetrics: + defaults = { + "num_responses": 100, + "num_human_raters": 3, + "accuracy": 0.92, + "accuracy_standard_error": 0.02, + "f1_score": 0.91, + "precision": 0.93, + "recall": 0.90, + } + defaults.update(overrides) + return ObjectiveScorerMetrics(**defaults) + + +def _make_harm_metrics(**overrides) -> HarmScorerMetrics: + defaults = { + "num_responses": 50, + "num_human_raters": 2, + "mean_absolute_error": 0.08, + "mae_standard_error": 0.01, + "t_statistic": 1.5, + "p_value": 0.13, + "krippendorff_alpha_combined": 0.85, + } + defaults.update(overrides) + return HarmScorerMetrics(**defaults) + + +def _write_jsonl(path: Path, entries: list[dict]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + with open(path, "w", encoding="utf-8") as f: + for entry in entries: + f.write(json.dumps(entry) + "\n") + + +# --- _load_jsonl tests --- + + +def test_load_jsonl_file_not_found(tmp_path): + result = _load_jsonl(tmp_path / "missing.jsonl") + assert result == [] + + +def test_load_jsonl_valid_entries(tmp_path): + path = tmp_path / "data.jsonl" + entries = [{"a": 1}, {"b": 2}] + _write_jsonl(path, entries) + result = _load_jsonl(path) + assert result == entries + + +def test_load_jsonl_skips_invalid_json(tmp_path): + path = tmp_path / "data.jsonl" + path.write_text('{"valid": true}\nnot json\n{"also_valid": true}\n', encoding="utf-8") + result = _load_jsonl(path) + assert len(result) == 2 + assert result[0] == {"valid": True} + assert result[1] == {"also_valid": True} + + +def test_load_jsonl_skips_blank_lines(tmp_path): + path = tmp_path / "data.jsonl" + path.write_text('{"a": 1}\n\n\n{"b": 2}\n', encoding="utf-8") + result = _load_jsonl(path) + assert len(result) == 2 + + +# --- _append_jsonl_entry tests --- + + +def test_append_jsonl_entry_creates_file(tmp_path): + import threading + + path = tmp_path / "subdir" / "out.jsonl" + lock = threading.Lock() + entry = {"key": "value"} + _append_jsonl_entry(file_path=path, lock=lock, entry=entry) + + assert path.exists() + lines = path.read_text(encoding="utf-8").strip().split("\n") + assert len(lines) == 1 + assert json.loads(lines[0]) == entry + + +def test_append_jsonl_entry_appends(tmp_path): + import threading + + path = tmp_path / "out.jsonl" + _write_jsonl(path, [{"first": 1}]) + lock = threading.Lock() + _append_jsonl_entry(file_path=path, lock=lock, entry={"second": 2}) + + lines = path.read_text(encoding="utf-8").strip().split("\n") + assert len(lines) == 2 + + +# --- _metrics_to_registry_dict tests --- + + +def test_metrics_to_registry_dict_excludes_trial_scores(): + metrics = _make_objective_metrics() + result = _metrics_to_registry_dict(metrics) + assert "trial_scores" not in result + + +def test_metrics_to_registry_dict_excludes_none_values(): + metrics = _make_objective_metrics(average_score_time_seconds=None, dataset_name=None) + result = _metrics_to_registry_dict(metrics) + assert "average_score_time_seconds" not in result + assert "dataset_name" not in result + + +def test_metrics_to_registry_dict_excludes_private_fields(): + metrics = _make_harm_metrics() + result = _metrics_to_registry_dict(metrics) + assert "_harm_definition_obj" not in result + + +def test_metrics_to_registry_dict_includes_values(): + metrics = _make_objective_metrics() + result = _metrics_to_registry_dict(metrics) + assert result["accuracy"] == 0.92 + assert result["f1_score"] == 0.91 + assert result["num_responses"] == 100 + + +# --- find_objective_metrics_by_eval_hash tests --- + + +def test_find_objective_metrics_by_eval_hash_found(tmp_path): + identifier = _make_identifier() + entry = identifier.to_dict() + entry["eval_hash"] = "hash_abc" + entry["metrics"] = _metrics_to_registry_dict(_make_objective_metrics(accuracy=0.88)) + path = tmp_path / "objective_achieved_metrics.jsonl" + _write_jsonl(path, [entry]) + + result = find_objective_metrics_by_eval_hash(eval_hash="hash_abc", file_path=path) + assert result is not None + assert result.accuracy == 0.88 + + +def test_find_objective_metrics_by_eval_hash_not_found(tmp_path): + path = tmp_path / "objective_achieved_metrics.jsonl" + _write_jsonl(path, []) + result = find_objective_metrics_by_eval_hash(eval_hash="missing", file_path=path) + assert result is None + + +def test_find_objective_metrics_by_eval_hash_missing_file(tmp_path): + result = find_objective_metrics_by_eval_hash(eval_hash="nope", file_path=tmp_path / "nonexistent.jsonl") + assert result is None + + +def test_find_objective_metrics_default_path(): + with patch("pyrit.score.scorer_evaluation.scorer_metrics_io._load_jsonl", return_value=[]) as mock_load: + result = find_objective_metrics_by_eval_hash(eval_hash="test_hash") + assert result is None + call_args = mock_load.call_args[0][0] + assert "objective" in str(call_args) + assert "objective_achieved_metrics.jsonl" in str(call_args) + + +# --- find_harm_metrics_by_eval_hash tests --- + + +def test_find_harm_metrics_by_eval_hash_found(): + identifier = _make_identifier() + entry = identifier.to_dict() + entry["eval_hash"] = "harm_hash" + entry["metrics"] = _metrics_to_registry_dict(_make_harm_metrics(mean_absolute_error=0.12)) + + with patch("pyrit.score.scorer_evaluation.scorer_metrics_io._load_jsonl") as mock_load: + mock_load.return_value = [entry] + result = find_harm_metrics_by_eval_hash(eval_hash="harm_hash", harm_category="hate_speech") + assert result is not None + assert result.mean_absolute_error == 0.12 + + +def test_find_harm_metrics_by_eval_hash_not_found(): + with patch("pyrit.score.scorer_evaluation.scorer_metrics_io._load_jsonl", return_value=[]): + result = find_harm_metrics_by_eval_hash(eval_hash="missing", harm_category="violence") + assert result is None + + +# --- get_all_objective_metrics tests --- + + +def test_get_all_objective_metrics_from_file(tmp_path): + identifier = _make_identifier(class_name="Scorer1") + metrics = _make_objective_metrics() + entry = identifier.to_dict() + entry["eval_hash"] = "h1" + entry["metrics"] = _metrics_to_registry_dict(metrics) + path = tmp_path / "objective_achieved_metrics.jsonl" + _write_jsonl(path, [entry]) + + results = get_all_objective_metrics(file_path=path) + assert len(results) == 1 + assert isinstance(results[0], ScorerMetricsWithIdentity) + assert results[0].metrics.accuracy == 0.92 + assert results[0].scorer_identifier.class_name == "Scorer1" + + +def test_get_all_objective_metrics_empty_file(tmp_path): + path = tmp_path / "empty.jsonl" + _write_jsonl(path, []) + results = get_all_objective_metrics(file_path=path) + assert results == [] + + +def test_get_all_objective_metrics_default_path(): + with patch("pyrit.score.scorer_evaluation.scorer_metrics_io._load_metrics_from_file", return_value=[]) as mock_load: + results = get_all_objective_metrics() + assert results == [] + call_path = mock_load.call_args[1]["file_path"] + assert "objective_achieved_metrics.jsonl" in str(call_path) + + +# --- get_all_harm_metrics tests --- + + +def test_get_all_harm_metrics(): + identifier = _make_identifier() + metrics = _make_harm_metrics() + entry = identifier.to_dict() + entry["metrics"] = _metrics_to_registry_dict(metrics) + + with patch("pyrit.score.scorer_evaluation.scorer_metrics_io._load_jsonl") as mock_load: + mock_load.return_value = [entry] + results = get_all_harm_metrics(harm_category="hate_speech") + assert len(results) == 1 + assert results[0].metrics.mean_absolute_error == 0.08 + + +# --- add_evaluation_results tests --- + + +def test_add_evaluation_results_creates_entry(tmp_path): + import pyrit.score.scorer_evaluation.scorer_metrics_io as sio + + original_locks = sio._file_write_locks.copy() + try: + path = tmp_path / "objective" / "test_metrics.jsonl" + identifier = _make_identifier() + metrics = _make_objective_metrics() + + add_evaluation_results( + file_path=path, + scorer_identifier=identifier, + eval_hash="eval_abc", + metrics=metrics, + ) + + assert path.exists() + entries = _load_jsonl(path) + assert len(entries) == 1 + assert entries[0]["eval_hash"] == "eval_abc" + assert entries[0]["metrics"]["accuracy"] == 0.92 + assert entries[0]["class_name"] == "TestScorer" + finally: + sio._file_write_locks = original_locks + + +def test_add_evaluation_results_appends_multiple(tmp_path): + import pyrit.score.scorer_evaluation.scorer_metrics_io as sio + + original_locks = sio._file_write_locks.copy() + try: + path = tmp_path / "test_metrics.jsonl" + + add_evaluation_results( + file_path=path, + scorer_identifier=_make_identifier(class_name="Scorer1"), + eval_hash="h1", + metrics=_make_objective_metrics(accuracy=0.80), + ) + add_evaluation_results( + file_path=path, + scorer_identifier=_make_identifier(class_name="Scorer2"), + eval_hash="h2", + metrics=_make_objective_metrics(accuracy=0.90), + ) + + entries = _load_jsonl(path) + assert len(entries) == 2 + assert entries[0]["eval_hash"] == "h1" + assert entries[1]["eval_hash"] == "h2" + finally: + sio._file_write_locks = original_locks + + +# --- replace_evaluation_results tests --- + + +def test_replace_evaluation_results_replaces_existing(tmp_path): + import pyrit.score.scorer_evaluation.scorer_metrics_io as sio + + original_locks = sio._file_write_locks.copy() + try: + path = tmp_path / "test_metrics.jsonl" + identifier = _make_identifier() + + add_evaluation_results( + file_path=path, + scorer_identifier=identifier, + eval_hash="h1", + metrics=_make_objective_metrics(accuracy=0.80), + ) + + replace_evaluation_results( + file_path=path, + scorer_identifier=identifier, + eval_hash="h1", + metrics=_make_objective_metrics(accuracy=0.95), + ) + + entries = _load_jsonl(path) + assert len(entries) == 1 + assert entries[0]["metrics"]["accuracy"] == 0.95 + finally: + sio._file_write_locks = original_locks + + +def test_replace_evaluation_results_adds_when_not_exists(tmp_path): + import pyrit.score.scorer_evaluation.scorer_metrics_io as sio + + original_locks = sio._file_write_locks.copy() + try: + path = tmp_path / "test_metrics.jsonl" + + replace_evaluation_results( + file_path=path, + scorer_identifier=_make_identifier(), + eval_hash="new_hash", + metrics=_make_objective_metrics(accuracy=0.85), + ) + + entries = _load_jsonl(path) + assert len(entries) == 1 + assert entries[0]["eval_hash"] == "new_hash" + finally: + sio._file_write_locks = original_locks + + +def test_replace_evaluation_results_preserves_other_entries(tmp_path): + import pyrit.score.scorer_evaluation.scorer_metrics_io as sio + + original_locks = sio._file_write_locks.copy() + try: + path = tmp_path / "test_metrics.jsonl" + + add_evaluation_results( + file_path=path, + scorer_identifier=_make_identifier(class_name="A"), + eval_hash="keep_me", + metrics=_make_objective_metrics(accuracy=0.70), + ) + add_evaluation_results( + file_path=path, + scorer_identifier=_make_identifier(class_name="B"), + eval_hash="replace_me", + metrics=_make_objective_metrics(accuracy=0.80), + ) + + replace_evaluation_results( + file_path=path, + scorer_identifier=_make_identifier(class_name="B_new"), + eval_hash="replace_me", + metrics=_make_objective_metrics(accuracy=0.99), + ) + + entries = _load_jsonl(path) + assert len(entries) == 2 + hashes = {e["eval_hash"] for e in entries} + assert hashes == {"keep_me", "replace_me"} + replaced = [e for e in entries if e["eval_hash"] == "replace_me"][0] + assert replaced["metrics"]["accuracy"] == 0.99 + finally: + sio._file_write_locks = original_locks diff --git a/tests/unit/score/test_scorer_printer.py b/tests/unit/score/test_scorer_printer.py new file mode 100644 index 000000000..edd8b6a26 --- /dev/null +++ b/tests/unit/score/test_scorer_printer.py @@ -0,0 +1,42 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import pytest + +from pyrit.identifiers import ComponentIdentifier +from pyrit.score.printer.scorer_printer import ScorerPrinter + + +def test_scorer_printer_cannot_be_instantiated(): + with pytest.raises(TypeError, match="Can't instantiate abstract class"): + ScorerPrinter() # type: ignore[abstract] + + +def test_scorer_printer_subclass_must_implement_print_objective_scorer(): + class IncompletePrinter(ScorerPrinter): + def print_harm_scorer(self, scorer_identifier: ComponentIdentifier, *, harm_category: str) -> None: + pass + + with pytest.raises(TypeError, match="Can't instantiate abstract class"): + IncompletePrinter() # type: ignore[abstract] + + +def test_scorer_printer_subclass_must_implement_print_harm_scorer(): + class IncompletePrinter(ScorerPrinter): + def print_objective_scorer(self, *, scorer_identifier: ComponentIdentifier) -> None: + pass + + with pytest.raises(TypeError, match="Can't instantiate abstract class"): + IncompletePrinter() # type: ignore[abstract] + + +def test_scorer_printer_complete_subclass_can_be_instantiated(): + class CompletePrinter(ScorerPrinter): + def print_objective_scorer(self, *, scorer_identifier: ComponentIdentifier) -> None: + pass + + def print_harm_scorer(self, scorer_identifier: ComponentIdentifier, *, harm_category: str) -> None: + pass + + printer = CompletePrinter() + assert isinstance(printer, ScorerPrinter)