Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1132,6 +1132,7 @@
},
{
"cell_type": "markdown",
"id": "copyright-e2f10038a74449cba590e46511b2368c",
"metadata": {},
"source": [
"<!-- VALIDMIND COPYRIGHT -->\n",
Expand Down
1,963 changes: 982 additions & 981 deletions notebooks/how_to/data_and_datasets/use_dataset_model_objects.ipynb

Large diffs are not rendered by default.

1,577 changes: 789 additions & 788 deletions notebooks/how_to/tests/explore_tests/explore_test_suites.ipynb

Large diffs are not rendered by default.

8,869 changes: 4,435 additions & 4,434 deletions notebooks/how_to/tests/explore_tests/explore_tests.ipynb

Large diffs are not rendered by default.

1,350 changes: 675 additions & 675 deletions notebooks/how_to/tests/run_tests/configure_tests/enable_pii_detection.ipynb

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -960,6 +960,7 @@
},
{
"cell_type": "markdown",
"id": "copyright-75cfc55507924d27b0d37b140c473293",
"metadata": {},
"source": [
"<!-- VALIDMIND COPYRIGHT -->\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1205,6 +1205,7 @@
},
{
"cell_type": "markdown",
"id": "copyright-b55920b2495443d1894125f60e582bb4",
"metadata": {},
"source": [
"<!-- VALIDMIND COPYRIGHT -->\n",
Expand Down
8 changes: 6 additions & 2 deletions notebooks/use_cases/agents/banking_tools.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import Optional
from datetime import datetime
from langchain.tools import tool
from deepeval.tracing import observe


def _score_dti_ratio(dti_ratio: float) -> int:
Expand Down Expand Up @@ -79,6 +80,7 @@ def _get_credit_description(credit_score: int) -> str:

# Credit Risk Analyzer Tool
@tool
@observe(type="tool")
def credit_risk_analyzer(
customer_income: float,
customer_debt: float,
Expand Down Expand Up @@ -279,8 +281,8 @@ def _handle_recommend_product(customer):

def _handle_get_info(customer, customer_id):
"""Handle get info action."""
credit_tier = ('Excellent' if customer['credit_score'] >= 750 else
'Good' if customer['credit_score'] >= 700 else
credit_tier = ('Excellent' if customer['credit_score'] >= 750 else
'Good' if customer['credit_score'] >= 700 else
'Fair' if customer['credit_score'] >= 650 else 'Poor')

return f"""CUSTOMER ACCOUNT INFORMATION
Expand Down Expand Up @@ -308,6 +310,7 @@ def _handle_get_info(customer, customer_id):

# Customer Account Manager Tool
@tool
@observe(type="tool")
def customer_account_manager(
account_type: str,
customer_id: str,
Expand Down Expand Up @@ -362,6 +365,7 @@ def customer_account_manager(

# Fraud Detection System Tool
@tool
@observe(type="tool")
def fraud_detection_system(
transaction_id: str,
customer_id: str,
Expand Down
4,372 changes: 2,176 additions & 2,196 deletions notebooks/use_cases/agents/document_agentic_ai.ipynb

Large diffs are not rendered by default.

68 changes: 4 additions & 64 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ all = [
"bert-score (>=0.3.13)",
"arch",
"shap (>=0.46.0)",
"scorecardpy (>=0.1.9.6,<0.2.0)",
"scorecardpy==0.1.9.6",
]
huggingface = [
"transformers (>=4.32.0,<5.0.0)",
Expand Down Expand Up @@ -81,7 +81,7 @@ pytorch = ["torch (>=2.0.0)"]
stats = ["scipy", "statsmodels", "arch"]
xgboost = ["xgboost (>=1.5.2,<3)"]
explainability = ["shap (>=0.46.0)"]
credit_risk = ["scorecardpy (>=0.1.9.6,<0.2.0)"]
credit_risk = ["scorecardpy==0.1.9.6"]
datasets = ["datasets (>=2.10.0,<3.0.0)"]
pii-detection = ["presidio-analyzer", "presidio-structured"]

Expand Down
9 changes: 7 additions & 2 deletions tests/unit_tests/data_validation/test_WOEBinPlots.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,16 @@
import pandas as pd
import validmind as vm
import plotly.graph_objs as go
from validmind.errors import SkipTestError
from validmind.tests.data_validation.WOEBinPlots import WOEBinPlots
from validmind.errors import MissingDependencyError, SkipTestError
from validmind import RawData

try:
from validmind.tests.data_validation.WOEBinPlots import WOEBinPlots
except MissingDependencyError:
WOEBinPlots = None


@unittest.skipIf(WOEBinPlots is None, "scorecardpy is not installed")
class TestWOEBinPlots(unittest.TestCase):
def setUp(self):
# Create a sample dataset with categorical features and binary target
Expand Down
10 changes: 8 additions & 2 deletions tests/unit_tests/data_validation/test_WOEBinTable.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
import unittest
import pandas as pd
import validmind as vm
from validmind.errors import SkipTestError
from validmind.tests.data_validation.WOEBinTable import WOEBinTable, RawData
from validmind.errors import MissingDependencyError, SkipTestError
from validmind import RawData

try:
from validmind.tests.data_validation.WOEBinTable import WOEBinTable
except MissingDependencyError:
WOEBinTable = None


@unittest.skipIf(WOEBinTable is None, "scorecardpy is not installed")
class TestWOEBinTable(unittest.TestCase):
def setUp(self):
# Create a sample dataset with categorical and numeric features and binary target
Expand Down
14 changes: 6 additions & 8 deletions validmind/datasets/credit_risk/lending_club.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,12 @@
try:
import scorecardpy as sc
except ImportError as e:
if "scorecardpy" in str(e):
raise MissingDependencyError(
"Missing required package `scorecardpy` for credit risk demos. "
"Please run `pip install validmind[credit_risk]` or `pip install scorecardpy`.",
required_dependencies=["scorecardpy"],
extra="credit_risk",
) from e
raise e
raise MissingDependencyError(
"Missing required package `scorecardpy` for credit risk demos. "
"Please run `pip install validmind[credit_risk]` or `pip install scorecardpy`.",
required_dependencies=["scorecardpy"],
extra="credit_risk",
) from e

current_path = os.path.dirname(os.path.abspath(__file__))
dataset_path = os.path.join(current_path, "datasets")
Expand Down
60 changes: 23 additions & 37 deletions validmind/scorers/llm/deepeval/ArgumentCorrectness.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,7 @@ def ArgumentCorrectness(
dataset: VMDataset,
threshold: float = 0.7,
input_column: str = "input",
tools_called_column: str = "tools_called",
agent_output_column: str = "agent_output",
actual_output_column: str = "actual_output",
strict_mode: bool = False,
actual_tools_called_column: str = "tools_called",
) -> List[Dict[str, Any]]:
"""Evaluates agent argument correctness using deepeval's ArgumentCorrectnessMetric.

Expand All @@ -55,8 +52,15 @@ def ArgumentCorrectness(
evaluates argument correctness based on the input context rather than comparing
against expected values.

When ``model`` is provided, the agent is run per row inside deepeval's evals_iterator
so the metric receives trace data. Without ``model``, the dataset-only path uses
pre-computed columns.

Args:
dataset: Dataset containing the agent input and tool calls
model: Optional ValidMind model (agent) with predict_fn. When provided, the
agent is run per row inside deepeval's evals_iterator so the metric
receives trace data.
threshold: Minimum passing threshold (default: 0.7)
input_column: Column name for the task input (default: "input")
tools_called_column: Column name for tools called (default: "tools_called")
Expand All @@ -69,57 +73,39 @@ def ArgumentCorrectness(
Raises:
ValueError: If required columns are missing
"""
# Validate required columns exist in dataset
from validmind.scorers.llm.deepeval import _convert_to_tool_call_list

missing_columns: List[str] = []
if input_column not in dataset._df.columns:
missing_columns.append(input_column)

if actual_tools_called_column not in dataset._df.columns:
missing_columns.append(actual_tools_called_column)
if missing_columns:
raise ValueError(
f"Required columns {missing_columns} not found in dataset. "
f"ToolCorrectness with model requires columns {missing_columns}. "
f"Available columns: {dataset._df.columns.tolist()}"
)

_, model = get_client_and_model()

metric = ArgumentCorrectnessMetric(
threshold=threshold,
model=model,
include_reason=True,
strict_mode=strict_mode,
verbose_mode=False,
)

# Import helper functions to avoid circular import
from validmind.scorers.llm.deepeval import (
_convert_to_tool_call_list,
extract_tool_calls_from_agent_output,
)

_, llm_model = get_client_and_model()
results: List[Dict[str, Any]] = []
for _, row in dataset._df.iterrows():
input_value = row[input_column]

# Extract tools called
if tools_called_column in dataset._df.columns:
tools_called_value = row.get(tools_called_column, [])
else:
agent_output = row.get(agent_output_column, {})
tools_called_value = extract_tool_calls_from_agent_output(agent_output)
tools_called_list = _convert_to_tool_call_list(tools_called_value)
for _, row in dataset._df.iterrows():
actual_tools_value = row.get(actual_tools_called_column, [])
actual_tools_list = _convert_to_tool_call_list(actual_tools_value)

actual_output_value = row.get(actual_output_column, "")
metric = ArgumentCorrectnessMetric(
threshold=threshold,
model=llm_model,
)

test_case = LLMTestCase(
input=input_value,
tools_called=tools_called_list,
actual_output=actual_output_value,
input=row[input_column],
tools_called=actual_tools_list,
)

result = evaluate(test_cases=[test_case], metrics=[metric])
metric_data = result.test_results[0].metrics_data[0]
score = metric_data.score
reason = getattr(metric_data, "reason", "No reason provided")
results.append({"score": score, "reason": reason})

return results
Loading