validmind · AnilSorathiya · Feb 19, 2026 · Feb 13, 2026 · Feb 13, 2026 · Feb 13, 2026
diff --git a/notebooks/code_sharing/operational_deposit/operational_deposit_poc.ipynb b/notebooks/code_sharing/operational_deposit/operational_deposit_poc.ipynb
@@ -1132,6 +1132,7 @@
   },
   {
    "cell_type": "markdown",
+   "id": "copyright-e2f10038a74449cba590e46511b2368c",
    "metadata": {},
    "source": [
     "<!-- VALIDMIND COPYRIGHT -->\n",

diff --git a/notebooks/how_to/data_and_datasets/use_dataset_model_objects.ipynb b/notebooks/how_to/data_and_datasets/use_dataset_model_objects.ipynb
diff --git a/notebooks/how_to/tests/explore_tests/explore_test_suites.ipynb b/notebooks/how_to/tests/explore_tests/explore_test_suites.ipynb
diff --git a/notebooks/how_to/tests/explore_tests/explore_tests.ipynb b/notebooks/how_to/tests/explore_tests/explore_tests.ipynb
diff --git a/notebooks/how_to/tests/run_tests/configure_tests/enable_pii_detection.ipynb b/notebooks/how_to/tests/run_tests/configure_tests/enable_pii_detection.ipynb
diff --git a/notebooks/tutorials/model_development/4-finalize_testing_documentation.ipynb b/notebooks/tutorials/model_development/4-finalize_testing_documentation.ipynb
@@ -960,6 +960,7 @@
   },
   {
    "cell_type": "markdown",
+   "id": "copyright-75cfc55507924d27b0d37b140c473293",
    "metadata": {},
    "source": [
     "<!-- VALIDMIND COPYRIGHT -->\n",

diff --git a/notebooks/tutorials/model_validation/4-finalize_validation_reporting.ipynb b/notebooks/tutorials/model_validation/4-finalize_validation_reporting.ipynb
@@ -1205,6 +1205,7 @@
   },
   {
    "cell_type": "markdown",
+   "id": "copyright-b55920b2495443d1894125f60e582bb4",
    "metadata": {},
    "source": [
     "<!-- VALIDMIND COPYRIGHT -->\n",

diff --git a/notebooks/use_cases/agents/banking_tools.py b/notebooks/use_cases/agents/banking_tools.py
@@ -1,6 +1,7 @@
 from typing import Optional
 from datetime import datetime
 from langchain.tools import tool
+from deepeval.tracing import observe
 
 
 def _score_dti_ratio(dti_ratio: float) -> int:
@@ -79,6 +80,7 @@ def _get_credit_description(credit_score: int) -> str:
 
 # Credit Risk Analyzer Tool
 @tool
+@observe(type="tool")
 def credit_risk_analyzer(
     customer_income: float,
     customer_debt: float,
@@ -279,8 +281,8 @@ def _handle_recommend_product(customer):
 
 def _handle_get_info(customer, customer_id):
     """Handle get info action."""
-    credit_tier = ('Excellent' if customer['credit_score'] >= 750 else
-                   'Good' if customer['credit_score'] >= 700 else
+    credit_tier = ('Excellent' if customer['credit_score'] >= 750 else 
+                   'Good' if customer['credit_score'] >= 700 else 
                    'Fair' if customer['credit_score'] >= 650 else 'Poor')
 
     return f"""CUSTOMER ACCOUNT INFORMATION
@@ -308,6 +310,7 @@ def _handle_get_info(customer, customer_id):
 
 # Customer Account Manager Tool
 @tool
+@observe(type="tool")
 def customer_account_manager(
     account_type: str,
     customer_id: str,
@@ -362,6 +365,7 @@ def customer_account_manager(
 
 # Fraud Detection System Tool
 @tool
+@observe(type="tool")
 def fraud_detection_system(
     transaction_id: str,
     customer_id: str,

diff --git a/notebooks/use_cases/agents/document_agentic_ai.ipynb b/notebooks/use_cases/agents/document_agentic_ai.ipynb
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -53,7 +53,7 @@ all = [
   "bert-score (>=0.3.13)",
   "arch",
   "shap (>=0.46.0)",
-  "scorecardpy (>=0.1.9.6,<0.2.0)",
+  "scorecardpy==0.1.9.6",
 ]
 huggingface = [
   "transformers (>=4.32.0,<5.0.0)",
@@ -81,7 +81,7 @@ pytorch = ["torch (>=2.0.0)"]
 stats = ["scipy", "statsmodels", "arch"]
 xgboost = ["xgboost (>=1.5.2,<3)"]
 explainability = ["shap (>=0.46.0)"]
-credit_risk = ["scorecardpy (>=0.1.9.6,<0.2.0)"]
+credit_risk = ["scorecardpy==0.1.9.6"]
 datasets = ["datasets (>=2.10.0,<3.0.0)"]
 pii-detection = ["presidio-analyzer", "presidio-structured"]
 

diff --git a/tests/unit_tests/data_validation/test_WOEBinPlots.py b/tests/unit_tests/data_validation/test_WOEBinPlots.py
@@ -2,11 +2,16 @@
 import pandas as pd
 import validmind as vm
 import plotly.graph_objs as go
-from validmind.errors import SkipTestError
-from validmind.tests.data_validation.WOEBinPlots import WOEBinPlots
+from validmind.errors import MissingDependencyError, SkipTestError
 from validmind import RawData
 
+try:
+    from validmind.tests.data_validation.WOEBinPlots import WOEBinPlots
+except MissingDependencyError:
+    WOEBinPlots = None
 
+
+@unittest.skipIf(WOEBinPlots is None, "scorecardpy is not installed")
 class TestWOEBinPlots(unittest.TestCase):
     def setUp(self):
         # Create a sample dataset with categorical features and binary target

diff --git a/tests/unit_tests/data_validation/test_WOEBinTable.py b/tests/unit_tests/data_validation/test_WOEBinTable.py
@@ -1,10 +1,16 @@
 import unittest
 import pandas as pd
 import validmind as vm
-from validmind.errors import SkipTestError
-from validmind.tests.data_validation.WOEBinTable import WOEBinTable, RawData
+from validmind.errors import MissingDependencyError, SkipTestError
+from validmind import RawData
 
+try:
+    from validmind.tests.data_validation.WOEBinTable import WOEBinTable
+except MissingDependencyError:
+    WOEBinTable = None
 
+
+@unittest.skipIf(WOEBinTable is None, "scorecardpy is not installed")
 class TestWOEBinTable(unittest.TestCase):
     def setUp(self):
         # Create a sample dataset with categorical and numeric features and binary target

diff --git a/validmind/datasets/credit_risk/lending_club.py b/validmind/datasets/credit_risk/lending_club.py
@@ -20,14 +20,12 @@
 try:
     import scorecardpy as sc
 except ImportError as e:
-    if "scorecardpy" in str(e):
-        raise MissingDependencyError(
-            "Missing required package `scorecardpy` for credit risk demos. "
-            "Please run `pip install validmind[credit_risk]` or `pip install scorecardpy`.",
-            required_dependencies=["scorecardpy"],
-            extra="credit_risk",
-        ) from e
-    raise e
+    raise MissingDependencyError(
+        "Missing required package `scorecardpy` for credit risk demos. "
+        "Please run `pip install validmind[credit_risk]` or `pip install scorecardpy`.",
+        required_dependencies=["scorecardpy"],
+        extra="credit_risk",
+    ) from e
 
 current_path = os.path.dirname(os.path.abspath(__file__))
 dataset_path = os.path.join(current_path, "datasets")

diff --git a/validmind/scorers/llm/deepeval/ArgumentCorrectness.py b/validmind/scorers/llm/deepeval/ArgumentCorrectness.py
@@ -40,10 +40,7 @@ def ArgumentCorrectness(
     dataset: VMDataset,
     threshold: float = 0.7,
     input_column: str = "input",
-    tools_called_column: str = "tools_called",
-    agent_output_column: str = "agent_output",
-    actual_output_column: str = "actual_output",
-    strict_mode: bool = False,
+    actual_tools_called_column: str = "tools_called",
 ) -> List[Dict[str, Any]]:
     """Evaluates agent argument correctness using deepeval's ArgumentCorrectnessMetric.
 
@@ -55,8 +52,15 @@ def ArgumentCorrectness(
     evaluates argument correctness based on the input context rather than comparing
     against expected values.
 
+    Each row is scored independently: a fresh metric is built per row and evaluated
+    against the pre-computed tool calls stored in ``actual_tools_called_column``. The
+    agent itself is not executed by this scorer.
+
     Args:
         dataset: Dataset containing the agent input and tool calls
+        actual_tools_called_column: Column holding the tool calls the agent actually
+            made (default: "tools_called"). The column is required; a ValueError is
+            raised when it is missing.
         threshold: Minimum passing threshold (default: 0.7)
         input_column: Column name for the task input (default: "input")
         tools_called_column: Column name for tools called (default: "tools_called")
@@ -69,57 +73,39 @@ def ArgumentCorrectness(
     Raises:
         ValueError: If required columns are missing
     """
-    # Validate required columns exist in dataset
+    from validmind.scorers.llm.deepeval import _convert_to_tool_call_list
+
     missing_columns: List[str] = []
     if input_column not in dataset._df.columns:
         missing_columns.append(input_column)
-
+    if actual_tools_called_column not in dataset._df.columns:
+        missing_columns.append(actual_tools_called_column)
     if missing_columns:
         raise ValueError(
-            f"Required columns {missing_columns} not found in dataset. "
+            f"ToolCorrectness with model requires columns {missing_columns}. "
             f"Available columns: {dataset._df.columns.tolist()}"
         )
 
-    _, model = get_client_and_model()
-
-    metric = ArgumentCorrectnessMetric(
-        threshold=threshold,
-        model=model,
-        include_reason=True,
-        strict_mode=strict_mode,
-        verbose_mode=False,
-    )
-
-    # Import helper functions to avoid circular import
-    from validmind.scorers.llm.deepeval import (
-        _convert_to_tool_call_list,
-        extract_tool_calls_from_agent_output,
-    )
-
+    _, llm_model = get_client_and_model()
     results: List[Dict[str, Any]] = []
-    for _, row in dataset._df.iterrows():
-        input_value = row[input_column]
 
-        # Extract tools called
-        if tools_called_column in dataset._df.columns:
-            tools_called_value = row.get(tools_called_column, [])
-        else:
-            agent_output = row.get(agent_output_column, {})
-            tools_called_value = extract_tool_calls_from_agent_output(agent_output)
-        tools_called_list = _convert_to_tool_call_list(tools_called_value)
+    for _, row in dataset._df.iterrows():
+        actual_tools_value = row.get(actual_tools_called_column, [])
+        actual_tools_list = _convert_to_tool_call_list(actual_tools_value)
 
-        actual_output_value = row.get(actual_output_column, "")
+        metric = ArgumentCorrectnessMetric(
+            threshold=threshold,
+            model=llm_model,
+        )
 
         test_case = LLMTestCase(
-            input=input_value,
-            tools_called=tools_called_list,
-            actual_output=actual_output_value,
+            input=row[input_column],
+            tools_called=actual_tools_list,
         )
 
         result = evaluate(test_cases=[test_case], metrics=[metric])
         metric_data = result.test_results[0].metrics_data[0]
         score = metric_data.score
         reason = getattr(metric_data, "reason", "No reason provided")
         results.append({"score": score, "reason": reason})
-
     return results