From a96d64b65b454717bd47cd2d5b33f64048cde0f2 Mon Sep 17 00:00:00 2001 From: Juan Date: Fri, 27 Feb 2026 21:07:39 +0100 Subject: [PATCH 1/2] Align MissingValues Pass/Fail evaluation with the displayed missing-percentage metric --- .../4-finalize_validation_reporting.ipynb | 4 +-- .../application_scorecard_with_bias.ipynb | 2 +- .../application_scorecard_with_ml.ipynb | 6 ++-- ...document_excel_application_scorecard.ipynb | 4 +-- .../validate_application_scorecard.ipynb | 4 +-- ...ication_scorecard_ongoing_monitoring.ipynb | 2 +- .../data_validation/test_MissingValues.py | 6 ++-- .../datasets/credit_risk/lending_club.py | 4 +-- .../tests/data_validation/MissingValues.py | 29 +++++++++++-------- 9 files changed, 34 insertions(+), 27 deletions(-) diff --git a/notebooks/tutorials/model_validation/4-finalize_validation_reporting.ipynb b/notebooks/tutorials/model_validation/4-finalize_validation_reporting.ipynb index 613de2a45..3eaee5a3a 100644 --- a/notebooks/tutorials/model_validation/4-finalize_validation_reporting.ipynb +++ b/notebooks/tutorials/model_validation/4-finalize_validation_reporting.ipynb @@ -967,7 +967,7 @@ " },\n", " 'validmind.data_validation.MissingValues:raw_data': {\n", " 'inputs': {'dataset': 'raw_dataset'},\n", - " 'params': {'min_threshold': 1}\n", + " 'params': {'min_percentage_threshold': 1}\n", " },\n", " 'validmind.data_validation.ClassImbalance:raw_data': {\n", " 'inputs': {'dataset': 'raw_dataset'},\n", @@ -1010,7 +1010,7 @@ " },\n", " 'validmind.data_validation.MissingValues:preprocessed_data': {\n", " 'inputs': {'dataset': 'raw_dataset_preprocessed'},\n", - " 'params': {'min_threshold': 1}\n", + " 'params': {'min_percentage_threshold': 1}\n", " },\n", " 'validmind.data_validation.TabularNumericalHistograms:preprocessed_data': {\n", " 'inputs': {'dataset': 'raw_dataset_preprocessed'}\n", diff --git a/notebooks/use_cases/credit_risk/application_scorecard_with_bias.ipynb b/notebooks/use_cases/credit_risk/application_scorecard_with_bias.ipynb index 7c2f6a6f3..41769972a 100644 --- a/notebooks/use_cases/credit_risk/application_scorecard_with_bias.ipynb +++ b/notebooks/use_cases/credit_risk/application_scorecard_with_bias.ipynb @@ -760,7 +760,7 @@ " \"dataset\": \"raw_dataset\",\n", " },\n", " params={\n", - " \"min_threshold\": 1,\n", + " \"min_percentage_threshold\": 1,\n", " }\n", ")\n", "test.log()" diff --git a/notebooks/use_cases/credit_risk/application_scorecard_with_ml.ipynb b/notebooks/use_cases/credit_risk/application_scorecard_with_ml.ipynb index 7d85be4fb..cdbb02b0e 100644 --- a/notebooks/use_cases/credit_risk/application_scorecard_with_ml.ipynb +++ b/notebooks/use_cases/credit_risk/application_scorecard_with_ml.ipynb @@ -785,7 +785,7 @@ " \"dataset\": vm_raw_dataset,\n", " },\n", " params={\n", - " \"min_threshold\": 1\n", + " \"min_percentage_threshold\": 1\n", " }\n", ").log()" ] @@ -819,7 +819,7 @@ " \"dataset\": vm_raw_dataset,\n", " },\n", " params={\n", - " \"min_threshold\": 1\n", + " \"min_percentage_threshold\": 1\n", " }\n", ").log()" ] @@ -1747,7 +1747,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ diff --git a/notebooks/use_cases/credit_risk/document_excel_application_scorecard.ipynb b/notebooks/use_cases/credit_risk/document_excel_application_scorecard.ipynb index 22336ad18..252fb1a60 100644 --- a/notebooks/use_cases/credit_risk/document_excel_application_scorecard.ipynb +++ b/notebooks/use_cases/credit_risk/document_excel_application_scorecard.ipynb @@ -603,7 +603,7 @@ " },\n", " 'validmind.data_validation.MissingValues:raw_data': {\n", " 'inputs': {'dataset': 'raw_dataset'},\n", - " 'params': {'min_threshold': 1}\n", + " 'params': {'min_percentage_threshold': 1}\n", " },\n", " 'validmind.data_validation.ClassImbalance:raw_data': {\n", " 'inputs': {'dataset': 'raw_dataset'},\n", @@ -647,7 +647,7 @@ " },\n", " 'validmind.data_validation.MissingValues:preprocessed_data': {\n", " 'inputs': {'dataset': 'preprocess_dataset'},\n", - " 'params': {'min_threshold': 1}\n", + " 'params': {'min_percentage_threshold': 1}\n", " },\n", " 'validmind.data_validation.TabularNumericalHistograms:preprocessed_data': {\n", " 'inputs': {'dataset': 'preprocess_dataset'}\n", diff --git a/notebooks/use_cases/model_validation/validate_application_scorecard.ipynb b/notebooks/use_cases/model_validation/validate_application_scorecard.ipynb index 82b692998..3e20fa1eb 100644 --- a/notebooks/use_cases/model_validation/validate_application_scorecard.ipynb +++ b/notebooks/use_cases/model_validation/validate_application_scorecard.ipynb @@ -1629,7 +1629,7 @@ " },\n", " 'validmind.data_validation.MissingValues:raw_data': {\n", " 'inputs': {'dataset': 'raw_dataset'},\n", - " 'params': {'min_threshold': 1}\n", + " 'params': {'min_percentage_threshold': 1}\n", " },\n", " 'validmind.data_validation.ClassImbalance:raw_data': {\n", " 'inputs': {'dataset': 'raw_dataset'},\n", @@ -1672,7 +1672,7 @@ " },\n", " 'validmind.data_validation.MissingValues:preprocessed_data': {\n", " 'inputs': {'dataset': 'preprocess_dataset'},\n", - " 'params': {'min_threshold': 1}\n", + " 'params': {'min_percentage_threshold': 1}\n", " },\n", " 'validmind.data_validation.TabularNumericalHistograms:preprocessed_data': {\n", " 'inputs': {'dataset': 'preprocess_dataset'}\n", diff --git a/notebooks/use_cases/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb b/notebooks/use_cases/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb index 8ebb5b183..b822bbcde 100644 --- a/notebooks/use_cases/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb +++ b/notebooks/use_cases/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb @@ -623,7 +623,7 @@ " \"dataset\": vm_monitoring_ds,\n", " },\n", " params={\n", - " \"min_threshold\": 1\n", + " \"min_percentage_threshold\": 1\n", " }\n", ").log()" ] diff --git a/tests/unit_tests/data_validation/test_MissingValues.py b/tests/unit_tests/data_validation/test_MissingValues.py index f8d7aa2b0..424edaaba 100644 --- a/tests/unit_tests/data_validation/test_MissingValues.py +++ b/tests/unit_tests/data_validation/test_MissingValues.py @@ -61,7 +61,7 @@ def test_missing_values_counts(self): self.assertEqual(some_missing["Percentage of Missing Values (%)"], 20.0) self.assertEqual(all_missing["Percentage of Missing Values (%)"], 100.0) - # Check Pass/Fail status (with default min_threshold=1) + # Check Pass/Fail status (with default min_percentage_threshold=1.0) self.assertEqual(no_missing["Pass/Fail"], "Pass") self.assertEqual(some_missing["Pass/Fail"], "Fail") self.assertEqual(all_missing["Pass/Fail"], "Fail") @@ -71,7 +71,9 @@ def test_missing_values_counts(self): def test_threshold_parameter(self): # Test with higher threshold that allows some missing values - summary, passed, raw_data = MissingValues(self.vm_dataset, min_threshold=25) + summary, passed, raw_data = MissingValues( + self.vm_dataset, min_percentage_threshold=25 + ) # Get results some_missing = next(s for s in summary if s["Column"] == "some_missing") diff --git a/validmind/datasets/credit_risk/lending_club.py b/validmind/datasets/credit_risk/lending_club.py index 06491631b..d3d99f050 100644 --- a/validmind/datasets/credit_risk/lending_club.py +++ b/validmind/datasets/credit_risk/lending_club.py @@ -514,7 +514,7 @@ def get_demo_test_config( "inputs": { "dataset": "raw_dataset", }, - "params": {"min_threshold": 1}, + "params": {"min_percentage_threshold": 1}, } default_config["validmind.data_validation.ClassImbalance:raw_data"] = { "inputs": { @@ -582,7 +582,7 @@ def get_demo_test_config( "inputs": { "dataset": "preprocess_dataset", }, - "params": {"min_threshold": 1}, + "params": {"min_percentage_threshold": 1}, } default_config[ "validmind.data_validation.TabularNumericalHistograms:preprocessed_data" diff --git a/validmind/tests/data_validation/MissingValues.py b/validmind/tests/data_validation/MissingValues.py index 81db72a36..63b924f88 100644 --- a/validmind/tests/data_validation/MissingValues.py +++ b/validmind/tests/data_validation/MissingValues.py @@ -11,29 +11,30 @@ @tags("tabular_data", "data_quality") @tasks("classification", "regression") def MissingValues( - dataset: VMDataset, min_threshold: int = 1 + dataset: VMDataset, + min_percentage_threshold: float = 1.0, ) -> Tuple[List[Dict[str, Any]], bool, RawData]: """ - Evaluates dataset quality by ensuring missing value ratio across all features does not exceed a set threshold. + Evaluates dataset quality by ensuring missing value percentage across all features does not exceed a set threshold. ### Purpose The Missing Values test is designed to evaluate the quality of a dataset by measuring the number of missing values across all features. The objective is to ensure that the ratio of missing data to total data is less than a - predefined threshold, defaulting to 1, in order to maintain the data quality necessary for reliable predictive - strength in a machine learning model. + predefined threshold (as a percentage), defaulting to 1.0, in order to maintain the data quality necessary for + reliable predictive strength in a machine learning model. ### Test Mechanism The mechanism for this test involves iterating through each column of the dataset, counting missing values (represented as NaNs), and calculating the percentage they represent against the total number of rows. The test - then checks if these missing value counts are less than the predefined `min_threshold`. The results are shown in a - table summarizing each column, the number of missing values, the percentage of missing values in each column, and a - Pass/Fail status based on the threshold comparison. + then checks if the missing value percentage is less than or equal to the predefined `min_percentage_threshold`. The results are + shown in a table summarizing each column, the number of missing values, the percentage of missing values in each + column, and a Pass/Fail status based on the threshold comparison. ### Signs of High Risk - - When the number of missing values in any column exceeds the `min_threshold` value. + - When the missing value percentage in any column exceeds the `min_percentage_threshold` value. - Presence of missing values across many columns, leading to multiple instances of failing the threshold. ### Strengths @@ -45,24 +46,28 @@ def MissingValues( ### Limitations - Does not suggest the root causes of the missing values or recommend ways to impute or handle them. - - May overlook features with significant missing data but still less than the `min_threshold`, potentially + - May overlook features with significant missing data but still less than the `min_percentage_threshold`, potentially impacting the model. - Does not account for data encoded as values like "-999" or "None," which might not technically classify as missing but could bear similar implications. """ df = dataset.df missing = df.isna().sum() + n_rows = df.shape[0] + missing_pct = (missing / n_rows * 100) if n_rows else (missing * 0.0) return ( [ { "Column": col, "Number of Missing Values": missing[col], - "Percentage of Missing Values (%)": missing[col] / df.shape[0] * 100, - "Pass/Fail": "Pass" if missing[col] < min_threshold else "Fail", + "Percentage of Missing Values (%)": missing_pct[col], + "Pass/Fail": "Pass" + if missing_pct[col] <= min_percentage_threshold + else "Fail", } for col in missing.index ], - all(missing[col] < min_threshold for col in missing.index), + all(missing_pct[col] <= min_percentage_threshold for col in missing.index), RawData(missing_values=missing, dataset=dataset.input_id), ) From 61b69d9832ef5406c928d72e5d02675398edbd7f Mon Sep 17 00:00:00 2001 From: Juan Date: Sat, 28 Feb 2026 10:17:16 +0100 Subject: [PATCH 2/2] 2.12.1 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index ee24a9e67..64064607a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "validmind" -version = "2.12.0" +version = "2.12.1" description = "ValidMind Library" readme = "README.pypi.md" requires-python = ">=3.9,<3.13"