Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -967,7 +967,7 @@
" },\n",
" 'validmind.data_validation.MissingValues:raw_data': {\n",
" 'inputs': {'dataset': 'raw_dataset'},\n",
" 'params': {'min_threshold': 1}\n",
" 'params': {'min_percentage_threshold': 1}\n",
" },\n",
" 'validmind.data_validation.ClassImbalance:raw_data': {\n",
" 'inputs': {'dataset': 'raw_dataset'},\n",
Expand Down Expand Up @@ -1010,7 +1010,7 @@
" },\n",
" 'validmind.data_validation.MissingValues:preprocessed_data': {\n",
" 'inputs': {'dataset': 'raw_dataset_preprocessed'},\n",
" 'params': {'min_threshold': 1}\n",
" 'params': {'min_percentage_threshold': 1}\n",
" },\n",
" 'validmind.data_validation.TabularNumericalHistograms:preprocessed_data': {\n",
" 'inputs': {'dataset': 'raw_dataset_preprocessed'}\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -760,7 +760,7 @@
" \"dataset\": \"raw_dataset\",\n",
" },\n",
" params={\n",
" \"min_threshold\": 1,\n",
" \"min_percentage_threshold\": 1,\n",
" }\n",
")\n",
"test.log()"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -785,7 +785,7 @@
" \"dataset\": vm_raw_dataset,\n",
" },\n",
" params={\n",
" \"min_threshold\": 1\n",
" \"min_percentage_threshold\": 1\n",
" }\n",
").log()"
]
Expand Down Expand Up @@ -819,7 +819,7 @@
" \"dataset\": vm_raw_dataset,\n",
" },\n",
" params={\n",
" \"min_threshold\": 1\n",
" \"min_percentage_threshold\": 1\n",
" }\n",
").log()"
]
Expand Down Expand Up @@ -1747,7 +1747,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -603,7 +603,7 @@
" },\n",
" 'validmind.data_validation.MissingValues:raw_data': {\n",
" 'inputs': {'dataset': 'raw_dataset'},\n",
" 'params': {'min_threshold': 1}\n",
" 'params': {'min_percentage_threshold': 1}\n",
" },\n",
" 'validmind.data_validation.ClassImbalance:raw_data': {\n",
" 'inputs': {'dataset': 'raw_dataset'},\n",
Expand Down Expand Up @@ -647,7 +647,7 @@
" },\n",
" 'validmind.data_validation.MissingValues:preprocessed_data': {\n",
" 'inputs': {'dataset': 'preprocess_dataset'},\n",
" 'params': {'min_threshold': 1}\n",
" 'params': {'min_percentage_threshold': 1}\n",
" },\n",
" 'validmind.data_validation.TabularNumericalHistograms:preprocessed_data': {\n",
" 'inputs': {'dataset': 'preprocess_dataset'}\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1629,7 +1629,7 @@
" },\n",
" 'validmind.data_validation.MissingValues:raw_data': {\n",
" 'inputs': {'dataset': 'raw_dataset'},\n",
" 'params': {'min_threshold': 1}\n",
" 'params': {'min_percentage_threshold': 1}\n",
" },\n",
" 'validmind.data_validation.ClassImbalance:raw_data': {\n",
" 'inputs': {'dataset': 'raw_dataset'},\n",
Expand Down Expand Up @@ -1672,7 +1672,7 @@
" },\n",
" 'validmind.data_validation.MissingValues:preprocessed_data': {\n",
" 'inputs': {'dataset': 'preprocess_dataset'},\n",
" 'params': {'min_threshold': 1}\n",
" 'params': {'min_percentage_threshold': 1}\n",
" },\n",
" 'validmind.data_validation.TabularNumericalHistograms:preprocessed_data': {\n",
" 'inputs': {'dataset': 'preprocess_dataset'}\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -623,7 +623,7 @@
" \"dataset\": vm_monitoring_ds,\n",
" },\n",
" params={\n",
" \"min_threshold\": 1\n",
" \"min_percentage_threshold\": 1\n",
" }\n",
").log()"
]
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "validmind"
version = "2.12.0"
version = "2.12.1"
description = "ValidMind Library"
readme = "README.pypi.md"
requires-python = ">=3.9,<3.13"
Expand Down
6 changes: 4 additions & 2 deletions tests/unit_tests/data_validation/test_MissingValues.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def test_missing_values_counts(self):
self.assertEqual(some_missing["Percentage of Missing Values (%)"], 20.0)
self.assertEqual(all_missing["Percentage of Missing Values (%)"], 100.0)

# Check Pass/Fail status (with default min_threshold=1)
# Check Pass/Fail status (with default min_percentage_threshold=1.0)
self.assertEqual(no_missing["Pass/Fail"], "Pass")
self.assertEqual(some_missing["Pass/Fail"], "Fail")
self.assertEqual(all_missing["Pass/Fail"], "Fail")
Expand All @@ -71,7 +71,9 @@ def test_missing_values_counts(self):

def test_threshold_parameter(self):
# Test with higher threshold that allows some missing values
summary, passed, raw_data = MissingValues(self.vm_dataset, min_threshold=25)
summary, passed, raw_data = MissingValues(
self.vm_dataset, min_percentage_threshold=25
)

# Get results
some_missing = next(s for s in summary if s["Column"] == "some_missing")
Expand Down
4 changes: 2 additions & 2 deletions validmind/datasets/credit_risk/lending_club.py
Original file line number Diff line number Diff line change
Expand Up @@ -514,7 +514,7 @@ def get_demo_test_config(
"inputs": {
"dataset": "raw_dataset",
},
"params": {"min_threshold": 1},
"params": {"min_percentage_threshold": 1},
}
default_config["validmind.data_validation.ClassImbalance:raw_data"] = {
"inputs": {
Expand Down Expand Up @@ -582,7 +582,7 @@ def get_demo_test_config(
"inputs": {
"dataset": "preprocess_dataset",
},
"params": {"min_threshold": 1},
"params": {"min_percentage_threshold": 1},
}
default_config[
"validmind.data_validation.TabularNumericalHistograms:preprocessed_data"
Expand Down
29 changes: 17 additions & 12 deletions validmind/tests/data_validation/MissingValues.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,29 +11,30 @@
@tags("tabular_data", "data_quality")
@tasks("classification", "regression")
def MissingValues(
dataset: VMDataset, min_threshold: int = 1
dataset: VMDataset,
min_percentage_threshold: float = 1.0,
) -> Tuple[List[Dict[str, Any]], bool, RawData]:
"""
Evaluates dataset quality by ensuring missing value ratio across all features does not exceed a set threshold.
Evaluates dataset quality by ensuring that the percentage of missing values in each feature does not exceed a set threshold.

### Purpose

The Missing Values test is designed to evaluate the quality of a dataset by measuring the share of missing values
in each feature. The objective is to ensure that the ratio of missing to total values in a column does not exceed a
predefined threshold, defaulting to 1, in order to maintain the data quality necessary for reliable predictive
strength in a machine learning model.
predefined threshold (as a percentage), defaulting to 1.0, in order to maintain the data quality necessary for
reliable predictive strength in a machine learning model.

### Test Mechanism

The mechanism for this test involves iterating through each column of the dataset, counting missing values
(represented as NaNs), and calculating the percentage they represent against the total number of rows. The test
then checks if these missing value counts are less than the predefined `min_threshold`. The results are shown in a
table summarizing each column, the number of missing values, the percentage of missing values in each column, and a
Pass/Fail status based on the threshold comparison.
then checks if the missing value percentage is less than or equal to the predefined `min_percentage_threshold`. The results are
shown in a table summarizing each column, the number of missing values, the percentage of missing values in each
column, and a Pass/Fail status based on the threshold comparison.

### Signs of High Risk

- When the number of missing values in any column exceeds the `min_threshold` value.
- When the missing value percentage in any column exceeds the `min_percentage_threshold` value.
- Presence of missing values across many columns, leading to multiple instances of failing the threshold.

### Strengths
Expand All @@ -45,24 +46,28 @@ def MissingValues(
### Limitations

- Does not suggest the root causes of the missing values or recommend ways to impute or handle them.
- May overlook features with significant missing data but still less than the `min_threshold`, potentially
- May overlook features with significant missing data that is still at or below the `min_percentage_threshold`, potentially
impacting the model.
- Does not account for data encoded as values like "-999" or "None," which might not technically classify as
missing but could bear similar implications.
"""
df = dataset.df
missing = df.isna().sum()
n_rows = df.shape[0]
missing_pct = (missing / n_rows * 100) if n_rows else (missing * 0.0)

return (
[
{
"Column": col,
"Number of Missing Values": missing[col],
"Percentage of Missing Values (%)": missing[col] / df.shape[0] * 100,
"Pass/Fail": "Pass" if missing[col] < min_threshold else "Fail",
"Percentage of Missing Values (%)": missing_pct[col],
"Pass/Fail": "Pass"
if missing_pct[col] <= min_percentage_threshold
else "Fail",
}
for col in missing.index
],
all(missing[col] < min_threshold for col in missing.index),
all(missing_pct[col] <= min_percentage_threshold for col in missing.index),
RawData(missing_values=missing, dataset=dataset.input_id),
)