diff --git a/notebooks/code_sharing/clustering/quickstart_cluster_demo.ipynb b/notebooks/code_sharing/clustering/quickstart_cluster_demo.ipynb index 5fdb3cb21..963bbedb9 100644 --- a/notebooks/code_sharing/clustering/quickstart_cluster_demo.ipynb +++ b/notebooks/code_sharing/clustering/quickstart_cluster_demo.ipynb @@ -112,6 +112,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " #document=\"documentation\",\n", ")" ] }, @@ -449,7 +450,7 @@ }, { "cell_type": "markdown", - "id": "copyright-ab09192f89b6461dab8c821b43a5b591", + "id": "copyright-72162a6186134d199a1bfbbc74f51255", "metadata": {}, "source": [ "\n", diff --git a/notebooks/code_sharing/credit_risk/assign_prediction_probabilities.ipynb b/notebooks/code_sharing/credit_risk/assign_prediction_probabilities.ipynb index 246c0a30f..c031fd556 100644 --- a/notebooks/code_sharing/credit_risk/assign_prediction_probabilities.ipynb +++ b/notebooks/code_sharing/credit_risk/assign_prediction_probabilities.ipynb @@ -161,6 +161,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"documentation\",\n", ")" ] }, @@ -735,7 +736,7 @@ }, { "cell_type": "markdown", - "id": "copyright-ec110f5334fc4d4e917b2c3d9e25fd65", + "id": "copyright-0871b365e3924d0995a276dbbbcf62c1", "metadata": {}, "source": [ "\n", diff --git a/notebooks/code_sharing/deepeval_integration_demo.ipynb b/notebooks/code_sharing/deepeval_integration_demo.ipynb index ac8d0bafc..504b2a402 100644 --- a/notebooks/code_sharing/deepeval_integration_demo.ipynb +++ b/notebooks/code_sharing/deepeval_integration_demo.ipynb @@ -207,6 +207,7 @@ " api_key=\"...\",\n", " api_secret=\"...\",\n", " model=\"...\",\n", + " document=\"documentation\",\n", ")" ] }, @@ -1037,7 +1038,7 @@ }, { "cell_type": "markdown", - "id": "copyright-94c232c772c1435aa4529b67cfcc0bb2", + "id": "copyright-0bacca5c16774a5bb77c86f9b25602da", "metadata": {}, "source": [ "\n", diff --git a/notebooks/code_sharing/embeddings/quickstart_embeddings_demo.ipynb b/notebooks/code_sharing/embeddings/quickstart_embeddings_demo.ipynb index 2aed999fd..efcda264e 100644 --- a/notebooks/code_sharing/embeddings/quickstart_embeddings_demo.ipynb +++ b/notebooks/code_sharing/embeddings/quickstart_embeddings_demo.ipynb @@ -102,6 +102,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"documentation\",\n", ")" ] }, diff --git a/notebooks/code_sharing/geval_deepeval_integration_demo.ipynb b/notebooks/code_sharing/geval_deepeval_integration_demo.ipynb index 5e6d4a24e..8a393e082 100644 --- a/notebooks/code_sharing/geval_deepeval_integration_demo.ipynb +++ b/notebooks/code_sharing/geval_deepeval_integration_demo.ipynb @@ -180,6 +180,7 @@ " api_key=\"...\",\n", " api_secret=\"...\",\n", " model=\"...\",\n", + " document=\"documentation\",\n", ")" ] }, @@ -485,7 +486,7 @@ }, { "cell_type": "markdown", - "id": "copyright-bc8fa92bf77149ecbc9fbce7b817acce", + "id": "copyright-66949a76c825474d92b8082056a7202d", "metadata": {}, "source": [ "\n", diff --git a/notebooks/code_sharing/insurance_mortality/insurance_validation_demo.ipynb b/notebooks/code_sharing/insurance_mortality/insurance_validation_demo.ipynb index 1c481e0a9..f01727d9d 100644 --- a/notebooks/code_sharing/insurance_mortality/insurance_validation_demo.ipynb +++ b/notebooks/code_sharing/insurance_mortality/insurance_validation_demo.ipynb @@ -170,6 +170,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"documentation\",\n", ")" ] 
}, @@ -500,7 +501,7 @@ }, { "cell_type": "markdown", - "id": "copyright-151aeb12b20c41ce951d6a1948ccfeac", + "id": "copyright-2442ef8075fe4d588c08432329f35b19", "metadata": {}, "source": [ "\n", diff --git a/notebooks/code_sharing/operational_deposit/operational_deposit_poc.ipynb b/notebooks/code_sharing/operational_deposit/operational_deposit_poc.ipynb index 8d0b912f2..b360d1eb6 100644 --- a/notebooks/code_sharing/operational_deposit/operational_deposit_poc.ipynb +++ b/notebooks/code_sharing/operational_deposit/operational_deposit_poc.ipynb @@ -221,6 +221,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"documentation\",\n", ")" ] }, @@ -1132,7 +1133,7 @@ }, { "cell_type": "markdown", - "id": "copyright-e2f10038a74449cba590e46511b2368c", + "id": "copyright-25518ea001774e8895e7a51fa032bfe1", "metadata": {}, "source": [ "\n", diff --git a/notebooks/code_sharing/output_templates/customizing_tests_with_output_templates.ipynb b/notebooks/code_sharing/output_templates/customizing_tests_with_output_templates.ipynb index 9c82dc439..7908df0e7 100644 --- a/notebooks/code_sharing/output_templates/customizing_tests_with_output_templates.ipynb +++ b/notebooks/code_sharing/output_templates/customizing_tests_with_output_templates.ipynb @@ -329,6 +329,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"documentation\",\n", ")" ] }, @@ -817,7 +818,7 @@ }, { "cell_type": "markdown", - "id": "copyright-3bef29e042fe42af9d5a90752889fef1", + "id": "copyright-9714145fb42f4245b8b0b51d499c007f", "metadata": {}, "source": [ "\n", diff --git a/notebooks/code_sharing/plots_and_stats_demo.ipynb b/notebooks/code_sharing/plots_and_stats_demo.ipynb index 8bc70d77e..8ebadbce7 100644 --- a/notebooks/code_sharing/plots_and_stats_demo.ipynb +++ b/notebooks/code_sharing/plots_and_stats_demo.ipynb @@ -134,6 +134,7 @@ " api_key=\"...\",\n", " api_secret=\"...\",\n", " model=\"...\",\n", + " document=\"documentation\",\n", ")" ] }, @@ -685,7 +686,7 @@ }, { "cell_type": "markdown", - "id": "copyright-45adba169fed412bb9ef9b4389e4ee1a", + "id": "copyright-47612ed922f3409fbd077461fc500bb9", "metadata": {}, "source": [ "\n", diff --git a/notebooks/code_sharing/regression/regression_unit_metrics.ipynb b/notebooks/code_sharing/regression/regression_unit_metrics.ipynb index 3d126563f..7a8dad352 100644 --- a/notebooks/code_sharing/regression/regression_unit_metrics.ipynb +++ b/notebooks/code_sharing/regression/regression_unit_metrics.ipynb @@ -206,6 +206,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"documentation\",\n", ")" ] }, @@ -759,7 +760,7 @@ }, { "cell_type": "markdown", - "id": "copyright-3ad307f0c325424782cdc45a37badc9c", + "id": "copyright-46bd32efda67419684f8a989076192f8", "metadata": {}, "source": [ "\n", diff --git a/notebooks/code_sharing/test_configuration_updates_demo.ipynb b/notebooks/code_sharing/test_configuration_updates_demo.ipynb index 528f4de99..d5952d8cd 100644 --- a/notebooks/code_sharing/test_configuration_updates_demo.ipynb +++ b/notebooks/code_sharing/test_configuration_updates_demo.ipynb @@ -22,6 +22,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"documentation\",\n", ")\n", "\n", "vm.preview_template()" @@ -56,7 +57,7 @@ }, { "cell_type": "markdown", - "id": "copyright-e9a1d468f954434aaf061db9e18b44f7", + "id": "copyright-abc8f066db8f4ec48a34965195899902", "metadata": {}, "source": [ "\n", diff --git 
a/notebooks/how_to/data_and_datasets/dataset_inputs/configure_dataset_features.ipynb b/notebooks/how_to/data_and_datasets/dataset_inputs/configure_dataset_features.ipynb index ddb9f6731..fc15f4d37 100644 --- a/notebooks/how_to/data_and_datasets/dataset_inputs/configure_dataset_features.ipynb +++ b/notebooks/how_to/data_and_datasets/dataset_inputs/configure_dataset_features.ipynb @@ -214,6 +214,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"documentation\",\n", ")" ] }, @@ -431,7 +432,7 @@ }, { "cell_type": "markdown", - "id": "copyright-77bd4876ab7945d6a1f592e432d81ca5", + "id": "copyright-32870f8bce7f4ed0903136a69d02b421", "metadata": {}, "source": [ "\n", diff --git a/notebooks/how_to/data_and_datasets/dataset_inputs/load_datasets_predictions.ipynb b/notebooks/how_to/data_and_datasets/dataset_inputs/load_datasets_predictions.ipynb index cf82a23c2..5353b326d 100644 --- a/notebooks/how_to/data_and_datasets/dataset_inputs/load_datasets_predictions.ipynb +++ b/notebooks/how_to/data_and_datasets/dataset_inputs/load_datasets_predictions.ipynb @@ -234,6 +234,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"documentation\",\n", ")" ] }, @@ -1016,7 +1017,7 @@ }, { "cell_type": "markdown", - "id": "copyright-0763ff57c8834b5e80d683d17186580e", + "id": "copyright-76fcd2c215674068b812492b7c639056", "metadata": {}, "source": [ "\n", diff --git a/notebooks/how_to/data_and_datasets/use_dataset_model_objects.ipynb b/notebooks/how_to/data_and_datasets/use_dataset_model_objects.ipynb index f7a2eb55f..28543ac9a 100644 --- a/notebooks/how_to/data_and_datasets/use_dataset_model_objects.ipynb +++ b/notebooks/how_to/data_and_datasets/use_dataset_model_objects.ipynb @@ -235,6 +235,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"documentation\",\n", ")" ] }, @@ -945,7 +946,7 @@ }, { "cell_type": "markdown", - "id": "copyright-6841287fee5e4319a84276ef23c34e1a", + "id": "copyright-51ea085ad91646c2b5d13175a2cfb89d", "metadata": {}, "source": [ "\n", diff --git a/notebooks/how_to/metrics/log_metrics_over_time.ipynb b/notebooks/how_to/metrics/log_metrics_over_time.ipynb index 52d0e2e31..31f9f1dcf 100644 --- a/notebooks/how_to/metrics/log_metrics_over_time.ipynb +++ b/notebooks/how_to/metrics/log_metrics_over_time.ipynb @@ -229,6 +229,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"documentation\",\n", ")" ] }, @@ -918,7 +919,7 @@ }, { "cell_type": "markdown", - "id": "copyright-2b6b06cfa3254951b8ebbc1178037888", + "id": "copyright-584966fafc334aec9585d8f880ddba0c", "metadata": {}, "source": [ "\n", diff --git a/notebooks/how_to/metrics/run_unit_metrics.ipynb b/notebooks/how_to/metrics/run_unit_metrics.ipynb index 0386439ef..3112b8a91 100644 --- a/notebooks/how_to/metrics/run_unit_metrics.ipynb +++ b/notebooks/how_to/metrics/run_unit_metrics.ipynb @@ -222,6 +222,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"documentation\",\n", ")" ] }, @@ -755,7 +756,7 @@ }, { "cell_type": "markdown", - "id": "copyright-238ef79bf0f045d9a112ffd540267e96", + "id": "copyright-5f5c77c06f204f53a1dfda94461b7aee", "metadata": {}, "source": [ "\n", diff --git a/notebooks/how_to/scoring/assign_scores_complete_tutorial.ipynb b/notebooks/how_to/scoring/assign_scores_complete_tutorial.ipynb index 933da518a..5dad199c0 100644 --- a/notebooks/how_to/scoring/assign_scores_complete_tutorial.ipynb +++ 
b/notebooks/how_to/scoring/assign_scores_complete_tutorial.ipynb @@ -224,6 +224,7 @@ " api_key=\"...\",\n", " api_secret=\"...\",\n", " model=\"...\",\n", + " document=\"documentation\",\n", ")\n" ] }, @@ -772,7 +773,7 @@ }, { "cell_type": "markdown", - "id": "copyright-25e9888fae40481b9007f6a2e2bdd60b", + "id": "copyright-412343d267194a01a369e790077492b0", "metadata": {}, "source": [ "\n", diff --git a/notebooks/how_to/tests/custom_tests/implement_custom_tests.ipynb b/notebooks/how_to/tests/custom_tests/implement_custom_tests.ipynb index 08cbb4f36..35b75b457 100644 --- a/notebooks/how_to/tests/custom_tests/implement_custom_tests.ipynb +++ b/notebooks/how_to/tests/custom_tests/implement_custom_tests.ipynb @@ -1,1094 +1,1095 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Implement custom tests\n", - "\n", - "Custom tests extend the functionality of ValidMind, allowing you to document any model or use case with added flexibility.\n", - "\n", - "ValidMind provides a comprehensive set of tests out-of-the-box to evaluate and document your models and datasets. We recognize there will be cases where the default tests do not support a model or dataset, or specific documentation is needed. In these cases, you can create and use your own custom code to accomplish what you need. To streamline custom code integration, we support the creation of custom test functions.\n", - "\n", - "This interactive notebook provides a step-by-step guide for implementing and registering custom tests with ValidMind, running them individually, viewing the results on the ValidMind Platform, and incorporating them into your model documentation template." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "::: {.content-hidden when-format=\"html\"}\n", - "## Contents \n", - "- [About ValidMind](#toc1__) \n", - " - [Before you begin](#toc1_1__) \n", - " - [New to ValidMind?](#toc1_2__) \n", - " - [Key concepts](#toc1_3__) \n", - "- [Setting up](#toc2__) \n", - " - [Install the ValidMind Library](#toc2_1__) \n", - " - [Initialize the ValidMind Library](#toc2_2__) \n", - " - [Register sample model](#toc2_2_1__) \n", - " - [Apply documentation template](#toc2_2_2__) \n", - " - [Get your code snippet](#toc2_2_3__) \n", - "- [Implement a Custom Test](#toc3__) \n", - "- [Run the Custom Test](#toc4__) \n", - " - [Setup the Model and Dataset](#toc4_1__) \n", - " - [Run the Custom Test](#toc4_2__) \n", - "- [Adding Custom Test to Model Documentation](#toc5__) \n", - "- [Some More Custom Tests](#toc6__) \n", - " - [Custom Test: Table of Model Hyperparameters](#toc6_1__) \n", - " - [Custom Test: External API Call](#toc6_2__) \n", - " - [Custom Test: Passing Parameters](#toc6_3__) \n", - " - [Custom Test: Multiple Tables and Plots in a Single Test](#toc6_4__) \n", - " - [Custom Test: Images](#toc6_5__) \n", - " - [Custom Test: Description](#toc6_6__) \n", - "- [Conclusion](#toc7__) \n", - "- [Next steps](#toc8__) \n", - " - [Work with your model documentation](#toc8_1__) \n", - " - [Discover more learning resources](#toc8_2__) \n", - "- [Upgrade ValidMind](#toc9__) \n", - "\n", - ":::\n", - "\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## About ValidMind\n", - "\n", - "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models.\n", - "\n", - "You use the ValidMind Library to automate documentation and validation tests, and then use the ValidMind Platform to collaborate 
on model documentation. Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators.\n", - "\n", - "\n", - "\n", - "### Before you begin\n", - "\n", - "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language. \n", - "\n", - "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html).\n", - "\n", - "\n", - "\n", - "### New to ValidMind?\n", - "\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", - "\n", - "
For access to all features available in this notebook, you'll need access to a ValidMind account.\n", - "

\n", - "Register with ValidMind
\n", - "\n", - "\n", - "\n", - "### Key concepts\n", - "\n", - "**Model documentation**: A structured and detailed record pertaining to a model, encompassing key components such as its underlying assumptions, methodologies, data sources, inputs, performance metrics, evaluations, limitations, and intended uses. It serves to ensure transparency, adherence to regulatory requirements, and a clear understanding of potential risks associated with the model’s application.\n", - "\n", - "**Documentation template**: Functions as a test suite and lays out the structure of model documentation, segmented into various sections and sub-sections. Documentation templates define the structure of your model documentation, specifying the tests that should be run, and how the results should be displayed.\n", - "\n", - "**Tests**: A function contained in the ValidMind Library, designed to run a specific quantitative test on the dataset or model. Tests are the building blocks of ValidMind, used to evaluate and document models and datasets, and can be run individually or as part of a suite defined by your model documentation template.\n", - "\n", - "**Custom tests**: Custom tests are functions that you define to evaluate your model or dataset. These functions can be registered via the ValidMind Library to be used with the ValidMind Platform.\n", - "\n", - "**Inputs**: Objects to be evaluated and documented in the ValidMind Library. They can be any of the following:\n", - "\n", - " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", - " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", - " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", - " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html) for more information.\n", - "\n", - "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", - "\n", - "**Outputs**: Custom test can return elements like tables or plots. Tables may be a list of dictionaries (each representing a row) or a pandas DataFrame. Plots may be matplotlib or plotly figures.\n", - "\n", - "**Test suites**: Collections of tests designed to run together to automate and generate model documentation end-to-end for specific use-cases.\n", - "\n", - "Example: the [`classifier_full_suite`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html#ClassifierFullSuite) test suite runs tests from the [`tabular_dataset`](https://docs.validmind.ai/validmind/validmind/test_suites/tabular_datasets.html) and [`classifier`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html) test suites to fully document the data and model sections for binary classification model use-cases." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Setting up" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Install the ValidMind Library\n", - "\n", - "To install the library:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -q validmind" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Initialize the ValidMind Library" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Register sample model\n", - "\n", - "Let's first register a sample model for use with this notebook:\n", - "\n", - "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", - "\n", - "2. In the left sidebar, navigate to **Inventory** and click **+ Register Model**.\n", - "\n", - "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", - "\n", - "4. Select your own name under the **MODEL OWNER** drop-down.\n", - "\n", - "5. Click **Register Model** to add the model to your inventory." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Apply documentation template\n", - "\n", - "Once you've registered your model, let's select a documentation template. A template predefines sections for your model documentation and provides a general outline to follow, making the documentation process much easier.\n", - "\n", - "1. In the left sidebar that appears for your model, click **Documents** and select **Documentation**.\n", - "\n", - "2. Under **TEMPLATE**, select `Binary classification`.\n", - "\n", - "3. Click **Use Template** to apply the template." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Get your code snippet\n", - "\n", - "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", - "\n", - "1. On the left sidebar that appears for your model, select **Getting Started** and click **Copy snippet to clipboard**.\n", - "2. Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Load your model identifier credentials from an `.env` file\n", - "\n", - "%load_ext dotenv\n", - "%dotenv .env\n", - "\n", - "# Or replace with your code snippet\n", - "\n", - "import validmind as vm\n", - "\n", - "vm.init(\n", - " # api_host=\"...\",\n", - " # api_key=\"...\",\n", - " # api_secret=\"...\",\n", - " # model=\"...\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Implement a Custom Test\n", - "\n", - "Let's start off by creating a simple custom test that creates a Confusion Matrix for a binary classification model. 
We will use the `sklearn.metrics.confusion_matrix` function to calculate the confusion matrix and then display it as a heatmap using `plotly`. (This is already a built-in test in ValidMind, but we will use it as an example to demonstrate how to create custom tests.)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "from sklearn import metrics\n", - "\n", - "\n", - "@vm.test(\"my_custom_tests.ConfusionMatrix\")\n", - "def confusion_matrix(dataset, model):\n", - " \"\"\"The confusion matrix is a table that is often used to describe the performance of a classification model on a set of data for which the true values are known.\n", - "\n", - " The confusion matrix is a 2x2 table that contains 4 values:\n", - "\n", - " - True Positive (TP): the number of correct positive predictions\n", - " - True Negative (TN): the number of correct negative predictions\n", - " - False Positive (FP): the number of incorrect positive predictions\n", - " - False Negative (FN): the number of incorrect negative predictions\n", - "\n", - " The confusion matrix can be used to assess the holistic performance of a classification model by showing the accuracy, precision, recall, and F1 score of the model on a single figure.\n", - " \"\"\"\n", - " y_true = dataset.y\n", - " y_pred = dataset.y_pred(model)\n", - "\n", - " confusion_matrix = metrics.confusion_matrix(y_true, y_pred)\n", - "\n", - " cm_display = metrics.ConfusionMatrixDisplay(\n", - " confusion_matrix=confusion_matrix, display_labels=[False, True]\n", - " )\n", - " cm_display.plot()\n", - "\n", - " plt.close() # close the plot to avoid displaying it\n", - "\n", - " return cm_display.figure_ # return the figure object itself" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Thats our custom test defined and ready to go... Let's take a look at whats going on here:\n", - "\n", - "- The function `confusion_matrix` takes two arguments `dataset` and `model`. This is a VMDataset and VMModel object respectively.\n", - "- The function docstring provides a description of what the test does. This will be displayed along with the result in this notebook as well as in the ValidMind Platform.\n", - "- The function body calculates the confusion matrix using the `sklearn.metrics.confusion_matrix` function and then plots it using `sklearn.metric.ConfusionMatrixDisplay`.\n", - "- The function then returns the `ConfusionMatrixDisplay.figure_` object - this is important as the ValidMind Library expects the output of the custom test to be a plot or a table.\n", - "- The `@vm.test` decorator is doing the work of creating a wrapper around the function that will allow it to be run by the ValidMind Library. It also registers the test so it can be found by the ID `my_custom_tests.ConfusionMatrix` (see the section below on how test IDs work in ValidMind and why this format is important)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Run the Custom Test\n", - "\n", - "Now that we have defined and registered our custom test, lets see how we can run it and properly use it in the ValidMind Platform." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Setup the Model and Dataset\n", - "\n", - "First let's setup a an example model and dataset to run our custom metic against. 
Since this is a Confusion Matrix, we will use the Customer Churn dataset that ValidMind provides and train a simple XGBoost model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import xgboost as xgb\n", - "from validmind.datasets.classification import customer_churn\n", - "\n", - "raw_df = customer_churn.load_data()\n", - "train_df, validation_df, test_df = customer_churn.preprocess(raw_df)\n", - "\n", - "x_train = train_df.drop(customer_churn.target_column, axis=1)\n", - "y_train = train_df[customer_churn.target_column]\n", - "x_val = validation_df.drop(customer_churn.target_column, axis=1)\n", - "y_val = validation_df[customer_churn.target_column]\n", - "\n", - "model = xgb.XGBClassifier(early_stopping_rounds=10)\n", - "model.set_params(\n", - " eval_metric=[\"error\", \"logloss\", \"auc\"],\n", - ")\n", - "model.fit(\n", - " x_train,\n", - " y_train,\n", - " eval_set=[(x_val, y_val)],\n", - " verbose=False,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Easy enough! Now we have a model and dataset setup and trained. One last thing to do is bring the dataset and model into the ValidMind Library:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# for now, we'll just use the test dataset\n", - "vm_test_ds = vm.init_dataset(\n", - " dataset=test_df,\n", - " target_column=customer_churn.target_column,\n", - " input_id=\"test_dataset\",\n", - ")\n", - "\n", - "vm_model = vm.init_model(model, input_id=\"model\")\n", - "\n", - "# link the model to the dataset\n", - "vm_test_ds.assign_predictions(model=vm_model)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Run the Custom Test\n", - "\n", - "Now that we have our model and dataset setup, we have everything we need to run our custom test. We can do this by importing the `run_test` function from the `validmind.tests` module and passing in the test ID of our custom test along with the model and dataset we want to run it against.\n", - "\n", - ">Notice how the `inputs` dictionary is used to map an `input_id` which we set above to the `model` and `dataset` keys that are expected by our custom test function. This is how the ValidMind Library knows which inputs to pass to different tests and is key when using many different datasets and models." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from validmind.tests import run_test\n", - "\n", - "result = run_test(\n", - " \"my_custom_tests.ConfusionMatrix\",\n", - " inputs={\"model\": \"model\", \"dataset\": \"test_dataset\"},\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You'll notice that the docstring becomes a markdown description of the test. The figure is then displayed as the test result. What you see above is how it will look in the ValidMind Platform as well. Let's go ahead and log the result to see how that works." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result.log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Adding Custom Test to Model Documentation\n", - "\n", - "To do this, go to the documentation page of the model you registered above and navigate to the `Model Development` -> `Model Evaluation` section. 
Then hover between any existing content block to reveal the `+` button as shown in the screenshot below.\n", - "\n", - "![screenshot showing insert button for test-driven blocks](./insert-test-driven-block.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now click on the `+` button and select the `Test-Driven Block` option. This will open a dialog where you can select `My Custom Tests Confusion Matrix` from the list of available tests. You can preview the result and then click `Insert Block` to add it to the documentation.\n", - "\n", - "![screenshot showing how to insert a test-driven block](./insert-test-driven-block-custom.png)\n", - "\n", - "The test should match the result you see above. It is now part of your documentation and will now be run everytime you run `vm.run_documentation_tests()` for your model. Let's do that now." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm.reload()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you preview the template, it should show the custom test in the `Model Development`->`Model Evaluation` section:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm.preview_template()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Just so we can run all of the tests in the template, let's initialize the train and raw dataset.\n", - "\n", - "(Refer to [**Quickstart for model documentation**](../../../quickstart/quickstart_model_documentation.ipynb) and the ValidMind docs for more information on what we are doing here)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_raw_dataset = vm.init_dataset(\n", - " dataset=raw_df,\n", - " input_id=\"raw_dataset\",\n", - " target_column=customer_churn.target_column,\n", - " class_labels=customer_churn.class_labels,\n", - ")\n", - "\n", - "vm_train_ds = vm.init_dataset(\n", - " dataset=train_df,\n", - " input_id=\"train_dataset\",\n", - " target_column=customer_churn.target_column,\n", - ")\n", - "vm_train_ds.assign_predictions(model=vm_model)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To run all the tests in the template, you can use the `vm.run_documentation_tests()` and pass the inputs we initialized above and the demo config from our customer_churn module. We will have to add a section to the config for our new test to tell it which inputs it should receive. 
This is done by simply adding a new element in the config dictionary where the key is the ID of the test and the value is a dictionary with the following structure:\n", - "```python\n", - "{\n", - " \"inputs\": {\n", - " \"model\": \"test_dataset\",\n", - " \"dataset\": \"model\",\n", - " }\n", - "}\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from validmind.utils import preview_test_config\n", - "\n", - "test_config = customer_churn.get_demo_test_config()\n", - "test_config[\"my_custom_tests.ConfusionMatrix\"] = {\n", - " \"inputs\": {\n", - " \"dataset\": \"test_dataset\",\n", - " \"model\": \"model\",\n", - " }\n", - "}\n", - "preview_test_config(test_config)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "full_suite = vm.run_documentation_tests(config=test_config)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Some More Custom Tests\n", - "\n", - "Now that you understand the entire process of creating custom tests and using them in your documentation, let's create a few more to see different ways you can utilize custom tests." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Custom Test: Table of Model Hyperparameters\n", - "\n", - "This custom test will display a table of the hyperparameters used in the model:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "@vm.test(\"my_custom_tests.Hyperparameters\")\n", - "def hyperparameters(model):\n", - " \"\"\"The hyperparameters of a machine learning model are the settings that control the learning process.\n", - " These settings are specified before the learning process begins and can have a significant impact on the\n", - " performance of the model.\n", - "\n", - " The hyperparameters of a model can be used to tune the model to achieve the best possible performance\n", - " on a given dataset. By examining the hyperparameters of a model, you can gain insight into how the model\n", - " was trained and how it might be improved.\n", - " \"\"\"\n", - " hyperparameters = model.model.get_xgb_params() # dictionary of hyperparameters\n", - "\n", - " # turn the dictionary into a table where each row contains a hyperparameter and its value\n", - " return [{\"Hyperparam\": k, \"Value\": v} for k, v in hyperparameters.items() if v]\n", - "\n", - "\n", - "result = run_test(\"my_custom_tests.Hyperparameters\", inputs={\"model\": \"model\"})\n", - "result.log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Since the test has been run and logged, you can add it to your documentation using the same process as above. It should look like this:\n", - "\n", - "![screenshot showing hyperparameters test](./hyperparameters-custom-metric.png)\n", - "\n", - "For our simple toy model, there are aren't really any proper hyperparameters but you can see how this could be useful for more complex models that have gone through hyperparameter tuning." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Custom Test: External API Call\n", - "\n", - "This custom test will make an external API call to get the current BTC price and display it as a table. This demonstrates how you might integrate external data sources into your model documentation in a programmatic way. 
You could, for instance, setup a pipeline that runs a test like this every day to keep your model documentation in sync with an external system." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import requests\n", - "import random\n", - "\n", - "\n", - "@vm.test(\"my_custom_tests.ExternalAPI\")\n", - "def external_api():\n", - " \"\"\"This test calls an external API to get a list of fake users. It then creates\n", - " a table with the relevant data so it can be displayed in the documentation.\n", - "\n", - " The purpose of this test is to demonstrate how to call an external API and use the\n", - " data in a test. A test like this could even be setup to run in a scheduled\n", - " pipeline to keep your documentation in-sync with an external data source.\n", - " \"\"\"\n", - " url = \"https://jsonplaceholder.typicode.com/users\"\n", - " response = requests.get(url)\n", - " data = response.json()\n", - "\n", - " # extract the time and the current BTC price in USD\n", - " return {\n", - " \"Model Owners/Stakeholders\": [\n", - " {\n", - " \"Name\": user[\"name\"],\n", - " \"Role\": random.choice([\"Owner\", \"Stakeholder\"]),\n", - " \"Email\": user[\"email\"],\n", - " \"Phone\": user[\"phone\"],\n", - " \"Slack Handle\": f\"@{user['name'].lower().replace(' ', '.')}\",\n", - " }\n", - " for user in data[:3]\n", - " ],\n", - " \"Model Developers\": [\n", - " {\n", - " \"Name\": user[\"name\"],\n", - " \"Role\": \"Developer\",\n", - " \"Email\": user[\"email\"],\n", - " }\n", - " for user in data[3:7]\n", - " ],\n", - " \"Model Validators\": [\n", - " {\n", - " \"Name\": user[\"name\"],\n", - " \"Role\": \"Validator\",\n", - " \"Email\": user[\"email\"],\n", - " }\n", - " for user in data[7:]\n", - " ],\n", - " }\n", - "\n", - "\n", - "result = run_test(\"my_custom_tests.ExternalAPI\")\n", - "result.log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Again, you can add this to your documentation to see how it looks:\n", - "\n", - "![screenshot showing BTC price metric](./external-data-custom-test.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Custom Test: Passing Parameters\n", - "\n", - "Custom test functions, as stated earlier, can take both inputs and params. When you define your function there is no need to distinguish between the two, the ValidMind Library will handle that for you. 
You simply need to add both to the function as arguments and the library will pass in the correct values.\n", - "\n", - "So for instance, if you wanted to parameterize the first custom test we created, the confusion matrix, you could do so like this:\n", - "\n", - "```python\n", - "def confusion_matrix(dataset: VMDataset, model: VMModel, my_param: str = \"Default Value\"):\n", - " pass\n", - "```\n", - "\n", - "And then when you run the test, you can pass in the parameter like this:\n", - "\n", - "```python\n", - "vm.run_test(\n", - " \"my_custom_tests.ConfusionMatrix\",\n", - " inputs={\"model\": \"model\", \"dataset\": \"test_dataset\"},\n", - " params={\"my_param\": \"My Value\"},\n", - ")\n", - "```\n", - "\n", - "Or if you are running the entire documentation template, you would update the config like this:\n", - "\n", - "```python\n", - "test_config[\"my_custom_tests.ConfusionMatrix\"] = {\n", - " \"inputs\": {\n", - " \"dataset\": \"test_dataset\",\n", - " \"model\": \"model\",\n", - " },\n", - " \"params\": {\n", - " \"my_param\": \"My Value\",\n", - " },\n", - "}\n", - "```\n", - "\n", - "Let's go ahead and create a toy test that takes a parameter and uses it in the result:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import plotly.express as px\n", - "\n", - "\n", - "@vm.test(\"my_custom_tests.ParameterExample\")\n", - "def parameter_example(\n", - " plot_title=\"Default Plot Title\", x_col=\"sepal_width\", y_col=\"sepal_length\"\n", - "):\n", - " \"\"\"This test takes two parameters and creates a scatter plot based on them.\n", - "\n", - " The purpose of this test is to demonstrate how to create a test that takes\n", - " parameters and uses them to generate a plot. This can be useful for creating\n", - " tests that are more flexible and can be used in a variety of scenarios.\n", - " \"\"\"\n", - " # return px.scatter(px.data.iris(), x=x_col, y=y_col, color=\"species\")\n", - " return px.scatter(\n", - " px.data.iris(), x=x_col, y=y_col, color=\"species\", title=plot_title\n", - " )\n", - "\n", - "\n", - "result = run_test(\n", - " \"my_custom_tests.ParameterExample\",\n", - " params={\n", - " \"plot_title\": \"My Cool Plot\",\n", - " \"x_col\": \"sepal_width\",\n", - " \"y_col\": \"sepal_length\",\n", - " },\n", - ")\n", - "result.log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Play around with this and see how you can use parameters, default values and other features to make your custom tests more flexible and useful.\n", - "\n", - "Here's how this one looks in the documentation:\n", - "![screenshot showing parameterized test](./parameterized-custom-metric.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Custom Test: Multiple Tables and Plots in a Single Test\n", - "\n", - "Custom test functions, as stated earlier, can return more than just one table or plot. In fact, any number of tables and plots can be returned. 
Let's see an example of this:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import plotly.express as px\n", - "\n", - "\n", - "@vm.test(\"my_custom_tests.ComplexOutput\")\n", - "def complex_output():\n", - " \"\"\"This test demonstrates how to return many tables and figures in a single test\"\"\"\n", - " # create a couple tables\n", - " table = [{\"A\": 1, \"B\": 2}, {\"A\": 3, \"B\": 4}]\n", - " table2 = [{\"C\": 5, \"D\": 6}, {\"C\": 7, \"D\": 8}]\n", - "\n", - " # create a few figures showing some random data\n", - " fig1 = px.line(x=np.arange(10), y=np.random.rand(10), title=\"Random Line Plot\")\n", - " fig2 = px.bar(x=[\"A\", \"B\", \"C\"], y=np.random.rand(3), title=\"Random Bar Plot\")\n", - " fig3 = px.scatter(\n", - " x=np.random.rand(10), y=np.random.rand(10), title=\"Random Scatter Plot\"\n", - " )\n", - "\n", - " return (\n", - " {\n", - " \"My Cool Table\": table,\n", - " \"Another Table\": table2,\n", - " },\n", - " fig1,\n", - " fig2,\n", - " fig3,\n", - " )\n", - "\n", - "\n", - "result = run_test(\"my_custom_tests.ComplexOutput\")\n", - "result.log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Notice how you can return the tables as a dictionary where the key is the title of the table and the value is the table itself. You could also just return the tables by themselves but this way you can give them a title to more easily identify them in the result.\n", - "\n", - "![screenshot showing multiple tables and plots](./multiple-tables-plots-custom-metric.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Custom Test: Images\n", - "\n", - "If you are using a plotting library that isn't supported by ValidMind (i.e. not `matplotlib` or `plotly`), you can still return the image directly as a bytes-like object. This could also be used to bring any type of image into your documentation in a programmatic way. For instance, you may want to include a diagram of your model architecture or a screenshot of a dashboard that your model is integrated with. As long as you can produce the image with Python or open it from a file, you can include it in your documentation." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import io\n", - "import matplotlib.pyplot as plt\n", - "\n", - "\n", - "@vm.test(\"my_custom_tests.Image\")\n", - "def image():\n", - " \"\"\"This test demonstrates how to return an image in a test\"\"\"\n", - "\n", - " # create a simple plot\n", - " fig, ax = plt.subplots()\n", - " ax.plot([1, 2, 3, 4])\n", - " ax.set_title(\"Simple Line Plot\")\n", - "\n", - " # save the plot as a PNG image (in-memory buffer)\n", - " img_data = io.BytesIO()\n", - " fig.savefig(img_data, format=\"png\")\n", - " img_data.seek(0)\n", - "\n", - " plt.close() # close the plot to avoid displaying it\n", - "\n", - " return img_data.read()\n", - "\n", - "\n", - "result = run_test(\"my_custom_tests.Image\")\n", - "result.log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Adding this custom test to your documentation will display the image:\n", - "\n", - "![screenshot showing image custom test](./image-in-custom-metric.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you want to log an image as a test result, you can do so by passing the path to the image as a parameter to the custom test and then opening the file in the test function. Here's an example:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "@vm.test(\"my_custom_tests.MyPNGCorrelationMatrix\")\n", - "def Image(path: str):\n", - " \"\"\"Opens a png image file and logs it as a test result to ValidMind\"\"\"\n", - " if not path.endswith(\".png\"):\n", - " raise ValueError(\"Image must be a PNG file\")\n", - "\n", - " # return raw image bytes\n", - " with open(path, \"rb\") as f:\n", - " return f.read()\n", - " \n", - "run_test(\n", - " \"my_custom_tests.MyPNGCorrelationMatrix\",\n", - " params={\"path\": \"./pearson-correlation-matrix.png\"},\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The image is displayed in the test result:\n", - "\n", - "![screenshot showing image from file](./pearson-correlation-matrix-test-output.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Custom Test: Description\n", - "\n", - "If you want to write a custom test description for your custom test instead of it is interpreted through llm, you can do so by returning string in your test." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "@vm.test(\"my_custom_tests.MyCustomTest\")\n", - "def my_custom_test(dataset, model):\n", - " \"\"\"\n", - " This is a custom computed test that computes confusion matrix for a binary classification model and return a string as a test description.\n", - " \"\"\"\n", - " y_true = dataset.y\n", - " y_pred = dataset.y_pred(model)\n", - "\n", - " confusion_matrix = metrics.confusion_matrix(y_true, y_pred)\n", - "\n", - " cm_display = metrics.ConfusionMatrixDisplay(\n", - " confusion_matrix=confusion_matrix, display_labels=[False, True]\n", - " )\n", - " cm_display.plot()\n", - "\n", - " plt.close() # close the plot to avoid displaying it\n", - "\n", - " return cm_display.figure_, \"Test Description - Confusion Matrix\", pd.DataFrame({\"Value\": [1, 2, 3]}) # return the figure object itself\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can see here test result description has been customized here. The same result description will be displayed in the UI." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result = run_test(\n", - " \"my_custom_tests.MyCustomTest\",\n", - " inputs={\"model\": \"model\", \"dataset\": \"test_dataset\"},\n", - ")\n", - "result.log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Conclusion\n", - "\n", - "In this notebook, we have demonstrated how to create custom tests in ValidMind. We have shown how to define custom test functions, register them with the ValidMind Library, run them against models and datasets, and add them to model documentation templates. We have also shown how to return tables and plots from custom tests and how to use them in the ValidMind Platform. We hope this tutorial has been helpful in understanding how to create and use custom tests in ValidMind." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Next steps\n", - "\n", - "You can look at the results of this test suite right in the notebook where you ran the code, as you would expect. But there is a better way — use the ValidMind Platform to work with your model documentation.\n", - "\n", - "\n", - "\n", - "### Work with your model documentation\n", - "\n", - "1. From the **Model Inventory** in the ValidMind Platform, go to the model you registered earlier. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/working-with-model-inventory.html))\n", - "\n", - "2. Click and expand the **Model Development** section.\n", - "\n", - "What you see is the full draft of your model documentation in a more easily consumable version. From here, you can make qualitative edits to model documentation, view guidelines, collaborate with validators, and submit your model documentation for approval when it's ready. 
[Learn more ...](https://docs.validmind.ai/guide/model-documentation/working-with-model-documentation.html)\n", - "\n", - "\n", - "\n", - "### Discover more learning resources\n", - "\n", - "We offer many interactive notebooks to help you automate testing, documenting, validating, and more:\n", - "\n", - "- [Run tests & test suites](https://docs.validmind.ai/developer/how-to/testing-overview.html)\n", - "- [Use ValidMind Library features](https://docs.validmind.ai/developer/how-to/feature-overview.html)\n", - "- [Code samples by use case](https://docs.validmind.ai/guide/samples-jupyter-notebooks.html)\n", - "\n", - "Or, visit our [documentation](https://docs.validmind.ai/) to learn more about ValidMind." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Upgrade ValidMind\n", - "\n", - "
After installing ValidMind, you’ll want to periodically make sure you are on the latest version to access any new features and other enhancements.
\n", - "\n", - "Retrieve the information for the currently installed version of ValidMind:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip show validmind" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If the version returned is lower than the version indicated in our [production open-source code](https://github.com/validmind/validmind-library/blob/prod/validmind/__version__.py), restart your notebook and run:\n", - "\n", - "```bash\n", - "%pip install --upgrade validmind\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You may need to restart your kernel after running the upgrade package for changes to be applied." - ] - }, - { - "cell_type": "markdown", - "id": "copyright-997b933948594ddd929ee9419957dfe3", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "***\n", - "\n", - "Copyright © 2023-2026 ValidMind Inc. All rights reserved.
\n", - "Refer to [LICENSE](https://github.com/validmind/validmind-library/blob/main/LICENSE) for details.
\n", - "SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - } + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Implement custom tests\n", + "\n", + "Custom tests extend the functionality of ValidMind, allowing you to document any model or use case with added flexibility.\n", + "\n", + "ValidMind provides a comprehensive set of tests out-of-the-box to evaluate and document your models and datasets. We recognize there will be cases where the default tests do not support a model or dataset, or specific documentation is needed. In these cases, you can create and use your own custom code to accomplish what you need. To streamline custom code integration, we support the creation of custom test functions.\n", + "\n", + "This interactive notebook provides a step-by-step guide for implementing and registering custom tests with ValidMind, running them individually, viewing the results on the ValidMind Platform, and incorporating them into your model documentation template." + ] }, - "nbformat": 4, - "nbformat_minor": 4 + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "::: {.content-hidden when-format=\"html\"}\n", + "## Contents \n", + "- [About ValidMind](#toc1__) \n", + " - [Before you begin](#toc1_1__) \n", + " - [New to ValidMind?](#toc1_2__) \n", + " - [Key concepts](#toc1_3__) \n", + "- [Setting up](#toc2__) \n", + " - [Install the ValidMind Library](#toc2_1__) \n", + " - [Initialize the ValidMind Library](#toc2_2__) \n", + " - [Register sample model](#toc2_2_1__) \n", + " - [Apply documentation template](#toc2_2_2__) \n", + " - [Get your code snippet](#toc2_2_3__) \n", + "- [Implement a Custom Test](#toc3__) \n", + "- [Run the Custom Test](#toc4__) \n", + " - [Setup the Model and Dataset](#toc4_1__) \n", + " - [Run the Custom Test](#toc4_2__) \n", + "- [Adding Custom Test to Model Documentation](#toc5__) \n", + "- [Some More Custom Tests](#toc6__) \n", + " - [Custom Test: Table of Model Hyperparameters](#toc6_1__) \n", + " - [Custom Test: External API Call](#toc6_2__) \n", + " - [Custom Test: Passing Parameters](#toc6_3__) \n", + " - [Custom Test: Multiple Tables and Plots in a Single Test](#toc6_4__) \n", + " - [Custom Test: Images](#toc6_5__) \n", + " - [Custom Test: Description](#toc6_6__) \n", + "- [Conclusion](#toc7__) \n", + "- [Next steps](#toc8__) \n", + " - [Work with your model documentation](#toc8_1__) \n", + " - [Discover more learning resources](#toc8_2__) \n", + "- [Upgrade ValidMind](#toc9__) \n", + "\n", + ":::\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## About ValidMind\n", + "\n", + "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models.\n", + "\n", + "You use the ValidMind Library to automate documentation and validation tests, and then use the ValidMind Platform to collaborate on model documentation. 
Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators.\n", + "\n", + "\n", + "\n", + "### Before you begin\n", + "\n", + "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language. \n", + "\n", + "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html).\n", + "\n", + "\n", + "\n", + "### New to ValidMind?\n", + "\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", + "\n", + "
To access all features available in this notebook, you'll need a ValidMind account.\n", + "

\n", + "Register with ValidMind
\n", + "\n", + "\n", + "\n", + "### Key concepts\n", + "\n", + "**Model documentation**: A structured and detailed record pertaining to a model, encompassing key components such as its underlying assumptions, methodologies, data sources, inputs, performance metrics, evaluations, limitations, and intended uses. It serves to ensure transparency, adherence to regulatory requirements, and a clear understanding of potential risks associated with the model’s application.\n", + "\n", + "**Documentation template**: Functions as a test suite and lays out the structure of model documentation, segmented into various sections and sub-sections. Documentation templates define the structure of your model documentation, specifying the tests that should be run, and how the results should be displayed.\n", + "\n", + "**Tests**: A function contained in the ValidMind Library, designed to run a specific quantitative test on the dataset or model. Tests are the building blocks of ValidMind, used to evaluate and document models and datasets, and can be run individually or as part of a suite defined by your model documentation template.\n", + "\n", + "**Custom tests**: Custom tests are functions that you define to evaluate your model or dataset. These functions can be registered via the ValidMind Library to be used with the ValidMind Platform.\n", + "\n", + "**Inputs**: Objects to be evaluated and documented in the ValidMind Library. They can be any of the following:\n", + "\n", + " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", + " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", + " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", + " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html) for more information.\n", + "\n", + "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", + "\n", + "**Outputs**: Custom test can return elements like tables or plots. Tables may be a list of dictionaries (each representing a row) or a pandas DataFrame. Plots may be matplotlib or plotly figures.\n", + "\n", + "**Test suites**: Collections of tests designed to run together to automate and generate model documentation end-to-end for specific use-cases.\n", + "\n", + "Example: the [`classifier_full_suite`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html#ClassifierFullSuite) test suite runs tests from the [`tabular_dataset`](https://docs.validmind.ai/validmind/validmind/test_suites/tabular_datasets.html) and [`classifier`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html) test suites to fully document the data and model sections for binary classification model use-cases." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Setting up" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Install the ValidMind Library\n", + "\n", + "To install the library:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q validmind" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Initialize the ValidMind Library" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Register sample model\n", + "\n", + "Let's first register a sample model for use with this notebook:\n", + "\n", + "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", + "\n", + "2. In the left sidebar, navigate to **Inventory** and click **+ Register Model**.\n", + "\n", + "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", + "\n", + "4. Select your own name under the **MODEL OWNER** drop-down.\n", + "\n", + "5. Click **Register Model** to add the model to your inventory." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Apply documentation template\n", + "\n", + "Once you've registered your model, let's select a documentation template. A template predefines sections for your model documentation and provides a general outline to follow, making the documentation process much easier.\n", + "\n", + "1. In the left sidebar that appears for your model, click **Documents** and select **Documentation**.\n", + "\n", + "2. Under **TEMPLATE**, select `Binary classification`.\n", + "\n", + "3. Click **Use Template** to apply the template." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Get your code snippet\n", + "\n", + "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", + "\n", + "1. On the left sidebar that appears for your model, select **Getting Started** and click **Copy snippet to clipboard**.\n", + "2. Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load your model identifier credentials from an `.env` file\n", + "\n", + "%load_ext dotenv\n", + "%dotenv .env\n", + "\n", + "# Or replace with your code snippet\n", + "\n", + "import validmind as vm\n", + "\n", + "vm.init(\n", + " # api_host=\"...\",\n", + " # api_key=\"...\",\n", + " # api_secret=\"...\",\n", + " # model=\"...\",\n", + " document=\"documentation\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Implement a Custom Test\n", + "\n", + "Let's start off by creating a simple custom test that creates a Confusion Matrix for a binary classification model. 
We will use the `sklearn.metrics.confusion_matrix` function to calculate the confusion matrix and then display it using `matplotlib`. (This is already a built-in test in ValidMind, but we will use it as an example to demonstrate how to create custom tests.)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "from sklearn import metrics\n", + "\n", + "\n", + "@vm.test(\"my_custom_tests.ConfusionMatrix\")\n", + "def confusion_matrix(dataset, model):\n", + " \"\"\"The confusion matrix is a table that is often used to describe the performance of a classification model on a set of data for which the true values are known.\n", + "\n", + " The confusion matrix is a 2x2 table that contains 4 values:\n", + "\n", + " - True Positive (TP): the number of correct positive predictions\n", + " - True Negative (TN): the number of correct negative predictions\n", + " - False Positive (FP): the number of incorrect positive predictions\n", + " - False Negative (FN): the number of incorrect negative predictions\n", + "\n", + " The confusion matrix can be used to assess the holistic performance of a classification model by showing the accuracy, precision, recall, and F1 score of the model on a single figure.\n", + " \"\"\"\n", + " y_true = dataset.y\n", + " y_pred = dataset.y_pred(model)\n", + "\n", + " confusion_matrix = metrics.confusion_matrix(y_true, y_pred)\n", + "\n", + " cm_display = metrics.ConfusionMatrixDisplay(\n", + " confusion_matrix=confusion_matrix, display_labels=[False, True]\n", + " )\n", + " cm_display.plot()\n", + "\n", + " plt.close() # close the plot to avoid displaying it\n", + "\n", + " return cm_display.figure_ # return the figure object itself" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's our custom test defined and ready to go... Let's take a look at what's going on here:\n", + "\n", + "- The function `confusion_matrix` takes two arguments `dataset` and `model`. These are a VMDataset and a VMModel object, respectively.\n", + "- The function docstring provides a description of what the test does. This will be displayed along with the result in this notebook as well as in the ValidMind Platform.\n", + "- The function body calculates the confusion matrix using the `sklearn.metrics.confusion_matrix` function and then plots it using `sklearn.metrics.ConfusionMatrixDisplay`.\n", + "- The function then returns the `ConfusionMatrixDisplay.figure_` object - this is important as the ValidMind Library expects the output of the custom test to be a plot or a table.\n", + "- The `@vm.test` decorator is doing the work of creating a wrapper around the function that will allow it to be run by the ValidMind Library. It also registers the test so it can be found by the ID `my_custom_tests.ConfusionMatrix` (see the section below on how test IDs work in ValidMind and why this format is important)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Run the Custom Test\n", + "\n", + "Now that we have defined and registered our custom test, let's see how we can run it and properly use it in the ValidMind Platform." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Set Up the Model and Dataset\n", + "\n", + "First, let's set up an example model and dataset to run our custom test against. 
Since this is a Confusion Matrix, we will use the Customer Churn dataset that ValidMind provides and train a simple XGBoost model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import xgboost as xgb\n", + "from validmind.datasets.classification import customer_churn\n", + "\n", + "raw_df = customer_churn.load_data()\n", + "train_df, validation_df, test_df = customer_churn.preprocess(raw_df)\n", + "\n", + "x_train = train_df.drop(customer_churn.target_column, axis=1)\n", + "y_train = train_df[customer_churn.target_column]\n", + "x_val = validation_df.drop(customer_churn.target_column, axis=1)\n", + "y_val = validation_df[customer_churn.target_column]\n", + "\n", + "model = xgb.XGBClassifier(early_stopping_rounds=10)\n", + "model.set_params(\n", + " eval_metric=[\"error\", \"logloss\", \"auc\"],\n", + ")\n", + "model.fit(\n", + " x_train,\n", + " y_train,\n", + " eval_set=[(x_val, y_val)],\n", + " verbose=False,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Easy enough! Now we have a model and dataset set up and trained. One last thing to do is bring the dataset and model into the ValidMind Library:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# for now, we'll just use the test dataset\n", + "vm_test_ds = vm.init_dataset(\n", + " dataset=test_df,\n", + " target_column=customer_churn.target_column,\n", + " input_id=\"test_dataset\",\n", + ")\n", + "\n", + "vm_model = vm.init_model(model, input_id=\"model\")\n", + "\n", + "# link the model to the dataset\n", + "vm_test_ds.assign_predictions(model=vm_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Run the Custom Test\n", + "\n", + "Now that we have our model and dataset set up, we have everything we need to run our custom test. We can do this by importing the `run_test` function from the `validmind.tests` module and passing in the test ID of our custom test along with the model and dataset we want to run it against.\n", + "\n", + ">Notice how the `inputs` dictionary is used to map an `input_id` which we set above to the `model` and `dataset` keys that are expected by our custom test function. This is how the ValidMind Library knows which inputs to pass to different tests and is key when using many different datasets and models." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from validmind.tests import run_test\n", + "\n", + "result = run_test(\n", + " \"my_custom_tests.ConfusionMatrix\",\n", + " inputs={\"model\": \"model\", \"dataset\": \"test_dataset\"},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You'll notice that the docstring becomes a markdown description of the test. The figure is then displayed as the test result. What you see above is how it will look in the ValidMind Platform as well. Let's go ahead and log the result to see how that works." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result.log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Adding the Custom Test to Model Documentation\n", + "\n", + "To do this, go to the documentation page of the model you registered above and navigate to the `Model Development` -> `Model Evaluation` section. 
Then hover between any existing content blocks to reveal the `+` button as shown in the screenshot below.\n", + "\n", + "![screenshot showing insert button for test-driven blocks](./insert-test-driven-block.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now click on the `+` button and select the `Test-Driven Block` option. This will open a dialog where you can select `My Custom Tests Confusion Matrix` from the list of available tests. You can preview the result and then click `Insert Block` to add it to the documentation.\n", + "\n", + "![screenshot showing how to insert a test-driven block](./insert-test-driven-block-custom.png)\n", + "\n", + "The test should match the result you see above. It is now part of your documentation and will be run every time you run `vm.run_documentation_tests()` for your model. Let's do that now." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm.reload()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you preview the template, it should show the custom test in the `Model Development`->`Model Evaluation` section:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm.preview_template()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Just so we can run all of the tests in the template, let's initialize the train and raw datasets.\n", + "\n", + "(Refer to [**Quickstart for model documentation**](../../../quickstart/quickstart_model_documentation.ipynb) and the ValidMind docs for more information on what we are doing here)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_raw_dataset = vm.init_dataset(\n", + " dataset=raw_df,\n", + " input_id=\"raw_dataset\",\n", + " target_column=customer_churn.target_column,\n", + " class_labels=customer_churn.class_labels,\n", + ")\n", + "\n", + "vm_train_ds = vm.init_dataset(\n", + " dataset=train_df,\n", + " input_id=\"train_dataset\",\n", + " target_column=customer_churn.target_column,\n", + ")\n", + "vm_train_ds.assign_predictions(model=vm_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To run all the tests in the template, you can use `vm.run_documentation_tests()` and pass the inputs we initialized above and the demo config from our customer_churn module. We will have to add a section to the config for our new test to tell it which inputs it should receive. 
This is done by adding a new element to the config dictionary where the key is the ID of the test and the value is a dictionary with the following structure:\n", + "```python\n", + "{\n", + " \"inputs\": {\n", + " \"dataset\": \"test_dataset\",\n", + " \"model\": \"model\",\n", + " }\n", + "}\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from validmind.utils import preview_test_config\n", + "\n", + "test_config = customer_churn.get_demo_test_config()\n", + "test_config[\"my_custom_tests.ConfusionMatrix\"] = {\n", + " \"inputs\": {\n", + " \"dataset\": \"test_dataset\",\n", + " \"model\": \"model\",\n", + " }\n", + "}\n", + "preview_test_config(test_config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "full_suite = vm.run_documentation_tests(config=test_config)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Some More Custom Tests\n", + "\n", + "Now that you understand the entire process of creating custom tests and using them in your documentation, let's create a few more to see different ways you can utilize custom tests." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Custom Test: Table of Model Hyperparameters\n", + "\n", + "This custom test will display a table of the hyperparameters used in the model:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@vm.test(\"my_custom_tests.Hyperparameters\")\n", + "def hyperparameters(model):\n", + " \"\"\"The hyperparameters of a machine learning model are the settings that control the learning process.\n", + " These settings are specified before the learning process begins and can have a significant impact on the\n", + " performance of the model.\n", + "\n", + " The hyperparameters of a model can be used to tune the model to achieve the best possible performance\n", + " on a given dataset. By examining the hyperparameters of a model, you can gain insight into how the model\n", + " was trained and how it might be improved.\n", + " \"\"\"\n", + " hyperparameters = model.model.get_xgb_params() # dictionary of hyperparameters\n", + "\n", + " # turn the dictionary into a table where each row contains a hyperparameter and its value\n", + " return [{\"Hyperparam\": k, \"Value\": v} for k, v in hyperparameters.items() if v]\n", + "\n", + "\n", + "result = run_test(\"my_custom_tests.Hyperparameters\", inputs={\"model\": \"model\"})\n", + "result.log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Since the test has been run and logged, you can add it to your documentation using the same process as above. It should look like this:\n", + "\n", + "![screenshot showing hyperparameters test](./hyperparameters-custom-metric.png)\n", + "\n", + "For our simple toy model, there aren't really any proper hyperparameters but you can see how this could be useful for more complex models that have gone through hyperparameter tuning." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Custom Test: External API Call\n", + "\n", + "This custom test will make an external API call to retrieve a list of sample users and display them as tables of model stakeholders. This demonstrates how you might integrate external data sources into your model documentation in a programmatic way. 
You could, for instance, set up a pipeline that runs a test like this every day to keep your model documentation in sync with an external system." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "import random\n", + "\n", + "\n", + "@vm.test(\"my_custom_tests.ExternalAPI\")\n", + "def external_api():\n", + " \"\"\"This test calls an external API to get a list of fake users. It then creates\n", + " tables with the relevant data so they can be displayed in the documentation.\n", + "\n", + " The purpose of this test is to demonstrate how to call an external API and use the\n", + " data in a test. A test like this could even be set up to run in a scheduled\n", + " pipeline to keep your documentation in sync with an external data source.\n", + " \"\"\"\n", + " url = \"https://jsonplaceholder.typicode.com/users\"\n", + " response = requests.get(url)\n", + " data = response.json()\n", + "\n", + " # build tables of model stakeholders, developers, and validators from the user list\n", + " return {\n", + " \"Model Owners/Stakeholders\": [\n", + " {\n", + " \"Name\": user[\"name\"],\n", + " \"Role\": random.choice([\"Owner\", \"Stakeholder\"]),\n", + " \"Email\": user[\"email\"],\n", + " \"Phone\": user[\"phone\"],\n", + " \"Slack Handle\": f\"@{user['name'].lower().replace(' ', '.')}\",\n", + " }\n", + " for user in data[:3]\n", + " ],\n", + " \"Model Developers\": [\n", + " {\n", + " \"Name\": user[\"name\"],\n", + " \"Role\": \"Developer\",\n", + " \"Email\": user[\"email\"],\n", + " }\n", + " for user in data[3:7]\n", + " ],\n", + " \"Model Validators\": [\n", + " {\n", + " \"Name\": user[\"name\"],\n", + " \"Role\": \"Validator\",\n", + " \"Email\": user[\"email\"],\n", + " }\n", + " for user in data[7:]\n", + " ],\n", + " }\n", + "\n", + "\n", + "result = run_test(\"my_custom_tests.ExternalAPI\")\n", + "result.log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Again, you can add this to your documentation to see how it looks:\n", + "\n", + "![screenshot showing the external API custom test](./external-data-custom-test.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Custom Test: Passing Parameters\n", + "\n", + "Custom test functions, as stated earlier, can take both inputs and params. When you define your function, there is no need to distinguish between the two; the ValidMind Library will handle that for you. 
You simply need to add both to the function as arguments and the library will pass in the correct values.\n", + "\n", + "So for instance, if you wanted to parameterize the first custom test we created, the confusion matrix, you could do so like this:\n", + "\n", + "```python\n", + "def confusion_matrix(dataset: VMDataset, model: VMModel, my_param: str = \"Default Value\"):\n", + " pass\n", + "```\n", + "\n", + "And then when you run the test, you can pass in the parameter like this:\n", + "\n", + "```python\n", + "vm.run_test(\n", + " \"my_custom_tests.ConfusionMatrix\",\n", + " inputs={\"model\": \"model\", \"dataset\": \"test_dataset\"},\n", + " params={\"my_param\": \"My Value\"},\n", + ")\n", + "```\n", + "\n", + "Or if you are running the entire documentation template, you would update the config like this:\n", + "\n", + "```python\n", + "test_config[\"my_custom_tests.ConfusionMatrix\"] = {\n", + " \"inputs\": {\n", + " \"dataset\": \"test_dataset\",\n", + " \"model\": \"model\",\n", + " },\n", + " \"params\": {\n", + " \"my_param\": \"My Value\",\n", + " },\n", + "}\n", + "```\n", + "\n", + "Let's go ahead and create a toy test that takes a parameter and uses it in the result:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import plotly.express as px\n", + "\n", + "\n", + "@vm.test(\"my_custom_tests.ParameterExample\")\n", + "def parameter_example(\n", + " plot_title=\"Default Plot Title\", x_col=\"sepal_width\", y_col=\"sepal_length\"\n", + "):\n", + " \"\"\"This test takes two parameters and creates a scatter plot based on them.\n", + "\n", + " The purpose of this test is to demonstrate how to create a test that takes\n", + " parameters and uses them to generate a plot. This can be useful for creating\n", + " tests that are more flexible and can be used in a variety of scenarios.\n", + " \"\"\"\n", + " # return px.scatter(px.data.iris(), x=x_col, y=y_col, color=\"species\")\n", + " return px.scatter(\n", + " px.data.iris(), x=x_col, y=y_col, color=\"species\", title=plot_title\n", + " )\n", + "\n", + "\n", + "result = run_test(\n", + " \"my_custom_tests.ParameterExample\",\n", + " params={\n", + " \"plot_title\": \"My Cool Plot\",\n", + " \"x_col\": \"sepal_width\",\n", + " \"y_col\": \"sepal_length\",\n", + " },\n", + ")\n", + "result.log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Play around with this and see how you can use parameters, default values and other features to make your custom tests more flexible and useful.\n", + "\n", + "Here's how this one looks in the documentation:\n", + "![screenshot showing parameterized test](./parameterized-custom-metric.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Custom Test: Multiple Tables and Plots in a Single Test\n", + "\n", + "Custom test functions, as stated earlier, can return more than just one table or plot. In fact, any number of tables and plots can be returned. 
Let's see an example of this:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import plotly.express as px\n", + "\n", + "\n", + "@vm.test(\"my_custom_tests.ComplexOutput\")\n", + "def complex_output():\n", + " \"\"\"This test demonstrates how to return many tables and figures in a single test\"\"\"\n", + " # create a couple tables\n", + " table = [{\"A\": 1, \"B\": 2}, {\"A\": 3, \"B\": 4}]\n", + " table2 = [{\"C\": 5, \"D\": 6}, {\"C\": 7, \"D\": 8}]\n", + "\n", + " # create a few figures showing some random data\n", + " fig1 = px.line(x=np.arange(10), y=np.random.rand(10), title=\"Random Line Plot\")\n", + " fig2 = px.bar(x=[\"A\", \"B\", \"C\"], y=np.random.rand(3), title=\"Random Bar Plot\")\n", + " fig3 = px.scatter(\n", + " x=np.random.rand(10), y=np.random.rand(10), title=\"Random Scatter Plot\"\n", + " )\n", + "\n", + " return (\n", + " {\n", + " \"My Cool Table\": table,\n", + " \"Another Table\": table2,\n", + " },\n", + " fig1,\n", + " fig2,\n", + " fig3,\n", + " )\n", + "\n", + "\n", + "result = run_test(\"my_custom_tests.ComplexOutput\")\n", + "result.log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice how you can return the tables as a dictionary where the key is the title of the table and the value is the table itself. You could also just return the tables by themselves but this way you can give them a title to more easily identify them in the result.\n", + "\n", + "![screenshot showing multiple tables and plots](./multiple-tables-plots-custom-metric.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Custom Test: Images\n", + "\n", + "If you are using a plotting library that isn't supported by ValidMind (i.e. not `matplotlib` or `plotly`), you can still return the image directly as a bytes-like object. This could also be used to bring any type of image into your documentation in a programmatic way. For instance, you may want to include a diagram of your model architecture or a screenshot of a dashboard that your model is integrated with. As long as you can produce the image with Python or open it from a file, you can include it in your documentation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import io\n", + "import matplotlib.pyplot as plt\n", + "\n", + "\n", + "@vm.test(\"my_custom_tests.Image\")\n", + "def image():\n", + " \"\"\"This test demonstrates how to return an image in a test\"\"\"\n", + "\n", + " # create a simple plot\n", + " fig, ax = plt.subplots()\n", + " ax.plot([1, 2, 3, 4])\n", + " ax.set_title(\"Simple Line Plot\")\n", + "\n", + " # save the plot as a PNG image (in-memory buffer)\n", + " img_data = io.BytesIO()\n", + " fig.savefig(img_data, format=\"png\")\n", + " img_data.seek(0)\n", + "\n", + " plt.close() # close the plot to avoid displaying it\n", + "\n", + " return img_data.read()\n", + "\n", + "\n", + "result = run_test(\"my_custom_tests.Image\")\n", + "result.log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Adding this custom test to your documentation will display the image:\n", + "\n", + "![screenshot showing image custom test](./image-in-custom-metric.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you want to log an image as a test result, you can do so by passing the path to the image as a parameter to the custom test and then opening the file in the test function. Here's an example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@vm.test(\"my_custom_tests.MyPNGCorrelationMatrix\")\n", + "def Image(path: str):\n", + " \"\"\"Opens a PNG image file and logs it as a test result to ValidMind\"\"\"\n", + " if not path.endswith(\".png\"):\n", + " raise ValueError(\"Image must be a PNG file\")\n", + "\n", + " # return raw image bytes\n", + " with open(path, \"rb\") as f:\n", + " return f.read()\n", + " \n", + "run_test(\n", + " \"my_custom_tests.MyPNGCorrelationMatrix\",\n", + " params={\"path\": \"./pearson-correlation-matrix.png\"},\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The image is displayed in the test result:\n", + "\n", + "![screenshot showing image from file](./pearson-correlation-matrix-test-output.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Custom Test: Description\n", + "\n", + "If you want to provide your own description for a custom test result, instead of having one generated by an LLM, you can do so by returning a string from your test function (alongside any figures or tables)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "@vm.test(\"my_custom_tests.MyCustomTest\")\n", + "def my_custom_test(dataset, model):\n", + " \"\"\"\n", + " This custom test computes the confusion matrix for a binary classification model and returns a string as a custom test description.\n", + " \"\"\"\n", + " y_true = dataset.y\n", + " y_pred = dataset.y_pred(model)\n", + "\n", + " confusion_matrix = metrics.confusion_matrix(y_true, y_pred)\n", + "\n", + " cm_display = metrics.ConfusionMatrixDisplay(\n", + " confusion_matrix=confusion_matrix, display_labels=[False, True]\n", + " )\n", + " cm_display.plot()\n", + "\n", + " plt.close() # close the plot to avoid displaying it\n", + "\n", + " return cm_display.figure_, \"Test Description - Confusion Matrix\", pd.DataFrame({\"Value\": [1, 2, 3]}) # return the figure, a custom description string, and a table\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can see that the test result description has been customized. The same description will be displayed in the UI." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = run_test(\n", + " \"my_custom_tests.MyCustomTest\",\n", + " inputs={\"model\": \"model\", \"dataset\": \"test_dataset\"},\n", + ")\n", + "result.log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Conclusion\n", + "\n", + "In this notebook, we have demonstrated how to create custom tests in ValidMind. We have shown how to define custom test functions, register them with the ValidMind Library, run them against models and datasets, and add them to model documentation templates. We have also shown how to return tables and plots from custom tests and how to use them in the ValidMind Platform. We hope this tutorial has been helpful in understanding how to create and use custom tests in ValidMind." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Next steps\n", + "\n", + "You can look at the results of this test suite right in the notebook where you ran the code, as you would expect. But there is a better way — use the ValidMind Platform to work with your model documentation.\n", + "\n", + "\n", + "\n", + "### Work with your model documentation\n", + "\n", + "1. From the **Model Inventory** in the ValidMind Platform, go to the model you registered earlier. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/working-with-model-inventory.html))\n", + "\n", + "2. Click and expand the **Model Development** section.\n", + "\n", + "What you see is the full draft of your model documentation in a more easily consumable version. From here, you can make qualitative edits to model documentation, view guidelines, collaborate with validators, and submit your model documentation for approval when it's ready. 
[Learn more ...](https://docs.validmind.ai/guide/model-documentation/working-with-model-documentation.html)\n", + "\n", + "\n", + "\n", + "### Discover more learning resources\n", + "\n", + "We offer many interactive notebooks to help you automate testing, documenting, validating, and more:\n", + "\n", + "- [Run tests & test suites](https://docs.validmind.ai/developer/how-to/testing-overview.html)\n", + "- [Use ValidMind Library features](https://docs.validmind.ai/developer/how-to/feature-overview.html)\n", + "- [Code samples by use case](https://docs.validmind.ai/guide/samples-jupyter-notebooks.html)\n", + "\n", + "Or, visit our [documentation](https://docs.validmind.ai/) to learn more about ValidMind." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Upgrade ValidMind\n", + "\n", + "
After installing ValidMind, you’ll want to periodically make sure you are on the latest version to access any new features and other enhancements.
\n", + "\n", + "Retrieve the information for the currently installed version of ValidMind:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip show validmind" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If the version returned is lower than the version indicated in our [production open-source code](https://github.com/validmind/validmind-library/blob/prod/validmind/__version__.py), restart your notebook and run:\n", + "\n", + "```bash\n", + "%pip install --upgrade validmind\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You may need to restart your kernel after running the upgrade package for changes to be applied." + ] + }, + { + "cell_type": "markdown", + "id": "copyright-bcdac57ebb8d440f86ba120ee6511db3", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "***\n", + "\n", + "Copyright © 2023-2026 ValidMind Inc. All rights reserved.
\n", + "Refer to [LICENSE](https://github.com/validmind/validmind-library/blob/main/LICENSE) for details.
\n", + "SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 } diff --git a/notebooks/how_to/tests/custom_tests/integrate_external_test_providers.ipynb b/notebooks/how_to/tests/custom_tests/integrate_external_test_providers.ipynb index e04dddaab..f2411481b 100644 --- a/notebooks/how_to/tests/custom_tests/integrate_external_test_providers.ipynb +++ b/notebooks/how_to/tests/custom_tests/integrate_external_test_providers.ipynb @@ -239,6 +239,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"documentation\",\n", ")" ] }, @@ -955,7 +956,7 @@ }, { "cell_type": "markdown", - "id": "copyright-e9a85f828fdb448ba20c565dec9a0b75", + "id": "copyright-e8c45d61719242ee8fb3dda88dcc3206", "metadata": {}, "source": [ "\n", diff --git a/notebooks/how_to/tests/run_tests/1_run_dataset_based_tests.ipynb b/notebooks/how_to/tests/run_tests/1_run_dataset_based_tests.ipynb index 4f1b90d03..3f5ea875a 100644 --- a/notebooks/how_to/tests/run_tests/1_run_dataset_based_tests.ipynb +++ b/notebooks/how_to/tests/run_tests/1_run_dataset_based_tests.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "1c13a3b2", + "id": "976bb3d9", "metadata": {}, "source": [ "# Run dataset-based tests\n", @@ -12,7 +12,7 @@ }, { "cell_type": "markdown", - "id": "3a959fbb", + "id": "8c4d9b9c", "metadata": {}, "source": [ "::: {.content-hidden when-format=\"html\"}\n", @@ -55,7 +55,7 @@ }, { "cell_type": "markdown", - "id": "cf3c045a", + "id": "f49237b3", "metadata": {}, "source": [ "\n", @@ -69,7 +69,7 @@ }, { "cell_type": "markdown", - "id": "74f542a6", + "id": "907737bd", "metadata": {}, "source": [ "\n", @@ -83,7 +83,7 @@ }, { "cell_type": "markdown", - "id": "45ddf55d", + "id": "115cdfa7", "metadata": {}, "source": [ "\n", @@ -99,7 +99,7 @@ }, { "cell_type": "markdown", - "id": "bc2bccea", + "id": "c3051ca8", "metadata": {}, "source": [ "\n", @@ -134,7 +134,7 @@ }, { "cell_type": "markdown", - "id": "dc321547", + "id": "656db165", "metadata": {}, "source": [ "\n", @@ -144,7 +144,7 @@ }, { "cell_type": "markdown", - "id": "40d02fcd", + "id": "30fa24d7", "metadata": {}, "source": [ "\n", @@ -161,7 +161,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c5369ed9", + "id": "524602cc", "metadata": {}, "outputs": [], "source": [ @@ -170,7 +170,7 @@ }, { "cell_type": "markdown", - "id": "498577ab", + "id": "b38fc5f6", "metadata": {}, "source": [ "\n", @@ -180,7 +180,7 @@ }, { "cell_type": "markdown", - "id": "962d4a97", + "id": "451c5a1b", "metadata": {}, "source": [ "\n", @@ -202,7 +202,7 @@ }, { "cell_type": "markdown", - "id": "c10fd53a", + "id": "0e55ac40", "metadata": {}, "source": [ "\n", @@ -220,7 +220,7 @@ }, { "cell_type": "markdown", - "id": "c94fa081", + "id": "3545620d", "metadata": {}, "source": [ "\n", @@ -236,7 +236,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d67f5f91", + "id": "0ed9e84d", "metadata": {}, "outputs": [], "source": [ @@ -254,12 +254,13 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"documentation\",\n", ")" ] }, { "cell_type": "markdown", - "id": "cf733d21", + "id": "8fea9380", "metadata": {}, "source": [ 
"\n", @@ -274,7 +275,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fe69e36d", + "id": "e44a2345", "metadata": {}, "outputs": [], "source": [ @@ -283,7 +284,7 @@ }, { "cell_type": "markdown", - "id": "20c430d5", + "id": "43ee2f43", "metadata": {}, "source": [ "\n", @@ -300,7 +301,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e6e6ce77", + "id": "a63e7a43", "metadata": {}, "outputs": [], "source": [ @@ -309,7 +310,7 @@ }, { "cell_type": "markdown", - "id": "021f929d", + "id": "011de751", "metadata": {}, "source": [ "We've identified from the output that the test ID for the pearson correlation matrix test is `validmind.data_validation.PearsonCorrelationMatrix`.\n", @@ -320,7 +321,7 @@ { "cell_type": "code", "execution_count": null, - "id": "81172dfe", + "id": "9886cd27", "metadata": {}, "outputs": [], "source": [ @@ -330,7 +331,7 @@ }, { "cell_type": "markdown", - "id": "5ad0e325", + "id": "f1f7a84a", "metadata": {}, "source": [ "Since this test requires a dataset, you can expect it to throw an error when we run it without passing in a `dataset` as input:" @@ -339,7 +340,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ad3f5c5f", + "id": "ee38704a", "metadata": {}, "outputs": [], "source": [ @@ -351,7 +352,7 @@ }, { "cell_type": "markdown", - "id": "966cff48", + "id": "60ede8e0", "metadata": {}, "source": [ "
Learn more about the individual tests available in the ValidMind Library\n", @@ -361,7 +362,7 @@ }, { "cell_type": "markdown", - "id": "53ec2612", + "id": "6bcd01d2", "metadata": {}, "source": [ "\n", @@ -371,7 +372,7 @@ }, { "cell_type": "markdown", - "id": "13e7e839", + "id": "35331764", "metadata": {}, "source": [ "\n", @@ -390,7 +391,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c456bd2d", + "id": "25774f44", "metadata": {}, "outputs": [], "source": [ @@ -413,7 +414,7 @@ }, { "cell_type": "markdown", - "id": "feb7a98a", + "id": "3b3032fc", "metadata": {}, "source": [ "\n", @@ -434,7 +435,7 @@ { "cell_type": "code", "execution_count": null, - "id": "11ff127d", + "id": "70c52c03", "metadata": {}, "outputs": [], "source": [ @@ -448,7 +449,7 @@ }, { "cell_type": "markdown", - "id": "53552ee1", + "id": "ec65df1b", "metadata": {}, "source": [ "\n", @@ -465,7 +466,7 @@ }, { "cell_type": "markdown", - "id": "9606a8cc", + "id": "c46789a4", "metadata": {}, "source": [ "\n", @@ -478,7 +479,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2337b1ac", + "id": "0c636915", "metadata": {}, "outputs": [], "source": [ @@ -490,7 +491,7 @@ }, { "cell_type": "markdown", - "id": "59238e60", + "id": "12694f87", "metadata": {}, "source": [ "\n", @@ -507,7 +508,7 @@ { "cell_type": "code", "execution_count": null, - "id": "086610c4", + "id": "99eaf2da", "metadata": {}, "outputs": [], "source": [ @@ -517,7 +518,7 @@ }, { "cell_type": "markdown", - "id": "d122a6c7", + "id": "561b225a", "metadata": {}, "source": [ "Use `list_tests()`, this time filtering tests by tags for `binary_classification` relating to `tabular_data`:" @@ -526,7 +527,7 @@ { "cell_type": "code", "execution_count": null, - "id": "223bb93a", + "id": "97a45b6b", "metadata": {}, "outputs": [], "source": [ @@ -535,7 +536,7 @@ }, { "cell_type": "markdown", - "id": "fadad067", + "id": "4ba2ec07", "metadata": {}, "source": [ "Let's use `describe_test()` again to retrieve more information about the test, including confirmation that it accepts some additional parameters, such as `min_percent_threshold` which allows you configure the threshold for an acceptable class imbalance:" @@ -544,7 +545,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3e4deb7c", + "id": "ec456cd2", "metadata": {}, "outputs": [], "source": [ @@ -553,7 +554,7 @@ }, { "cell_type": "markdown", - "id": "029e1cbe", + "id": "e419dd51", "metadata": {}, "source": [ "\n", @@ -568,7 +569,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8b4d3e95", + "id": "1c137483", "metadata": {}, "outputs": [], "source": [ @@ -582,7 +583,7 @@ }, { "cell_type": "markdown", - "id": "4299ec78", + "id": "6cc499de", "metadata": {}, "source": [ "\n", @@ -600,7 +601,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0048f4a2", + "id": "2c6f19ad", "metadata": {}, "outputs": [], "source": [ @@ -615,7 +616,7 @@ }, { "cell_type": "markdown", - "id": "2b37be76", + "id": "30e82fc3", "metadata": {}, "source": [ "When the threshold is set to 20%, the results show that the class imbalance test fails." 
@@ -623,7 +624,7 @@ }, { "cell_type": "markdown", - "id": "53233e7e", + "id": "faa09935", "metadata": {}, "source": [ "\n", @@ -656,7 +657,7 @@ }, { "cell_type": "markdown", - "id": "c7f98076", + "id": "cbe20d76", "metadata": {}, "source": [ "\n", @@ -672,7 +673,7 @@ }, { "cell_type": "markdown", - "id": "e395333b", + "id": "ec08c9bc", "metadata": {}, "source": [ "\n", @@ -690,7 +691,7 @@ }, { "cell_type": "markdown", - "id": "7c564dd0", + "id": "bff625a1", "metadata": {}, "source": [ "\n", @@ -705,7 +706,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f5eedc29", + "id": "b5f64e27", "metadata": {}, "outputs": [], "source": [ @@ -714,7 +715,7 @@ }, { "cell_type": "markdown", - "id": "af1d2e69", + "id": "da29fb9d", "metadata": {}, "source": [ "If the version returned is lower than the version indicated in our [production open-source code](https://github.com/validmind/validmind-library/blob/prod/validmind/__version__.py), restart your notebook and run:\n", @@ -726,7 +727,7 @@ }, { "cell_type": "markdown", - "id": "3b886540", + "id": "82837a85", "metadata": {}, "source": [ "You may need to restart your kernel after running the upgrade package for changes to be applied." @@ -734,7 +735,7 @@ }, { "cell_type": "markdown", - "id": "copyright-13aa5ddffa6b49578b051ddab33e1d48", + "id": "copyright-38501808b29c456ab97562eebdd497d4", "metadata": {}, "source": [ "\n", diff --git a/notebooks/how_to/tests/run_tests/configure_tests/customize_test_result_descriptions.ipynb b/notebooks/how_to/tests/run_tests/configure_tests/customize_test_result_descriptions.ipynb index ab36da913..a6f446934 100644 --- a/notebooks/how_to/tests/run_tests/configure_tests/customize_test_result_descriptions.ipynb +++ b/notebooks/how_to/tests/run_tests/configure_tests/customize_test_result_descriptions.ipynb @@ -177,6 +177,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"documentation\",\n", ")" ] }, @@ -1000,7 +1001,7 @@ }, { "cell_type": "markdown", - "id": "copyright-5559bb769ab34ad5b09176ce718a7b6c", + "id": "copyright-b966b0e31cc54c0aa55795324865a6f5", "metadata": {}, "source": [ "\n", diff --git a/notebooks/how_to/tests/run_tests/configure_tests/enable_pii_detection.ipynb b/notebooks/how_to/tests/run_tests/configure_tests/enable_pii_detection.ipynb index cbb8af41d..45de1ac66 100644 --- a/notebooks/how_to/tests/run_tests/configure_tests/enable_pii_detection.ipynb +++ b/notebooks/how_to/tests/run_tests/configure_tests/enable_pii_detection.ipynb @@ -225,6 +225,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"documentation\",\n", ")" ] }, diff --git a/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.ipynb b/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.ipynb index b8a80982c..c01a11ace 100644 --- a/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.ipynb +++ b/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.ipynb @@ -226,6 +226,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"documentation\",\n", ")" ] }, @@ -531,7 +532,7 @@ }, { "cell_type": "markdown", - "id": "copyright-b40eb05198f94039ae2f66d120320271", + "id": "copyright-72af338f140e4a4bad5cb3954201d23e", "metadata": {}, "source": [ "\n", diff --git a/notebooks/quickstart/quickstart_model_documentation.ipynb 
b/notebooks/quickstart/quickstart_model_documentation.ipynb index 0f7192694..b2565f20d 100644 --- a/notebooks/quickstart/quickstart_model_documentation.ipynb +++ b/notebooks/quickstart/quickstart_model_documentation.ipynb @@ -284,7 +284,8 @@ " api_host=\"https://api.validmind.ai/api/v1/tracking\",\n", " api_key=\"\",\n", " api_secret=\"\",\n", - " model=\"\"\n", + " model=\"\",\n", + " document=\"documentation\",\n", ")" ] }, diff --git a/notebooks/quickstart/quickstart_model_validation.ipynb b/notebooks/quickstart/quickstart_model_validation.ipynb index 533ce3e4e..a3f6b1a25 100644 --- a/notebooks/quickstart/quickstart_model_validation.ipynb +++ b/notebooks/quickstart/quickstart_model_validation.ipynb @@ -320,6 +320,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"validation-report\",\n", ")" ] }, diff --git a/notebooks/templates/_install-initialize-validmind.ipynb b/notebooks/templates/_install-initialize-validmind.ipynb index 73a67824d..29d9be91e 100644 --- a/notebooks/templates/_install-initialize-validmind.ipynb +++ b/notebooks/templates/_install-initialize-validmind.ipynb @@ -9,7 +9,6 @@ }, { "cell_type": "markdown", - "id": "install-library", "metadata": {}, "source": [ "### Install the ValidMind Library\n", @@ -24,7 +23,6 @@ { "cell_type": "code", "execution_count": null, - "id": "install-python", "metadata": {}, "outputs": [], "source": [ @@ -33,7 +31,6 @@ }, { "cell_type": "markdown", - "id": "install-initialize", "metadata": {}, "source": [ "### Initialize the ValidMind Library" @@ -41,7 +38,6 @@ }, { "cell_type": "markdown", - "id": "install-register", "metadata": {}, "source": [ "#### Register sample model\n", @@ -61,7 +57,6 @@ }, { "cell_type": "markdown", - "id": "install-template", "metadata": {}, "source": [ "#### Apply documentation template\n", @@ -77,7 +72,6 @@ }, { "cell_type": "markdown", - "id": "install-snippet", "metadata": {}, "source": [ "#### Get your code snippet\n", @@ -91,7 +85,6 @@ { "cell_type": "code", "execution_count": null, - "id": "install-init", "metadata": {}, "outputs": [], "source": [ @@ -109,12 +102,12 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"documentation\",\n", ")" ] }, { "cell_type": "markdown", - "id": "install-preview", "metadata": {}, "source": [ "### Preview the documentation template\n", @@ -127,7 +120,6 @@ { "cell_type": "code", "execution_count": null, - "id": "install-preview-template", "metadata": {}, "outputs": [], "source": [ diff --git a/notebooks/tutorials/model_development/1-set_up_validmind.ipynb b/notebooks/tutorials/model_development/1-set_up_validmind.ipynb index 61ee21e2b..97a1e9dc1 100644 --- a/notebooks/tutorials/model_development/1-set_up_validmind.ipynb +++ b/notebooks/tutorials/model_development/1-set_up_validmind.ipynb @@ -274,6 +274,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"documentation\",\n", ")" ] }, diff --git a/notebooks/tutorials/model_development/2-start_development_process.ipynb b/notebooks/tutorials/model_development/2-start_development_process.ipynb index f286c7454..217df37d1 100644 --- a/notebooks/tutorials/model_development/2-start_development_process.ipynb +++ b/notebooks/tutorials/model_development/2-start_development_process.ipynb @@ -126,6 +126,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"documentation\",\n", ")" ] }, @@ -987,7 +988,7 @@ }, { "cell_type": "markdown", - "id": 
"copyright-00fafdc4ca1a4450953055c3e342ef96", + "id": "copyright-b0df0754a7a74581809f22d50b198597", "metadata": {}, "source": [ "\n", diff --git a/notebooks/tutorials/model_development/3-integrate_custom_tests.ipynb b/notebooks/tutorials/model_development/3-integrate_custom_tests.ipynb index 9a3cbe7ef..f5bbf7f53 100644 --- a/notebooks/tutorials/model_development/3-integrate_custom_tests.ipynb +++ b/notebooks/tutorials/model_development/3-integrate_custom_tests.ipynb @@ -141,6 +141,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"documentation\",\n", ")" ] }, @@ -968,7 +969,7 @@ }, { "cell_type": "markdown", - "id": "copyright-2f589b81f04949f6a2c6764859b8bc86", + "id": "copyright-d49f3f33bee24ebeb5cd3ec6ed2a4eab", "metadata": {}, "source": [ "\n", diff --git a/notebooks/tutorials/model_development/4-finalize_testing_documentation.ipynb b/notebooks/tutorials/model_development/4-finalize_testing_documentation.ipynb index 07643d1cb..b6064a005 100644 --- a/notebooks/tutorials/model_development/4-finalize_testing_documentation.ipynb +++ b/notebooks/tutorials/model_development/4-finalize_testing_documentation.ipynb @@ -138,6 +138,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"documentation\",\n", ")" ] }, @@ -960,7 +961,7 @@ }, { "cell_type": "markdown", - "id": "copyright-75cfc55507924d27b0d37b140c473293", + "id": "copyright-2efae7bbb0f74bd5abe37252cd48b951", "metadata": {}, "source": [ "\n", diff --git a/notebooks/tutorials/model_validation/1-set_up_validmind_for_validation.ipynb b/notebooks/tutorials/model_validation/1-set_up_validmind_for_validation.ipynb index 6a2e9e128..60bf8373d 100644 --- a/notebooks/tutorials/model_validation/1-set_up_validmind_for_validation.ipynb +++ b/notebooks/tutorials/model_validation/1-set_up_validmind_for_validation.ipynb @@ -327,6 +327,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"validation-report\",\n", ")" ] }, diff --git a/notebooks/tutorials/model_validation/2-start_validation_process.ipynb b/notebooks/tutorials/model_validation/2-start_validation_process.ipynb index ce3e8c0ba..e0618c24a 100644 --- a/notebooks/tutorials/model_validation/2-start_validation_process.ipynb +++ b/notebooks/tutorials/model_validation/2-start_validation_process.ipynb @@ -129,6 +129,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"validation-report\",\n", ")" ] }, @@ -861,7 +862,7 @@ }, { "cell_type": "markdown", - "id": "copyright-f724d54c669648268e8471279f7a3461", + "id": "copyright-a798ee72616640f5bead3af25e6e9434", "metadata": {}, "source": [ "\n", diff --git a/notebooks/tutorials/model_validation/3-developing_challenger_model.ipynb b/notebooks/tutorials/model_validation/3-developing_challenger_model.ipynb index 5f7d7e8fd..cc3f25c4d 100644 --- a/notebooks/tutorials/model_validation/3-developing_challenger_model.ipynb +++ b/notebooks/tutorials/model_validation/3-developing_challenger_model.ipynb @@ -136,6 +136,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"validation-report\",\n", ")" ] }, @@ -865,7 +866,7 @@ }, { "cell_type": "markdown", - "id": "copyright-58757a9cc9de45069a5e6c57c8aaff14", + "id": "copyright-0b98aebe3459409f848c2c03b413d181", "metadata": {}, "source": [ "\n", diff --git a/notebooks/tutorials/model_validation/4-finalize_validation_reporting.ipynb 
b/notebooks/tutorials/model_validation/4-finalize_validation_reporting.ipynb index 88f0ea845..613de2a45 100644 --- a/notebooks/tutorials/model_validation/4-finalize_validation_reporting.ipynb +++ b/notebooks/tutorials/model_validation/4-finalize_validation_reporting.ipynb @@ -145,6 +145,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"validation-report\",\n", ")" ] }, @@ -1205,7 +1206,7 @@ }, { "cell_type": "markdown", - "id": "copyright-b55920b2495443d1894125f60e582bb4", + "id": "copyright-8028ef6a3da9409a8eab5aa0cf246c57", "metadata": {}, "source": [ "\n", diff --git a/notebooks/use_cases/agents/document_agentic_ai.ipynb b/notebooks/use_cases/agents/document_agentic_ai.ipynb index 6490cd436..ae7e2c1ee 100644 --- a/notebooks/use_cases/agents/document_agentic_ai.ipynb +++ b/notebooks/use_cases/agents/document_agentic_ai.ipynb @@ -309,6 +309,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"documentation\",\n", ")" ] }, @@ -2139,7 +2140,7 @@ }, { "cell_type": "markdown", - "id": "474ab007", + "id": "copyright-b9e82bcf4e364c4f8e5ae4bb0e4b2865", "metadata": {}, "source": [ "\n", diff --git a/notebooks/use_cases/capital_markets/quickstart_option_pricing_models.ipynb b/notebooks/use_cases/capital_markets/quickstart_option_pricing_models.ipynb index f7ece4405..175b02a59 100644 --- a/notebooks/use_cases/capital_markets/quickstart_option_pricing_models.ipynb +++ b/notebooks/use_cases/capital_markets/quickstart_option_pricing_models.ipynb @@ -246,6 +246,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"documentation\",\n", ")" ] }, diff --git a/notebooks/use_cases/capital_markets/quickstart_option_pricing_models_quantlib.ipynb b/notebooks/use_cases/capital_markets/quickstart_option_pricing_models_quantlib.ipynb index 471359634..3dadf2ca1 100644 --- a/notebooks/use_cases/capital_markets/quickstart_option_pricing_models_quantlib.ipynb +++ b/notebooks/use_cases/capital_markets/quickstart_option_pricing_models_quantlib.ipynb @@ -1,1346 +1,1347 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "1e2a4689", - "metadata": {}, - "source": [ - "# Quickstart for Heston option pricing model using QuantLib\n", - "\n", - "Welcome! Let's get you started with the basic process of documenting models with ValidMind.\n", - "\n", - "The Heston option pricing model is a popular stochastic volatility model used to price options. Developed by Steven Heston in 1993, the model assumes that the asset's volatility follows a mean-reverting square-root process, allowing it to capture the empirical observation of volatility \"clustering\" in financial markets. This model is particularly useful for assets where volatility is not constant, making it a favored approach in quantitative finance for pricing complex derivatives.\n", - "\n", - "Here’s an overview of the Heston model as implemented in QuantLib, a powerful library for quantitative finance:\n", - "\n", - "\n", - "\n", - "### Model Assumptions and Characteristics\n", - "1. **Stochastic Volatility**: The volatility is modeled as a stochastic process, following a mean-reverting square-root process (Cox-Ingersoll-Ross process).\n", - "2. **Correlated Asset and Volatility Processes**: The asset price and volatility are assumed to be correlated, allowing the model to capture the \"smile\" effect observed in implied volatilities.\n", - "3. 
**Risk-Neutral Dynamics**: The Heston model is typically calibrated under a risk-neutral measure, which allows for direct application to pricing.\n", - "\n", - "\n", - "\n", - "### Heston Model Parameters\n", - "The model is governed by a set of key parameters:\n", - "- **S0**: Initial stock price\n", - "- **v0**: Initial variance of the asset price\n", - "- **kappa**: Speed of mean reversion of the variance\n", - "- **theta**: Long-term mean level of variance\n", - "- **sigma**: Volatility of volatility (vol of vol)\n", - "- **rho**: Correlation between the asset price and variance processes\n", - "\n", - "The dynamics of the asset price \\( S \\) and the variance \\( v \\) under the Heston model are given by:\n", - "\n", - "$$\n", - "dS_t = r S_t \\, dt + \\sqrt{v_t} S_t \\, dW^S_t\n", - "$$\n", - "\n", - "$$\n", - "dv_t = \\kappa (\\theta - v_t) \\, dt + \\sigma \\sqrt{v_t} \\, dW^v_t\n", - "$$\n", - "\n", - "where \\( $dW^S$ \\) and \\( $dW^v$ \\) are Wiener processes with correlation \\( $\\rho$ \\).\n", - "\n", - "\n", - "\n", - "### Advantages and Limitations\n", - "- **Advantages**:\n", - " - Ability to capture volatility smiles and skews.\n", - " - More realistic pricing for options on assets with stochastic volatility.\n", - "- **Limitations**:\n", - " - Calibration can be complex due to the number of parameters.\n", - " - Computationally intensive compared to simpler models like Black-Scholes.\n", - "\n", - "This setup provides a robust framework for pricing and analyzing options with stochastic volatility dynamics. QuantLib’s implementation makes it easy to experiment with different parameter configurations and observe their effects on pricing.\n", - "\n", - "You will learn how to initialize the ValidMind Library, develop a option pricing model, and then write custom tests that can be used for sensitivity and stress testing to quickly generate documentation about model." 
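+    "\n",
+    "As a rough sketch of how the parameters above map onto QuantLib objects, the snippet below prices a one-year European call under the Heston model. The parameter values are illustrative placeholders rather than calibrated values; the notebook builds and calibrates its own model later:\n",
+    "\n",
+    "```python\n",
+    "import QuantLib as ql\n",
+    "\n",
+    "today = ql.Date.todaysDate()\n",
+    "ql.Settings.instance().evaluationDate = today\n",
+    "\n",
+    "# illustrative (made-up) market and Heston parameters\n",
+    "S0, strike, r = 100.0, 100.0, 0.05\n",
+    "v0, kappa, theta, sigma, rho = 0.04, 2.0, 0.04, 0.3, -0.7\n",
+    "\n",
+    "spot = ql.QuoteHandle(ql.SimpleQuote(S0))\n",
+    "day_count = ql.Actual365Fixed()\n",
+    "rate_ts = ql.YieldTermStructureHandle(ql.FlatForward(today, r, day_count))\n",
+    "div_ts = ql.YieldTermStructureHandle(ql.FlatForward(today, 0.0, day_count))\n",
+    "\n",
+    "# Heston process, model, and semi-analytic pricing engine\n",
+    "process = ql.HestonProcess(rate_ts, div_ts, spot, v0, kappa, theta, sigma, rho)\n",
+    "engine = ql.AnalyticHestonEngine(ql.HestonModel(process))\n",
+    "\n",
+    "# one-year European call option\n",
+    "payoff = ql.PlainVanillaPayoff(ql.Option.Call, strike)\n",
+    "exercise = ql.EuropeanExercise(today + ql.Period(1, ql.Years))\n",
+    "option = ql.VanillaOption(payoff, exercise)\n",
+    "option.setPricingEngine(engine)\n",
+    "\n",
+    "print(f\"Heston call price: {option.NPV():.4f}\")\n",
+    "```"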
- ] - }, - { - "cell_type": "markdown", - "id": "69ec219a", - "metadata": {}, - "source": [ - "::: {.content-hidden when-format=\"html\"}\n", - "## Contents \n", - " - [Model Assumptions and Characteristics](#toc1_1__) \n", - " - [Heston Model Parameters](#toc1_2__) \n", - " - [Advantages and Limitations](#toc1_3__) \n", - "- [About ValidMind](#toc2__) \n", - " - [Before you begin](#toc2_1__) \n", - " - [New to ValidMind?](#toc2_2__) \n", - " - [Key concepts](#toc2_3__) \n", - "- [Setting up](#toc3__) \n", - " - [Install the ValidMind Library](#toc3_1__) \n", - " - [Initialize the ValidMind Library](#toc3_2__) \n", - " - [Register sample model](#toc3_2_1__) \n", - " - [Apply documentation template](#toc3_2_2__) \n", - " - [Get your code snippet](#toc3_2_3__) \n", - " - [Initialize the Python environment](#toc3_3__) \n", - " - [Preview the documentation template](#toc3_4__) \n", - "- [Data Preparation](#toc4__) \n", - " - [Helper functions](#toc4_1_1__) \n", - " - [Market Data Quality and Availability](#toc4_2__) \n", - " - [Initialize the ValidMind datasets](#toc4_3__) \n", - " - [Data Quality](#toc4_4__) \n", - " - [Isolation Forest Outliers Test](#toc4_4_1__) \n", - " - [Model parameters](#toc4_4_2__) \n", - "- [Model development - Heston Option price](#toc5__) \n", - " - [Model Calibration](#toc5_1__) \n", - " - [Model Evaluation](#toc5_2__) \n", - " - [Benchmark Testing](#toc5_2_1__) \n", - " - [Sensitivity Testing](#toc5_2_2__) \n", - " - [Stress Testing](#toc5_2_3__) \n", - "- [Next steps](#toc6__) \n", - " - [Work with your model documentation](#toc6_1__) \n", - " - [Discover more learning resources](#toc6_2__) \n", - "\n", - ":::\n", - "\n", - "" - ] - }, - { - "cell_type": "markdown", - "id": "b9fb5d17", - "metadata": {}, - "source": [ - "\n", - "\n", - "## About ValidMind\n", - "\n", - "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models.\n", - "\n", - "You use the ValidMind Library to automate documentation and validation tests, and then use the ValidMind Platform to collaborate on model documentation. Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators.\n", - "\n", - "\n", - "\n", - "### Before you begin\n", - "\n", - "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language. \n", - "\n", - "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html).\n", - "\n", - "\n", - "\n", - "### New to ValidMind?\n", - "\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", - "\n", - "
For access to all features available in this notebook, you'll need access to a ValidMind account.\n", - "

\n", - "Register with ValidMind
\n", - "\n", - "\n", - "\n", - "### Key concepts\n", - "\n", - "**Model documentation**: A structured and detailed record pertaining to a model, encompassing key components such as its underlying assumptions, methodologies, data sources, inputs, performance metrics, evaluations, limitations, and intended uses. It serves to ensure transparency, adherence to regulatory requirements, and a clear understanding of potential risks associated with the model’s application.\n", - "\n", - "**Documentation template**: Functions as a test suite and lays out the structure of model documentation, segmented into various sections and sub-sections. Documentation templates define the structure of your model documentation, specifying the tests that should be run, and how the results should be displayed.\n", - "\n", - "**Tests**: A function contained in the ValidMind Library, designed to run a specific quantitative test on the dataset or model. Tests are the building blocks of ValidMind, used to evaluate and document models and datasets, and can be run individually or as part of a suite defined by your model documentation template.\n", - "\n", - "**Custom tests**: Custom tests are functions that you define to evaluate your model or dataset. These functions can be registered via the ValidMind Library to be used with the ValidMind Platform.\n", - "\n", - "**Inputs**: Objects to be evaluated and documented in the ValidMind Library. They can be any of the following:\n", - "\n", - " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", - " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", - " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", - " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html) for more information.\n", - "\n", - "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", - "\n", - "**Outputs**: Custom tests can return elements like tables or plots. Tables may be a list of dictionaries (each representing a row) or a pandas DataFrame. Plots may be matplotlib or plotly figures.\n", - "\n", - "**Test suites**: Collections of tests designed to run together to automate and generate model documentation end-to-end for specific use-cases.\n", - "\n", - "Example: the [`classifier_full_suite`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html#ClassifierFullSuite) test suite runs tests from the [`tabular_dataset`](https://docs.validmind.ai/validmind/validmind/test_suites/tabular_datasets.html) and [`classifier`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html) test suites to fully document the data and model sections for binary classification model use-cases." 
- ] - }, - { - "cell_type": "markdown", - "id": "f2dccf35", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Setting up" - ] - }, - { - "cell_type": "markdown", - "id": "5a5ce085", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Install the ValidMind Library\n", - "\n", - "To install the library:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "409352bf", - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -q validmind" - ] - }, - { - "cell_type": "markdown", - "id": "65e870b2", - "metadata": {}, - "source": [ - "To install the QuantLib library:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3a34debf", - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -q QuantLib" - ] - }, - { - "cell_type": "markdown", - "id": "fb30ae07", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Initialize the ValidMind Library" - ] - }, - { - "cell_type": "markdown", - "id": "c6f87017", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Register sample model\n", - "\n", - "Let's first register a sample model for use with this notebook:\n", - "\n", - "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", - "\n", - "2. In the left sidebar, navigate to **Inventory** and click **+ Register Model**.\n", - "\n", - "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", - "\n", - "4. Select your own name under the **MODEL OWNER** drop-down.\n", - "\n", - "5. Click **Register Model** to add the model to your inventory." - ] - }, - { - "cell_type": "markdown", - "id": "cbb2e2c9", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Apply documentation template\n", - "\n", - "Once you've registered your model, let's select a documentation template. A template predefines sections for your model documentation and provides a general outline to follow, making the documentation process much easier.\n", - "\n", - "1. In the left sidebar that appears for your model, click **Documents** and select **Documentation**.\n", - "\n", - "2. Under **TEMPLATE**, select `Capital Markets`.\n", - "\n", - "3. Click **Use Template** to apply the template." - ] - }, - { - "cell_type": "markdown", - "id": "41c4edca", - "metadata": {}, - "source": [ - "
Can't select this template?\n", - "

\n", - "Your organization administrators may need to add it to your template library:\n", - "\n", - "
" - ] - }, - { - "cell_type": "markdown", - "id": "2012eb82", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Get your code snippet\n", - "\n", - "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", - "\n", - "1. On the left sidebar that appears for your model, select **Getting Started** and click **Copy snippet to clipboard**.\n", - "2. Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0cd3f67e", - "metadata": {}, - "outputs": [], - "source": [ - "# Load your model identifier credentials from an `.env` file\n", - "\n", - "%load_ext dotenv\n", - "%dotenv .env\n", - "\n", - "# Or replace with your code snippet\n", - "\n", - "import validmind as vm\n", - "\n", - "vm.init(\n", - " # api_host=\"...\",\n", - " # api_key=\"...\",\n", - " # api_secret=\"...\",\n", - " # model=\"...\",\n", - ")\n" - ] - }, - { - "cell_type": "markdown", - "id": "6d944cc9", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Initialize the Python environment\n", - "\n", - "Next, let's import the necessary libraries and set up your Python environment for data analysis:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f8cf2746", - "metadata": {}, - "outputs": [], - "source": [ - "%matplotlib inline\n", - "\n", - "import pandas as pd\n", - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", - "from scipy.optimize import minimize\n", - "import yfinance as yf\n", - "import QuantLib as ql\n", - "from validmind.tests import run_test" - ] - }, - { - "cell_type": "markdown", - "id": "bc431ee0", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Preview the documentation template\n", - "\n", - "Let's verify that you have connected the ValidMind Library to the ValidMind Platform and that the appropriate *template* is selected for your model.\n", - "\n", - "You will upload documentation and test results unique to your model based on this template later on. For now, **take a look at the default structure that the template provides with [the `vm.preview_template()` function](https://docs.validmind.ai/validmind/validmind.html#preview_template)** from the ValidMind library and note the empty sections:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7e844028", - "metadata": {}, - "outputs": [], - "source": [ - "vm.preview_template()" - ] - }, - { - "cell_type": "markdown", - "id": "0c0ee8b9", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Data Preparation" - ] - }, - { - "cell_type": "markdown", - "id": "5a4d2c36", - "metadata": {}, - "source": [ - "### Market Data Sources\n", - "\n", - "\n", - "\n", - "#### Helper functions\n", - "Let's define helper function retrieve to option data from Yahoo Finance." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b96a500f", - "metadata": {}, - "outputs": [], - "source": [ - "def get_market_data(ticker, expiration_date_str):\n", - " \"\"\"\n", - " Fetch option market data from Yahoo Finance for the given ticker and expiration date.\n", - " Returns a list of tuples: (strike, maturity, option_price).\n", - " \"\"\"\n", - " # Create a Ticker object for the specified stock\n", - " stock = yf.Ticker(ticker)\n", - "\n", - " # Get all available expiration dates for options\n", - " option_dates = stock.options\n", - "\n", - " # Check if the requested expiration date is available\n", - " if expiration_date_str not in option_dates:\n", - " raise ValueError(f\"Expiration date {expiration_date_str} not available for {ticker}. Available dates: {option_dates}\")\n", - "\n", - " # Get the option chain for the specified expiration date\n", - " option_chain = stock.option_chain(expiration_date_str)\n", - "\n", - " # Get call options (or you can use puts as well based on your requirement)\n", - " calls = option_chain.calls\n", - "\n", - " # Convert expiration_date_str to QuantLib Date\n", - " expiry_date_parts = list(map(int, expiration_date_str.split('-'))) # Split YYYY-MM-DD\n", - " maturity_date = ql.Date(expiry_date_parts[2], expiry_date_parts[1], expiry_date_parts[0]) # Convert to QuantLib Date\n", - "\n", - " # Create a list to store strike prices, maturity dates, and option prices\n", - " market_data = []\n", - " for index, row in calls.iterrows():\n", - " strike = row['strike']\n", - " option_price = row['lastPrice'] # You can also use 'bid', 'ask', 'mid', etc.\n", - " market_data.append((strike, maturity_date, option_price))\n", - " df = pd.DataFrame(market_data, columns = ['strike', 'maturity_date', 'option_price'])\n", - " return df" - ] - }, - { - "cell_type": "markdown", - "id": "c7769b73", - "metadata": {}, - "source": [ - "Let's define helper function retrieve to stock data from Yahoo Finance. This helper function to calculate spot price, dividend yield, volatility and risk free rate using the underline stock data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dc44c448", - "metadata": {}, - "outputs": [], - "source": [ - "def get_option_parameters(ticker):\n", - " # Fetch historical data for the stock\n", - " stock_data = yf.Ticker(ticker)\n", - " \n", - " # Get the current spot price\n", - " spot_price = stock_data.history(period=\"1d\")['Close'].iloc[-1]\n", - " \n", - " # Get dividend yield\n", - " dividend_rate = stock_data.dividends.mean() / spot_price if not stock_data.dividends.empty else 0.0\n", - " \n", - " # Estimate volatility (standard deviation of log returns)\n", - " hist_data = stock_data.history(period=\"1y\")['Close']\n", - " log_returns = np.log(hist_data / hist_data.shift(1)).dropna()\n", - " volatility = np.std(log_returns) * np.sqrt(252) # Annualized volatility\n", - " \n", - " # Assume a risk-free rate from some known data (can be fetched from market data, here we use 0.001)\n", - " risk_free_rate = 0.001\n", - " \n", - " # Return the calculated parameters\n", - " return {\n", - " \"spot_price\": spot_price,\n", - " \"volatility\": volatility,\n", - " \"dividend_rate\": dividend_rate,\n", - " \"risk_free_rate\": risk_free_rate\n", - " }" - ] - }, - { - "cell_type": "markdown", - "id": "c7b739d3", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Market Data Quality and Availability\n", - "Next, let's specify ticker and expiration date to get market data." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "50225fde", - "metadata": {}, - "outputs": [], - "source": [ - "ticker = \"MSFT\"\n", - "expiration_date = \"2024-12-13\" # Example expiration date in 'YYYY-MM-DD' form\n", - "\n", - "market_data = get_market_data(ticker=ticker, expiration_date_str=expiration_date)" - ] - }, - { - "cell_type": "markdown", - "id": "c539b95e", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Initialize the ValidMind datasets\n", - "\n", - "Before you can run tests, you must first initialize a ValidMind dataset object using the [`init_dataset`](https://docs.validmind.ai/validmind/validmind.html#init_dataset) function from the ValidMind (`vm`) module." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "113f9c17", - "metadata": {}, - "outputs": [], - "source": [ - "vm_market_data = vm.init_dataset(\n", - " dataset=market_data,\n", - " input_id=\"market_data\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "185beb24", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Data Quality\n", - "Let's check quality of the data using outliers and missing data tests." - ] - }, - { - "cell_type": "markdown", - "id": "7f14464c", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Isolation Forest Outliers Test\n", - "Let's detects anomalies in the dataset using the Isolation Forest algorithm, visualized through scatter plots." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "56c919ec", - "metadata": {}, - "outputs": [], - "source": [ - "result = run_test(\n", - " \"validmind.data_validation.IsolationForestOutliers\",\n", - " inputs={\n", - " \"dataset\": vm_market_data,\n", - " },\n", - " title=\"Outliers detection using Isolation Forest\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "e4d0e5ca", - "metadata": {}, - "source": [ - "##### Missing Values Test\n", - "Let's evaluates dataset quality by ensuring the missing value ratio across all features does not exceed a set threshold." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e95c825f", - "metadata": {}, - "outputs": [], - "source": [ - "result = run_test(\n", - " \"validmind.data_validation.MissingValues\",\n", - " inputs={\n", - " \"dataset\": vm_market_data,\n", - " },\n", - " title=\"Missing Values detection\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "829403a3", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Model parameters\n", - "Let's calculate the model parameters using from stock data " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "25936449", - "metadata": {}, - "outputs": [], - "source": [ - "option_params = get_option_parameters(ticker=ticker)" - ] - }, - { - "cell_type": "markdown", - "id": "0a0948b6", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Model development - Heston Option price" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e15b8221", - "metadata": {}, - "outputs": [], - "source": [ - "class HestonModel:\n", - "\n", - " def __init__(self, ticker, expiration_date_str, calculation_date, spot_price, dividend_rate, risk_free_rate):\n", - " self.ticker = ticker\n", - " self.expiration_date_str = expiration_date_str,\n", - " self.calculation_date = calculation_date\n", - " self.spot_price = spot_price\n", - " self.dividend_rate = dividend_rate\n", - " self.risk_free_rate = risk_free_rate\n", - " \n", - " def predict_option_price(self, strike, maturity_date, spot_price, v0=None, theta=None, kappa=None, sigma=None, rho=None):\n", - " # Set the evaluation date\n", - " ql.Settings.instance().evaluationDate = self.calculation_date\n", - "\n", - " # Construct the European Option\n", - " payoff = ql.PlainVanillaPayoff(ql.Option.Call, strike)\n", - " exercise = ql.EuropeanExercise(maturity_date)\n", - " european_option = ql.VanillaOption(payoff, exercise)\n", - "\n", - " # Yield term structures for risk-free rate and dividend\n", - " riskFreeTS = ql.YieldTermStructureHandle(ql.FlatForward(calculation_date, self.risk_free_rate, ql.Actual365Fixed()))\n", - " dividendTS = ql.YieldTermStructureHandle(ql.FlatForward(calculation_date, self.dividend_rate, ql.Actual365Fixed()))\n", - "\n", - " # Initial stock price\n", - " initialValue = ql.QuoteHandle(ql.SimpleQuote(spot_price))\n", - "\n", - " # Heston process parameters\n", - " heston_process = ql.HestonProcess(riskFreeTS, dividendTS, initialValue, v0, kappa, theta, sigma, rho)\n", - " hestonModel = ql.HestonModel(heston_process)\n", - "\n", - " # Use the Heston analytic engine\n", - " engine = ql.AnalyticHestonEngine(hestonModel)\n", - " european_option.setPricingEngine(engine)\n", - "\n", - " # Calculate the Heston model price\n", - " h_price = european_option.NPV()\n", - "\n", - " return h_price\n", - "\n", - " def predict_american_option_price(self, strike, maturity_date, spot_price, v0=None, theta=None, kappa=None, sigma=None, rho=None):\n", - " # Set the evaluation date\n", - " ql.Settings.instance().evaluationDate = self.calculation_date\n", - "\n", - " # Construct the American Option\n", - " payoff = ql.PlainVanillaPayoff(ql.Option.Call, strike)\n", - " exercise = ql.AmericanExercise(self.calculation_date, maturity_date)\n", - " american_option = ql.VanillaOption(payoff, exercise)\n", - "\n", - " # Yield term structures for risk-free rate and dividend\n", - " riskFreeTS = ql.YieldTermStructureHandle(ql.FlatForward(self.calculation_date, self.risk_free_rate, ql.Actual365Fixed()))\n", - " dividendTS = 
ql.YieldTermStructureHandle(ql.FlatForward(self.calculation_date, self.dividend_rate, ql.Actual365Fixed()))\n", - "\n", - " # Initial stock price\n", - " initialValue = ql.QuoteHandle(ql.SimpleQuote(spot_price))\n", - "\n", - " # Heston process parameters\n", - " heston_process = ql.HestonProcess(riskFreeTS, dividendTS, initialValue, v0, kappa, theta, sigma, rho)\n", - " heston_model = ql.HestonModel(heston_process)\n", - "\n", - "\n", - " payoff = ql.PlainVanillaPayoff(ql.Option.Call, strike)\n", - " exercise = ql.AmericanExercise(self.calculation_date, maturity_date)\n", - " american_option = ql.VanillaOption(payoff, exercise)\n", - " heston_fd_engine = ql.FdHestonVanillaEngine(heston_model)\n", - " american_option.setPricingEngine(heston_fd_engine)\n", - " option_price = american_option.NPV()\n", - "\n", - " return option_price\n", - "\n", - " def objective_function(self, params, market_data, spot_price, dividend_rate, risk_free_rate):\n", - " v0, theta, kappa, sigma, rho = params\n", - "\n", - " # Sum of squared differences between market prices and model prices\n", - " error = 0.0\n", - " for i, row in market_data.iterrows():\n", - " model_price = self.predict_option_price(row['strike'], row['maturity_date'], spot_price, \n", - " v0, theta, kappa, sigma, rho)\n", - " error += (model_price - row['option_price']) ** 2\n", - " \n", - " return error\n", - "\n", - " def calibrate_model(self, ticker, expiration_date_str):\n", - " # Get the option market data dynamically from Yahoo Finance\n", - " market_data = get_market_data(ticker, expiration_date_str)\n", - "\n", - " # Initial guesses for Heston parameters\n", - " initial_params = [0.04, 0.04, 0.1, 0.1, -0.75]\n", - "\n", - " # Bounds for the parameters to ensure realistic values\n", - " bounds = [(0.0001, 1.0), # v0\n", - " (0.0001, 1.0), # theta\n", - " (0.001, 2.0), # kappa\n", - " (0.001, 1.0), # sigma\n", - " (-0.75, 0.0)] # rho\n", - "\n", - " # Optimize the parameters to minimize the error between model and market prices\n", - " result = minimize(self.objective_function, initial_params, args=(market_data, self.spot_price, self.dividend_rate, self.risk_free_rate),\n", - " bounds=bounds, method='L-BFGS-B')\n", - "\n", - " # Optimized Heston parameters\n", - " v0_opt, theta_opt, kappa_opt, sigma_opt, rho_opt = result.x\n", - "\n", - " return v0_opt, theta_opt, kappa_opt, sigma_opt, rho_opt\n" - ] - }, - { - "cell_type": "markdown", - "id": "a941aa32", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Model Calibration\n", - "* The calibration process aims to optimize the Heston model parameters (v0, theta, kappa, sigma, rho) by minimizing the difference between model-predicted option prices and observed market prices.\n", - "* In this implementation, the model is calibrated to current market data, specifically using option prices from the selected ticker and expiration date.\n", - "\n", - "Let's specify `calculation_date` and `strike_price` as input parameters for the model to verify its functionality and confirm it operates as expected." 
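The next diff chunk runs the calibration itself. As a follow-up diagnostic (not part of the original notebook), a sketch like the one below could be used to check fit quality once that cell has produced `v0_opt, theta_opt, kappa_opt, sigma_opt, rho_opt`. It assumes the `hm` model instance, the `market_data` DataFrame, and `option_params` defined in the surrounding cells; the RMSE and Feller-condition checks are standard diagnostics added here for illustration.

```python
# Optional post-calibration sanity checks (illustrative; not in the notebook).
# Uses objects defined elsewhere in the notebook: hm, market_data, option_params.
import numpy as np

def calibration_fit_report(hm, market_data, spot_price, v0, theta, kappa, sigma, rho):
    """Compare calibrated Heston prices against observed option prices."""
    model_prices = np.array([
        hm.predict_option_price(row["strike"], row["maturity_date"], spot_price,
                                v0, theta, kappa, sigma, rho)
        for _, row in market_data.iterrows()
    ])
    market_prices = market_data["option_price"].to_numpy()
    rmse = float(np.sqrt(np.mean((model_prices - market_prices) ** 2)))

    # Feller condition 2*kappa*theta >= sigma^2 keeps the variance process away from zero;
    # a violation is not an error, but it is worth flagging during calibration review.
    feller_ok = 2.0 * kappa * theta >= sigma**2
    return {"rmse": rmse, "feller_condition_satisfied": feller_ok}

# Example usage once the calibration cell below has run:
# report = calibration_fit_report(hm, market_data, option_params["spot_price"],
#                                 v0_opt, theta_opt, kappa_opt, sigma_opt, rho_opt)
# print(report)
```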
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1d61dfca", - "metadata": {}, - "outputs": [], - "source": [ - "calculation_date = ql.Date(26, 11, 2024)\n", - "# Convert expiration date string to QuantLib.Date\n", - "expiry_date_parts = list(map(int, expiration_date.split('-')))\n", - "maturity_date = ql.Date(expiry_date_parts[2], expiry_date_parts[1], expiry_date_parts[0])\n", - "strike_price = 460.0\n", - "\n", - "hm = HestonModel(\n", - " ticker=ticker,\n", - " expiration_date_str= expiration_date,\n", - " calculation_date= calculation_date,\n", - " spot_price= option_params['spot_price'],\n", - " dividend_rate = option_params['dividend_rate'],\n", - " risk_free_rate = option_params['risk_free_rate']\n", - ")\n", - "\n", - "# Let's calibrate model\n", - "v0_opt, theta_opt, kappa_opt, sigma_opt, rho_opt = hm.calibrate_model(ticker, expiration_date)\n", - "print(f\"Optimized Heston parameters: v0={v0_opt}, theta={theta_opt}, kappa={kappa_opt}, sigma={sigma_opt}, rho={rho_opt}\")\n", - "\n", - "\n", - "# option price\n", - "h_price = hm.predict_option_price(strike_price, maturity_date, option_params['spot_price'], v0_opt, theta_opt, kappa_opt, sigma_opt, rho_opt)\n", - "print(\"The Heston model price for the option is:\", h_price)" - ] - }, - { - "cell_type": "markdown", - "id": "75313272", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Model Evaluation" - ] - }, - { - "cell_type": "markdown", - "id": "2e6471ef", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Benchmark Testing\n", - "The benchmark testing framework provides a robust way to validate the Heston model implementation and understand the relationships between European and American option prices under stochastic volatility conditions.\n", - "Let's compares European and American option prices using the Heston model." - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "810cf887", - "metadata": {}, - "outputs": [], - "source": [ - "@vm.test(\"my_custom_tests.BenchmarkTest\")\n", - "def benchmark_test(hm_model, strikes, maturity_date, spot_price, v0=None, theta=None, kappa=None, sigma=None, rho=None):\n", - " \"\"\"\n", - " Compares European and American option prices using the Heston model.\n", - "\n", - " This test evaluates the price differences between European and American options\n", - " across multiple strike prices while keeping other parameters constant. The comparison\n", - " helps understand the early exercise premium of American options over their European\n", - " counterparts under stochastic volatility conditions.\n", - "\n", - " Args:\n", - " hm_model: HestonModel instance for option pricing calculations\n", - " strikes (list[float]): List of strike prices to test\n", - " maturity_date (ql.Date): Option expiration date in QuantLib format\n", - " spot_price (float): Current price of the underlying asset\n", - " v0 (float, optional): Initial variance. Defaults to None.\n", - " theta (float, optional): Long-term variance. Defaults to None.\n", - " kappa (float, optional): Mean reversion rate. Defaults to None.\n", - " sigma (float, optional): Volatility of variance. Defaults to None.\n", - " rho (float, optional): Correlation between asset and variance. 
Defaults to None.\n", - "\n", - " Returns:\n", - " dict: Contains a DataFrame with the following columns:\n", - " - Strike: Strike prices tested\n", - " - Maturity date: Expiration date for all options\n", - " - Spot price: Current underlying price\n", - " - european model price: Prices for European options\n", - " - american model price: Prices for American options\n", - "\"\"\"\n", - " american_derived_prices = []\n", - " european_derived_prices = []\n", - " for K in strikes:\n", - " european_derived_prices.append(hm_model.predict_option_price(K, maturity_date, spot_price, v0, theta, kappa, sigma, rho))\n", - " american_derived_prices.append(hm_model.predict_american_option_price(K, maturity_date, spot_price, v0, theta, kappa, sigma, rho))\n", - "\n", - " data = {\n", - " \"Strike\": strikes,\n", - " \"Maturity date\": [maturity_date] * len(strikes),\n", - " \"Spot price\": [spot_price] * len(strikes),\n", - " \"european model price\": european_derived_prices,\n", - " \"american model price\": american_derived_prices,\n", - "\n", - " }\n", - " df1 = pd.DataFrame(data)\n", - " return {\"strikes variation benchmarking\": df1}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3fdd6705", - "metadata": {}, - "outputs": [], - "source": [ - "result = run_test(\n", - " \"my_custom_tests.BenchmarkTest\",\n", - " params={\n", - " \"hm_model\": hm,\n", - " \"strikes\": [400, 425, 460, 495, 520],\n", - " \"maturity_date\": maturity_date,\n", - " \"spot_price\": option_params['spot_price'],\n", - " \"v0\":v0_opt,\n", - " \"theta\": theta_opt,\n", - " \"kappa\":kappa_opt ,\n", - " \"sigma\": sigma_opt,\n", - " \"rho\":rho_opt\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "id": "e359b503", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Sensitivity Testing\n", - "The sensitivity testing framework provides a systematic approach to understanding how the Heston model responds to parameter changes, which is crucial for both model validation and practical application in trading and risk management." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "51922313", - "metadata": {}, - "outputs": [], - "source": [ - "@vm.test(\"my_test_provider.Sensitivity\")\n", - "def SensitivityTest(\n", - " model,\n", - " strike_price,\n", - " maturity_date,\n", - " spot_price,\n", - " v0_opt,\n", - " theta_opt,\n", - " kappa_opt,\n", - " sigma_opt,\n", - " rho_opt,\n", - "):\n", - " \"\"\"\n", - " Evaluates the sensitivity of American option prices to changes in model parameters.\n", - "\n", - " This test calculates option prices using the Heston model with optimized parameters.\n", - " It's designed to analyze how changes in various model inputs affect the option price,\n", - " which is crucial for understanding model behavior and risk management.\n", - "\n", - " Args:\n", - " model (HestonModel): Initialized Heston model instance wrapped in ValidMind model object\n", - " strike_price (float): Strike price of the option\n", - " maturity_date (ql.Date): Expiration date of the option in QuantLib format\n", - " spot_price (float): Current price of the underlying asset\n", - " v0_opt (float): Optimized initial variance parameter\n", - " theta_opt (float): Optimized long-term variance parameter\n", - " kappa_opt (float): Optimized mean reversion rate parameter\n", - " sigma_opt (float): Optimized volatility of variance parameter\n", - " rho_opt (float): Optimized correlation parameter between asset price and variance\n", - " \"\"\"\n", - " price = model.model.predict_american_option_price(\n", - " strike_price,\n", - " maturity_date,\n", - " spot_price,\n", - " v0_opt,\n", - " theta_opt,\n", - " kappa_opt,\n", - " sigma_opt,\n", - " rho_opt,\n", - " )\n", - "\n", - " return price\n" - ] - }, - { - "cell_type": "markdown", - "id": "408a05ef", - "metadata": {}, - "source": [ - "##### Common plot function" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "104ca6dd", - "metadata": {}, - "outputs": [], - "source": [ - "def plot_results(df, params: dict = None):\n", - " fig2 = plt.figure(figsize=(10, 6))\n", - " plt.plot(df[params[\"x\"]], df[params[\"y\"]], label=params[\"label\"])\n", - " plt.xlabel(params[\"xlabel\"])\n", - " plt.ylabel(params[\"ylabel\"])\n", - " \n", - " plt.title(params[\"title\"])\n", - " plt.legend()\n", - " plt.grid(True)\n", - " plt.show() # close the plot to avoid displaying it" - ] - }, - { - "cell_type": "markdown", - "id": "ca72b9e5", - "metadata": {}, - "source": [ - "Let's create ValidMind model object" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ae7093fa", - "metadata": {}, - "outputs": [], - "source": [ - "hm_model = vm.init_model(model=hm, input_id=\"HestonModel\")" - ] - }, - { - "cell_type": "markdown", - "id": "b2141640", - "metadata": {}, - "source": [ - "##### Strike sensitivity\n", - "Let's analyzes how option prices change as the strike price varies. We create a range of strike prices around the current strike (460) and observe the impact on option prices while keeping all other parameters constant." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ea7f1cbe", - "metadata": {}, - "outputs": [], - "source": [ - "result = run_test(\n", - " \"my_test_provider.Sensitivity:ToStrike\",\n", - " inputs = {\n", - " \"model\": hm_model\n", - " },\n", - " param_grid={\n", - " \"strike_price\": list(np.linspace(460-50, 460+50, 10)),\n", - " \"maturity_date\": [maturity_date],\n", - " \"spot_price\": [option_params[\"spot_price\"]],\n", - " \"v0_opt\": [v0_opt],\n", - " \"theta_opt\": [theta_opt],\n", - " \"kappa_opt\": [kappa_opt],\n", - " \"sigma_opt\": [sigma_opt],\n", - " \"rho_opt\":[rho_opt]\n", - " },\n", - ")\n", - "result.log()\n", - "# Visualize how option prices change with different strike prices\n", - "plot_results(\n", - " pd.DataFrame(result.tables[0].data),\n", - " params={\n", - " \"x\": \"strike_price\",\n", - " \"y\":\"Value\",\n", - " \"label\":\"Strike price\",\n", - " \"xlabel\":\"Strike price\",\n", - " \"ylabel\":\"option price\",\n", - " \"title\":\"Heston option - Strike price Sensitivity\",\n", - " }\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "be143012", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Stress Testing\n", - "This stress testing framework provides a comprehensive view of how the Heston model behaves under different market conditions and helps identify potential risks in option pricing." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f2f01a40", - "metadata": {}, - "outputs": [], - "source": [ - "@vm.test(\"my_custom_tests.Stressing\")\n", - "def StressTest(\n", - " model,\n", - " strike_price,\n", - " maturity_date,\n", - " spot_price,\n", - " v0_opt,\n", - " theta_opt,\n", - " kappa_opt,\n", - " sigma_opt,\n", - " rho_opt,\n", - "):\n", - " \"\"\"\n", - " Performs stress testing on Heston model parameters to evaluate option price sensitivity.\n", - "\n", - " This test evaluates how the American option price responds to stressed market conditions\n", - " by varying key model parameters. It's designed to:\n", - " 1. Identify potential model vulnerabilities\n", - " 2. Understand price behavior under extreme scenarios\n", - " 3. Support risk management decisions\n", - " 4. 
Validate model stability across parameter ranges\n", - "\n", - " Args:\n", - " model (HestonModel): Initialized Heston model instance wrapped in ValidMind model object\n", - " strike_price (float): Option strike price\n", - " maturity_date (ql.Date): Option expiration date in QuantLib format\n", - " spot_price (float): Current price of the underlying asset\n", - " v0_opt (float): Initial variance parameter under stress testing\n", - " theta_opt (float): Long-term variance parameter under stress testing\n", - " kappa_opt (float): Mean reversion rate parameter under stress testing\n", - " sigma_opt (float): Volatility of variance parameter under stress testing\n", - " rho_opt (float): Correlation parameter under stress testing\n", - " \"\"\"\n", - " price = model.model.predict_american_option_price(\n", - " strike_price,\n", - " maturity_date,\n", - " spot_price,\n", - " v0_opt,\n", - " theta_opt,\n", - " kappa_opt,\n", - " sigma_opt,\n", - " rho_opt,\n", - " )\n", - "\n", - " return price\n" - ] - }, - { - "cell_type": "markdown", - "id": "31fcbe9c", - "metadata": {}, - "source": [ - "##### Rho (correlation) and Theta (long term vol) stress test\n", - "Next, let's evaluates the sensitivity of a model's output to changes in the correlation parameter (rho) and the long-term variance parameter (theta) within a stochastic volatility framework." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6119b5d9", - "metadata": {}, - "outputs": [], - "source": [ - "result = run_test(\n", - " \"my_custom_tests.Stressing:TheRhoAndThetaParameters\",\n", - " inputs = {\n", - " \"model\": hm_model,\n", - " },\n", - " param_grid={\n", - " \"strike_price\": [460],\n", - " \"maturity_date\": [maturity_date],\n", - " \"spot_price\": [option_params[\"spot_price\"]],\n", - " \"v0_opt\": [v0_opt],\n", - " \"theta_opt\": list(np.linspace(0.1, theta_opt+0.4, 5)),\n", - " \"kappa_opt\": [kappa_opt],\n", - " \"sigma_opt\": [sigma_opt],\n", - " \"rho_opt\":list(np.linspace(rho_opt-0.2, rho_opt+0.2, 5))\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "id": "be39cb3a", - "metadata": {}, - "source": [ - "##### Sigma stress test\n", - "Let's evaluates the sensitivity of a model's output to changes in the volatility parameter, sigma. This test is crucial for understanding how variations in market volatility impact the model's valuation of financial instruments, particularly options." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0dc189b7", - "metadata": {}, - "outputs": [], - "source": [ - "result = run_test(\n", - " \"my_custom_tests.Stressing:TheSigmaParameter\",\n", - " inputs = {\n", - " \"model\": hm_model,\n", - " },\n", - " param_grid={\n", - " \"strike_price\": [460],\n", - " \"maturity_date\": [maturity_date],\n", - " \"spot_price\": [option_params[\"spot_price\"]],\n", - " \"v0_opt\": [v0_opt],\n", - " \"theta_opt\": [theta_opt],\n", - " \"kappa_opt\": [kappa_opt],\n", - " \"sigma_opt\": list(np.linspace(0.1, sigma_opt+0.6, 5)),\n", - " \"rho_opt\": [rho_opt]\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "id": "173a5294", - "metadata": {}, - "source": [ - "##### Stress kappa\n", - "Let's evaluates the sensitivity of a model's output to changes in the kappa parameter, which is a mean reversion rate in stochastic volatility models." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dae9714f", - "metadata": {}, - "outputs": [], - "source": [ - "result = run_test(\n", - " \"my_custom_tests.Stressing:TheKappaParameter\",\n", - " inputs = {\n", - " \"model\": hm_model,\n", - " },\n", - " param_grid={\n", - " \"strike_price\": [460],\n", - " \"maturity_date\": [maturity_date],\n", - " \"spot_price\": [option_params[\"spot_price\"]],\n", - " \"v0_opt\": [v0_opt],\n", - " \"theta_opt\": [theta_opt],\n", - " \"kappa_opt\": list(np.linspace(kappa_opt, kappa_opt+0.2, 5)),\n", - " \"sigma_opt\": [sigma_opt],\n", - " \"rho_opt\": [rho_opt]\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "id": "b4d1d968", - "metadata": {}, - "source": [ - "##### Stress theta\n", - "Let's evaluates the sensitivity of a model's output to changes in the parameter theta, which represents the long-term variance in a stochastic volatility model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e68df3db", - "metadata": {}, - "outputs": [], - "source": [ - "result = run_test(\n", - " \"my_custom_tests.Stressing:TheThetaParameter\",\n", - " inputs = {\n", - " \"model\": hm_model,\n", - " },\n", - " param_grid={\n", - " \"strike_price\": [460],\n", - " \"maturity_date\": [maturity_date],\n", - " \"spot_price\": [option_params[\"spot_price\"]],\n", - " \"v0_opt\": [v0_opt],\n", - " \"theta_opt\": list(np.linspace(0.1, theta_opt+0.9, 5)),\n", - " \"kappa_opt\": [kappa_opt],\n", - " \"sigma_opt\": [sigma_opt],\n", - " \"rho_opt\": [rho_opt]\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "id": "32e70456", - "metadata": {}, - "source": [ - "##### Stress rho\n", - "Let's evaluates the sensitivity of a model's output to changes in the correlation parameter, rho, within a stochastic volatility (SV) model framework. This test is crucial for understanding how variations in rho, which represents the correlation between the asset price and its volatility, impact the model's valuation output." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b5ca3fc2", - "metadata": {}, - "outputs": [], - "source": [ - "result = run_test(\n", - " \"my_custom_tests.Stressing:TheRhoParameter\",\n", - " inputs = {\n", - " \"model\": hm_model,\n", - " },\n", - " param_grid={\n", - " \"strike_price\": [460],\n", - " \"maturity_date\": [maturity_date],\n", - " \"spot_price\": [option_params[\"spot_price\"]],\n", - " \"v0_opt\": [v0_opt],\n", - " \"theta_opt\": [theta_opt],\n", - " \"kappa_opt\": [kappa_opt],\n", - " \"sigma_opt\": [sigma_opt],\n", - " \"rho_opt\": list(np.linspace(rho_opt-0.2, rho_opt+0.2, 5))\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "id": "892c5347", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Next steps\n", - "\n", - "You can look at the results of this test suite right in the notebook where you ran the code, as you would expect. But there is a better way — use the ValidMind Platform to work with your model documentation.\n", - "\n", - "\n", - "\n", - "### Work with your model documentation\n", - "\n", - "1. From the **Model Inventory** in the ValidMind Platform, go to the model you registered earlier. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/working-with-model-inventory.html))\n", - "\n", - "2. Click and expand the **Model Development** section.\n", - "\n", - "What you see is the full draft of your model documentation in a more easily consumable version. 
From here, you can make qualitative edits to model documentation, view guidelines, collaborate with validators, and submit your model documentation for approval when it's ready. [Learn more ...](https://docs.validmind.ai/guide/model-documentation/working-with-model-documentation.html)\n", - "\n", - "\n", - "\n", - "### Discover more learning resources\n", - "\n", - "We offer many interactive notebooks to help you automate testing, documenting, validating, and more:\n", - "\n", - "- [Run tests & test suites](https://docs.validmind.ai/developer/how-to/testing-overview.html)\n", - "- [Use ValidMind Library features](https://docs.validmind.ai/developer/how-to/feature-overview.html)\n", - "- [Code samples by use case](https://docs.validmind.ai/guide/samples-jupyter-notebooks.html)\n", - "\n", - "Or, visit our [documentation](https://docs.validmind.ai/) to learn more about ValidMind." - ] - }, - { - "cell_type": "markdown", - "id": "copyright-de5d1e182b09403abddabc2850f2dd05", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "***\n", - "\n", - "Copyright © 2023-2026 ValidMind Inc. All rights reserved.
\n", - "Refer to [LICENSE](https://github.com/validmind/validmind-library/blob/main/LICENSE) for details.
\n", - "SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "validmind-1QuffXMV-py3.10", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.14" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "cells": [ + { + "cell_type": "markdown", + "id": "1e2a4689", + "metadata": {}, + "source": [ + "# Quickstart for Heston option pricing model using QuantLib\n", + "\n", + "Welcome! Let's get you started with the basic process of documenting models with ValidMind.\n", + "\n", + "The Heston option pricing model is a popular stochastic volatility model used to price options. Developed by Steven Heston in 1993, the model assumes that the asset's volatility follows a mean-reverting square-root process, allowing it to capture the empirical observation of volatility \"clustering\" in financial markets. This model is particularly useful for assets where volatility is not constant, making it a favored approach in quantitative finance for pricing complex derivatives.\n", + "\n", + "Here’s an overview of the Heston model as implemented in QuantLib, a powerful library for quantitative finance:\n", + "\n", + "\n", + "\n", + "### Model Assumptions and Characteristics\n", + "1. **Stochastic Volatility**: The volatility is modeled as a stochastic process, following a mean-reverting square-root process (Cox-Ingersoll-Ross process).\n", + "2. **Correlated Asset and Volatility Processes**: The asset price and volatility are assumed to be correlated, allowing the model to capture the \"smile\" effect observed in implied volatilities.\n", + "3. **Risk-Neutral Dynamics**: The Heston model is typically calibrated under a risk-neutral measure, which allows for direct application to pricing.\n", + "\n", + "\n", + "\n", + "### Heston Model Parameters\n", + "The model is governed by a set of key parameters:\n", + "- **S0**: Initial stock price\n", + "- **v0**: Initial variance of the asset price\n", + "- **kappa**: Speed of mean reversion of the variance\n", + "- **theta**: Long-term mean level of variance\n", + "- **sigma**: Volatility of volatility (vol of vol)\n", + "- **rho**: Correlation between the asset price and variance processes\n", + "\n", + "The dynamics of the asset price \\( S \\) and the variance \\( v \\) under the Heston model are given by:\n", + "\n", + "$$\n", + "dS_t = r S_t \\, dt + \\sqrt{v_t} S_t \\, dW^S_t\n", + "$$\n", + "\n", + "$$\n", + "dv_t = \\kappa (\\theta - v_t) \\, dt + \\sigma \\sqrt{v_t} \\, dW^v_t\n", + "$$\n", + "\n", + "where \\( $dW^S$ \\) and \\( $dW^v$ \\) are Wiener processes with correlation \\( $\\rho$ \\).\n", + "\n", + "\n", + "\n", + "### Advantages and Limitations\n", + "- **Advantages**:\n", + " - Ability to capture volatility smiles and skews.\n", + " - More realistic pricing for options on assets with stochastic volatility.\n", + "- **Limitations**:\n", + " - Calibration can be complex due to the number of parameters.\n", + " - Computationally intensive compared to simpler models like Black-Scholes.\n", + "\n", + "This setup provides a robust framework for pricing and analyzing options with stochastic volatility dynamics. 
QuantLib’s implementation makes it easy to experiment with different parameter configurations and observe their effects on pricing.\n", + "\n", + "You will learn how to initialize the ValidMind Library, develop a option pricing model, and then write custom tests that can be used for sensitivity and stress testing to quickly generate documentation about model." + ] + }, + { + "cell_type": "markdown", + "id": "69ec219a", + "metadata": {}, + "source": [ + "::: {.content-hidden when-format=\"html\"}\n", + "## Contents \n", + " - [Model Assumptions and Characteristics](#toc1_1__) \n", + " - [Heston Model Parameters](#toc1_2__) \n", + " - [Advantages and Limitations](#toc1_3__) \n", + "- [About ValidMind](#toc2__) \n", + " - [Before you begin](#toc2_1__) \n", + " - [New to ValidMind?](#toc2_2__) \n", + " - [Key concepts](#toc2_3__) \n", + "- [Setting up](#toc3__) \n", + " - [Install the ValidMind Library](#toc3_1__) \n", + " - [Initialize the ValidMind Library](#toc3_2__) \n", + " - [Register sample model](#toc3_2_1__) \n", + " - [Apply documentation template](#toc3_2_2__) \n", + " - [Get your code snippet](#toc3_2_3__) \n", + " - [Initialize the Python environment](#toc3_3__) \n", + " - [Preview the documentation template](#toc3_4__) \n", + "- [Data Preparation](#toc4__) \n", + " - [Helper functions](#toc4_1_1__) \n", + " - [Market Data Quality and Availability](#toc4_2__) \n", + " - [Initialize the ValidMind datasets](#toc4_3__) \n", + " - [Data Quality](#toc4_4__) \n", + " - [Isolation Forest Outliers Test](#toc4_4_1__) \n", + " - [Model parameters](#toc4_4_2__) \n", + "- [Model development - Heston Option price](#toc5__) \n", + " - [Model Calibration](#toc5_1__) \n", + " - [Model Evaluation](#toc5_2__) \n", + " - [Benchmark Testing](#toc5_2_1__) \n", + " - [Sensitivity Testing](#toc5_2_2__) \n", + " - [Stress Testing](#toc5_2_3__) \n", + "- [Next steps](#toc6__) \n", + " - [Work with your model documentation](#toc6_1__) \n", + " - [Discover more learning resources](#toc6_2__) \n", + "\n", + ":::\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "id": "b9fb5d17", + "metadata": {}, + "source": [ + "\n", + "\n", + "## About ValidMind\n", + "\n", + "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models.\n", + "\n", + "You use the ValidMind Library to automate documentation and validation tests, and then use the ValidMind Platform to collaborate on model documentation. Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators.\n", + "\n", + "\n", + "\n", + "### Before you begin\n", + "\n", + "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language. \n", + "\n", + "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html).\n", + "\n", + "\n", + "\n", + "### New to ValidMind?\n", + "\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. 
There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", + "\n", + "
For access to all features available in this notebook, you'll need access to a ValidMind account.\n", + "

\n", + "Register with ValidMind
\n", + "\n", + "\n", + "\n", + "### Key concepts\n", + "\n", + "**Model documentation**: A structured and detailed record pertaining to a model, encompassing key components such as its underlying assumptions, methodologies, data sources, inputs, performance metrics, evaluations, limitations, and intended uses. It serves to ensure transparency, adherence to regulatory requirements, and a clear understanding of potential risks associated with the model’s application.\n", + "\n", + "**Documentation template**: Functions as a test suite and lays out the structure of model documentation, segmented into various sections and sub-sections. Documentation templates define the structure of your model documentation, specifying the tests that should be run, and how the results should be displayed.\n", + "\n", + "**Tests**: A function contained in the ValidMind Library, designed to run a specific quantitative test on the dataset or model. Tests are the building blocks of ValidMind, used to evaluate and document models and datasets, and can be run individually or as part of a suite defined by your model documentation template.\n", + "\n", + "**Custom tests**: Custom tests are functions that you define to evaluate your model or dataset. These functions can be registered via the ValidMind Library to be used with the ValidMind Platform.\n", + "\n", + "**Inputs**: Objects to be evaluated and documented in the ValidMind Library. They can be any of the following:\n", + "\n", + " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", + " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", + " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", + " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html) for more information.\n", + "\n", + "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", + "\n", + "**Outputs**: Custom tests can return elements like tables or plots. Tables may be a list of dictionaries (each representing a row) or a pandas DataFrame. Plots may be matplotlib or plotly figures.\n", + "\n", + "**Test suites**: Collections of tests designed to run together to automate and generate model documentation end-to-end for specific use-cases.\n", + "\n", + "Example: the [`classifier_full_suite`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html#ClassifierFullSuite) test suite runs tests from the [`tabular_dataset`](https://docs.validmind.ai/validmind/validmind/test_suites/tabular_datasets.html) and [`classifier`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html) test suites to fully document the data and model sections for binary classification model use-cases." 
+ ] + }, + { + "cell_type": "markdown", + "id": "f2dccf35", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Setting up" + ] + }, + { + "cell_type": "markdown", + "id": "5a5ce085", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Install the ValidMind Library\n", + "\n", + "To install the library:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "409352bf", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q validmind" + ] + }, + { + "cell_type": "markdown", + "id": "65e870b2", + "metadata": {}, + "source": [ + "To install the QuantLib library:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a34debf", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q QuantLib" + ] + }, + { + "cell_type": "markdown", + "id": "fb30ae07", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Initialize the ValidMind Library" + ] + }, + { + "cell_type": "markdown", + "id": "c6f87017", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Register sample model\n", + "\n", + "Let's first register a sample model for use with this notebook:\n", + "\n", + "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", + "\n", + "2. In the left sidebar, navigate to **Inventory** and click **+ Register Model**.\n", + "\n", + "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", + "\n", + "4. Select your own name under the **MODEL OWNER** drop-down.\n", + "\n", + "5. Click **Register Model** to add the model to your inventory." + ] + }, + { + "cell_type": "markdown", + "id": "cbb2e2c9", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Apply documentation template\n", + "\n", + "Once you've registered your model, let's select a documentation template. A template predefines sections for your model documentation and provides a general outline to follow, making the documentation process much easier.\n", + "\n", + "1. In the left sidebar that appears for your model, click **Documents** and select **Documentation**.\n", + "\n", + "2. Under **TEMPLATE**, select `Capital Markets`.\n", + "\n", + "3. Click **Use Template** to apply the template." + ] + }, + { + "cell_type": "markdown", + "id": "41c4edca", + "metadata": {}, + "source": [ + "
Can't select this template?\n", + "

\n", + "Your organization administrators may need to add it to your template library:\n", + "\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "2012eb82", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Get your code snippet\n", + "\n", + "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", + "\n", + "1. On the left sidebar that appears for your model, select **Getting Started** and click **Copy snippet to clipboard**.\n", + "2. Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0cd3f67e", + "metadata": {}, + "outputs": [], + "source": [ + "# Load your model identifier credentials from an `.env` file\n", + "\n", + "%load_ext dotenv\n", + "%dotenv .env\n", + "\n", + "# Or replace with your code snippet\n", + "\n", + "import validmind as vm\n", + "\n", + "vm.init(\n", + " # api_host=\"...\",\n", + " # api_key=\"...\",\n", + " # api_secret=\"...\",\n", + " # model=\"...\",\n", + " # document=\"documentation\",\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "id": "6d944cc9", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Initialize the Python environment\n", + "\n", + "Next, let's import the necessary libraries and set up your Python environment for data analysis:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8cf2746", + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "from scipy.optimize import minimize\n", + "import yfinance as yf\n", + "import QuantLib as ql\n", + "from validmind.tests import run_test" + ] + }, + { + "cell_type": "markdown", + "id": "bc431ee0", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Preview the documentation template\n", + "\n", + "Let's verify that you have connected the ValidMind Library to the ValidMind Platform and that the appropriate *template* is selected for your model.\n", + "\n", + "You will upload documentation and test results unique to your model based on this template later on. For now, **take a look at the default structure that the template provides with [the `vm.preview_template()` function](https://docs.validmind.ai/validmind/validmind.html#preview_template)** from the ValidMind library and note the empty sections:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7e844028", + "metadata": {}, + "outputs": [], + "source": [ + "vm.preview_template()" + ] + }, + { + "cell_type": "markdown", + "id": "0c0ee8b9", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Data Preparation" + ] + }, + { + "cell_type": "markdown", + "id": "5a4d2c36", + "metadata": {}, + "source": [ + "### Market Data Sources\n", + "\n", + "\n", + "\n", + "#### Helper functions\n", + "Let's define helper function retrieve to option data from Yahoo Finance." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b96a500f", + "metadata": {}, + "outputs": [], + "source": [ + "def get_market_data(ticker, expiration_date_str):\n", + " \"\"\"\n", + " Fetch option market data from Yahoo Finance for the given ticker and expiration date.\n", + " Returns a list of tuples: (strike, maturity, option_price).\n", + " \"\"\"\n", + " # Create a Ticker object for the specified stock\n", + " stock = yf.Ticker(ticker)\n", + "\n", + " # Get all available expiration dates for options\n", + " option_dates = stock.options\n", + "\n", + " # Check if the requested expiration date is available\n", + " if expiration_date_str not in option_dates:\n", + " raise ValueError(f\"Expiration date {expiration_date_str} not available for {ticker}. Available dates: {option_dates}\")\n", + "\n", + " # Get the option chain for the specified expiration date\n", + " option_chain = stock.option_chain(expiration_date_str)\n", + "\n", + " # Get call options (or you can use puts as well based on your requirement)\n", + " calls = option_chain.calls\n", + "\n", + " # Convert expiration_date_str to QuantLib Date\n", + " expiry_date_parts = list(map(int, expiration_date_str.split('-'))) # Split YYYY-MM-DD\n", + " maturity_date = ql.Date(expiry_date_parts[2], expiry_date_parts[1], expiry_date_parts[0]) # Convert to QuantLib Date\n", + "\n", + " # Create a list to store strike prices, maturity dates, and option prices\n", + " market_data = []\n", + " for index, row in calls.iterrows():\n", + " strike = row['strike']\n", + " option_price = row['lastPrice'] # You can also use 'bid', 'ask', 'mid', etc.\n", + " market_data.append((strike, maturity_date, option_price))\n", + " df = pd.DataFrame(market_data, columns = ['strike', 'maturity_date', 'option_price'])\n", + " return df" + ] + }, + { + "cell_type": "markdown", + "id": "c7769b73", + "metadata": {}, + "source": [ + "Let's define helper function retrieve to stock data from Yahoo Finance. This helper function to calculate spot price, dividend yield, volatility and risk free rate using the underline stock data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc44c448", + "metadata": {}, + "outputs": [], + "source": [ + "def get_option_parameters(ticker):\n", + " # Fetch historical data for the stock\n", + " stock_data = yf.Ticker(ticker)\n", + " \n", + " # Get the current spot price\n", + " spot_price = stock_data.history(period=\"1d\")['Close'].iloc[-1]\n", + " \n", + " # Get dividend yield\n", + " dividend_rate = stock_data.dividends.mean() / spot_price if not stock_data.dividends.empty else 0.0\n", + " \n", + " # Estimate volatility (standard deviation of log returns)\n", + " hist_data = stock_data.history(period=\"1y\")['Close']\n", + " log_returns = np.log(hist_data / hist_data.shift(1)).dropna()\n", + " volatility = np.std(log_returns) * np.sqrt(252) # Annualized volatility\n", + " \n", + " # Assume a risk-free rate from some known data (can be fetched from market data, here we use 0.001)\n", + " risk_free_rate = 0.001\n", + " \n", + " # Return the calculated parameters\n", + " return {\n", + " \"spot_price\": spot_price,\n", + " \"volatility\": volatility,\n", + " \"dividend_rate\": dividend_rate,\n", + " \"risk_free_rate\": risk_free_rate\n", + " }" + ] + }, + { + "cell_type": "markdown", + "id": "c7b739d3", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Market Data Quality and Availability\n", + "Next, let's specify ticker and expiration date to get market data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "50225fde", + "metadata": {}, + "outputs": [], + "source": [ + "ticker = \"MSFT\"\n", + "expiration_date = \"2024-12-13\" # Example expiration date in 'YYYY-MM-DD' form\n", + "\n", + "market_data = get_market_data(ticker=ticker, expiration_date_str=expiration_date)" + ] + }, + { + "cell_type": "markdown", + "id": "c539b95e", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Initialize the ValidMind datasets\n", + "\n", + "Before you can run tests, you must first initialize a ValidMind dataset object using the [`init_dataset`](https://docs.validmind.ai/validmind/validmind.html#init_dataset) function from the ValidMind (`vm`) module." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "113f9c17", + "metadata": {}, + "outputs": [], + "source": [ + "vm_market_data = vm.init_dataset(\n", + " dataset=market_data,\n", + " input_id=\"market_data\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "185beb24", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Data Quality\n", + "Let's check quality of the data using outliers and missing data tests." + ] + }, + { + "cell_type": "markdown", + "id": "7f14464c", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Isolation Forest Outliers Test\n", + "Let's detects anomalies in the dataset using the Isolation Forest algorithm, visualized through scatter plots." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56c919ec", + "metadata": {}, + "outputs": [], + "source": [ + "result = run_test(\n", + " \"validmind.data_validation.IsolationForestOutliers\",\n", + " inputs={\n", + " \"dataset\": vm_market_data,\n", + " },\n", + " title=\"Outliers detection using Isolation Forest\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "e4d0e5ca", + "metadata": {}, + "source": [ + "##### Missing Values Test\n", + "Let's evaluates dataset quality by ensuring the missing value ratio across all features does not exceed a set threshold." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e95c825f", + "metadata": {}, + "outputs": [], + "source": [ + "result = run_test(\n", + " \"validmind.data_validation.MissingValues\",\n", + " inputs={\n", + " \"dataset\": vm_market_data,\n", + " },\n", + " title=\"Missing Values detection\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "829403a3", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Model parameters\n", + "Let's calculate the model parameters using from stock data " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "25936449", + "metadata": {}, + "outputs": [], + "source": [ + "option_params = get_option_parameters(ticker=ticker)" + ] + }, + { + "cell_type": "markdown", + "id": "0a0948b6", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Model development - Heston Option price" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e15b8221", + "metadata": {}, + "outputs": [], + "source": [ + "class HestonModel:\n", + "\n", + " def __init__(self, ticker, expiration_date_str, calculation_date, spot_price, dividend_rate, risk_free_rate):\n", + " self.ticker = ticker\n", + " self.expiration_date_str = expiration_date_str,\n", + " self.calculation_date = calculation_date\n", + " self.spot_price = spot_price\n", + " self.dividend_rate = dividend_rate\n", + " self.risk_free_rate = risk_free_rate\n", + " \n", + " def predict_option_price(self, strike, maturity_date, spot_price, v0=None, theta=None, kappa=None, sigma=None, rho=None):\n", + " # Set the evaluation date\n", + " ql.Settings.instance().evaluationDate = self.calculation_date\n", + "\n", + " # Construct the European Option\n", + " payoff = ql.PlainVanillaPayoff(ql.Option.Call, strike)\n", + " exercise = ql.EuropeanExercise(maturity_date)\n", + " european_option = ql.VanillaOption(payoff, exercise)\n", + "\n", + " # Yield term structures for risk-free rate and dividend\n", + " riskFreeTS = ql.YieldTermStructureHandle(ql.FlatForward(calculation_date, self.risk_free_rate, ql.Actual365Fixed()))\n", + " dividendTS = ql.YieldTermStructureHandle(ql.FlatForward(calculation_date, self.dividend_rate, ql.Actual365Fixed()))\n", + "\n", + " # Initial stock price\n", + " initialValue = ql.QuoteHandle(ql.SimpleQuote(spot_price))\n", + "\n", + " # Heston process parameters\n", + " heston_process = ql.HestonProcess(riskFreeTS, dividendTS, initialValue, v0, kappa, theta, sigma, rho)\n", + " hestonModel = ql.HestonModel(heston_process)\n", + "\n", + " # Use the Heston analytic engine\n", + " engine = ql.AnalyticHestonEngine(hestonModel)\n", + " european_option.setPricingEngine(engine)\n", + "\n", + " # Calculate the Heston model price\n", + " h_price = european_option.NPV()\n", + "\n", + " return h_price\n", + "\n", + " def predict_american_option_price(self, strike, maturity_date, spot_price, v0=None, theta=None, kappa=None, sigma=None, rho=None):\n", + " # Set the evaluation date\n", + " ql.Settings.instance().evaluationDate = self.calculation_date\n", + "\n", + " # Construct the American Option\n", + " payoff = ql.PlainVanillaPayoff(ql.Option.Call, strike)\n", + " exercise = ql.AmericanExercise(self.calculation_date, maturity_date)\n", + " american_option = ql.VanillaOption(payoff, exercise)\n", + "\n", + " # Yield term structures for risk-free rate and dividend\n", + " riskFreeTS = ql.YieldTermStructureHandle(ql.FlatForward(self.calculation_date, self.risk_free_rate, ql.Actual365Fixed()))\n", + " dividendTS = 
ql.YieldTermStructureHandle(ql.FlatForward(self.calculation_date, self.dividend_rate, ql.Actual365Fixed()))\n", + "\n", + "        # Initial stock price\n", + "        initialValue = ql.QuoteHandle(ql.SimpleQuote(spot_price))\n", + "\n", + "        # Heston process parameters\n", + "        heston_process = ql.HestonProcess(riskFreeTS, dividendTS, initialValue, v0, kappa, theta, sigma, rho)\n", + "        heston_model = ql.HestonModel(heston_process)\n", + "\n", + "        # Price the American option with the finite-difference Heston engine\n", + "        heston_fd_engine = ql.FdHestonVanillaEngine(heston_model)\n", + "        american_option.setPricingEngine(heston_fd_engine)\n", + "        option_price = american_option.NPV()\n", + "\n", + "        return option_price\n", + "\n", + "    def objective_function(self, params, market_data, spot_price, dividend_rate, risk_free_rate):\n", + "        v0, theta, kappa, sigma, rho = params\n", + "\n", + "        # Sum of squared differences between market prices and model prices\n", + "        error = 0.0\n", + "        for i, row in market_data.iterrows():\n", + "            model_price = self.predict_option_price(row['strike'], row['maturity_date'], spot_price, \n", + "                                                    v0, theta, kappa, sigma, rho)\n", + "            error += (model_price - row['option_price']) ** 2\n", + "        \n", + "        return error\n", + "\n", + "    def calibrate_model(self, ticker, expiration_date_str):\n", + "        # Get the option market data dynamically from Yahoo Finance\n", + "        market_data = get_market_data(ticker, expiration_date_str)\n", + "\n", + "        # Initial guesses for Heston parameters\n", + "        initial_params = [0.04, 0.04, 0.1, 0.1, -0.75]\n", + "\n", + "        # Bounds for the parameters to ensure realistic values\n", + "        bounds = [(0.0001, 1.0),  # v0\n", + "                  (0.0001, 1.0),  # theta\n", + "                  (0.001, 2.0),   # kappa\n", + "                  (0.001, 1.0),   # sigma\n", + "                  (-0.75, 0.0)]   # rho\n", + "\n", + "        # Optimize the parameters to minimize the error between model and market prices\n", + "        result = minimize(self.objective_function, initial_params, args=(market_data, self.spot_price, self.dividend_rate, self.risk_free_rate),\n", + "                          bounds=bounds, method='L-BFGS-B')\n", + "\n", + "        # Optimized Heston parameters\n", + "        v0_opt, theta_opt, kappa_opt, sigma_opt, rho_opt = result.x\n", + "\n", + "        return v0_opt, theta_opt, kappa_opt, sigma_opt, rho_opt\n" + ] + }, + { + "cell_type": "markdown", + "id": "a941aa32", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Model Calibration\n", + "* The calibration process aims to optimize the Heston model parameters (v0, theta, kappa, sigma, rho) by minimizing the difference between model-predicted option prices and observed market prices.\n", + "* In this implementation, the model is calibrated to current market data, specifically using option prices from the selected ticker and expiration date.\n", + "\n", + "Let's specify `calculation_date` and `strike_price` as input parameters for the model to verify its functionality and confirm it operates as expected.\n",
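+ "\n", + "Before running it, note that `calibrate_model` is solving the following least-squares problem (a restatement of `objective_function` above, summing over the quotes returned by `get_market_data` and subject to the bounds listed in `calibrate_model`):\n", + "\n", + "$$\\min_{v_0, \\theta, \\kappa, \\sigma, \\rho} \\sum_i \\left( C_{\\text{Heston}}(K_i, T_i; v_0, \\theta, \\kappa, \\sigma, \\rho) - C_{\\text{market},i} \\right)^2$$"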
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1d61dfca", + "metadata": {}, + "outputs": [], + "source": [ + "calculation_date = ql.Date(26, 11, 2024)\n", + "# Convert expiration date string to QuantLib.Date\n", + "expiry_date_parts = list(map(int, expiration_date.split('-')))\n", + "maturity_date = ql.Date(expiry_date_parts[2], expiry_date_parts[1], expiry_date_parts[0])\n", + "strike_price = 460.0\n", + "\n", + "hm = HestonModel(\n", + " ticker=ticker,\n", + " expiration_date_str= expiration_date,\n", + " calculation_date= calculation_date,\n", + " spot_price= option_params['spot_price'],\n", + " dividend_rate = option_params['dividend_rate'],\n", + " risk_free_rate = option_params['risk_free_rate']\n", + ")\n", + "\n", + "# Let's calibrate model\n", + "v0_opt, theta_opt, kappa_opt, sigma_opt, rho_opt = hm.calibrate_model(ticker, expiration_date)\n", + "print(f\"Optimized Heston parameters: v0={v0_opt}, theta={theta_opt}, kappa={kappa_opt}, sigma={sigma_opt}, rho={rho_opt}\")\n", + "\n", + "\n", + "# option price\n", + "h_price = hm.predict_option_price(strike_price, maturity_date, option_params['spot_price'], v0_opt, theta_opt, kappa_opt, sigma_opt, rho_opt)\n", + "print(\"The Heston model price for the option is:\", h_price)" + ] + }, + { + "cell_type": "markdown", + "id": "75313272", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Model Evaluation" + ] + }, + { + "cell_type": "markdown", + "id": "2e6471ef", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Benchmark Testing\n", + "The benchmark testing framework provides a robust way to validate the Heston model implementation and understand the relationships between European and American option prices under stochastic volatility conditions.\n", + "Let's compares European and American option prices using the Heston model." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "810cf887", + "metadata": {}, + "outputs": [], + "source": [ + "@vm.test(\"my_custom_tests.BenchmarkTest\")\n", + "def benchmark_test(hm_model, strikes, maturity_date, spot_price, v0=None, theta=None, kappa=None, sigma=None, rho=None):\n", + " \"\"\"\n", + " Compares European and American option prices using the Heston model.\n", + "\n", + " This test evaluates the price differences between European and American options\n", + " across multiple strike prices while keeping other parameters constant. The comparison\n", + " helps understand the early exercise premium of American options over their European\n", + " counterparts under stochastic volatility conditions.\n", + "\n", + " Args:\n", + " hm_model: HestonModel instance for option pricing calculations\n", + " strikes (list[float]): List of strike prices to test\n", + " maturity_date (ql.Date): Option expiration date in QuantLib format\n", + " spot_price (float): Current price of the underlying asset\n", + " v0 (float, optional): Initial variance. Defaults to None.\n", + " theta (float, optional): Long-term variance. Defaults to None.\n", + " kappa (float, optional): Mean reversion rate. Defaults to None.\n", + " sigma (float, optional): Volatility of variance. Defaults to None.\n", + " rho (float, optional): Correlation between asset and variance. 
Defaults to None.\n", + "\n", + " Returns:\n", + " dict: Contains a DataFrame with the following columns:\n", + " - Strike: Strike prices tested\n", + " - Maturity date: Expiration date for all options\n", + " - Spot price: Current underlying price\n", + " - european model price: Prices for European options\n", + " - american model price: Prices for American options\n", + "\"\"\"\n", + " american_derived_prices = []\n", + " european_derived_prices = []\n", + " for K in strikes:\n", + " european_derived_prices.append(hm_model.predict_option_price(K, maturity_date, spot_price, v0, theta, kappa, sigma, rho))\n", + " american_derived_prices.append(hm_model.predict_american_option_price(K, maturity_date, spot_price, v0, theta, kappa, sigma, rho))\n", + "\n", + " data = {\n", + " \"Strike\": strikes,\n", + " \"Maturity date\": [maturity_date] * len(strikes),\n", + " \"Spot price\": [spot_price] * len(strikes),\n", + " \"european model price\": european_derived_prices,\n", + " \"american model price\": american_derived_prices,\n", + "\n", + " }\n", + " df1 = pd.DataFrame(data)\n", + " return {\"strikes variation benchmarking\": df1}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3fdd6705", + "metadata": {}, + "outputs": [], + "source": [ + "result = run_test(\n", + " \"my_custom_tests.BenchmarkTest\",\n", + " params={\n", + " \"hm_model\": hm,\n", + " \"strikes\": [400, 425, 460, 495, 520],\n", + " \"maturity_date\": maturity_date,\n", + " \"spot_price\": option_params['spot_price'],\n", + " \"v0\":v0_opt,\n", + " \"theta\": theta_opt,\n", + " \"kappa\":kappa_opt ,\n", + " \"sigma\": sigma_opt,\n", + " \"rho\":rho_opt\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "id": "e359b503", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Sensitivity Testing\n", + "The sensitivity testing framework provides a systematic approach to understanding how the Heston model responds to parameter changes, which is crucial for both model validation and practical application in trading and risk management." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "51922313", + "metadata": {}, + "outputs": [], + "source": [ + "@vm.test(\"my_test_provider.Sensitivity\")\n", + "def SensitivityTest(\n", + " model,\n", + " strike_price,\n", + " maturity_date,\n", + " spot_price,\n", + " v0_opt,\n", + " theta_opt,\n", + " kappa_opt,\n", + " sigma_opt,\n", + " rho_opt,\n", + "):\n", + " \"\"\"\n", + " Evaluates the sensitivity of American option prices to changes in model parameters.\n", + "\n", + " This test calculates option prices using the Heston model with optimized parameters.\n", + " It's designed to analyze how changes in various model inputs affect the option price,\n", + " which is crucial for understanding model behavior and risk management.\n", + "\n", + " Args:\n", + " model (HestonModel): Initialized Heston model instance wrapped in ValidMind model object\n", + " strike_price (float): Strike price of the option\n", + " maturity_date (ql.Date): Expiration date of the option in QuantLib format\n", + " spot_price (float): Current price of the underlying asset\n", + " v0_opt (float): Optimized initial variance parameter\n", + " theta_opt (float): Optimized long-term variance parameter\n", + " kappa_opt (float): Optimized mean reversion rate parameter\n", + " sigma_opt (float): Optimized volatility of variance parameter\n", + " rho_opt (float): Optimized correlation parameter between asset price and variance\n", + " \"\"\"\n", + " price = model.model.predict_american_option_price(\n", + " strike_price,\n", + " maturity_date,\n", + " spot_price,\n", + " v0_opt,\n", + " theta_opt,\n", + " kappa_opt,\n", + " sigma_opt,\n", + " rho_opt,\n", + " )\n", + "\n", + " return price\n" + ] + }, + { + "cell_type": "markdown", + "id": "408a05ef", + "metadata": {}, + "source": [ + "##### Common plot function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "104ca6dd", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_results(df, params: dict = None):\n", + " fig2 = plt.figure(figsize=(10, 6))\n", + " plt.plot(df[params[\"x\"]], df[params[\"y\"]], label=params[\"label\"])\n", + " plt.xlabel(params[\"xlabel\"])\n", + " plt.ylabel(params[\"ylabel\"])\n", + " \n", + " plt.title(params[\"title\"])\n", + " plt.legend()\n", + " plt.grid(True)\n", + " plt.show() # close the plot to avoid displaying it" + ] + }, + { + "cell_type": "markdown", + "id": "ca72b9e5", + "metadata": {}, + "source": [ + "Let's create ValidMind model object" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae7093fa", + "metadata": {}, + "outputs": [], + "source": [ + "hm_model = vm.init_model(model=hm, input_id=\"HestonModel\")" + ] + }, + { + "cell_type": "markdown", + "id": "b2141640", + "metadata": {}, + "source": [ + "##### Strike sensitivity\n", + "Let's analyzes how option prices change as the strike price varies. We create a range of strike prices around the current strike (460) and observe the impact on option prices while keeping all other parameters constant." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea7f1cbe", + "metadata": {}, + "outputs": [], + "source": [ + "result = run_test(\n", + " \"my_test_provider.Sensitivity:ToStrike\",\n", + " inputs = {\n", + " \"model\": hm_model\n", + " },\n", + " param_grid={\n", + " \"strike_price\": list(np.linspace(460-50, 460+50, 10)),\n", + " \"maturity_date\": [maturity_date],\n", + " \"spot_price\": [option_params[\"spot_price\"]],\n", + " \"v0_opt\": [v0_opt],\n", + " \"theta_opt\": [theta_opt],\n", + " \"kappa_opt\": [kappa_opt],\n", + " \"sigma_opt\": [sigma_opt],\n", + " \"rho_opt\":[rho_opt]\n", + " },\n", + ")\n", + "result.log()\n", + "# Visualize how option prices change with different strike prices\n", + "plot_results(\n", + " pd.DataFrame(result.tables[0].data),\n", + " params={\n", + " \"x\": \"strike_price\",\n", + " \"y\":\"Value\",\n", + " \"label\":\"Strike price\",\n", + " \"xlabel\":\"Strike price\",\n", + " \"ylabel\":\"option price\",\n", + " \"title\":\"Heston option - Strike price Sensitivity\",\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "be143012", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Stress Testing\n", + "This stress testing framework provides a comprehensive view of how the Heston model behaves under different market conditions and helps identify potential risks in option pricing." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f2f01a40", + "metadata": {}, + "outputs": [], + "source": [ + "@vm.test(\"my_custom_tests.Stressing\")\n", + "def StressTest(\n", + " model,\n", + " strike_price,\n", + " maturity_date,\n", + " spot_price,\n", + " v0_opt,\n", + " theta_opt,\n", + " kappa_opt,\n", + " sigma_opt,\n", + " rho_opt,\n", + "):\n", + " \"\"\"\n", + " Performs stress testing on Heston model parameters to evaluate option price sensitivity.\n", + "\n", + " This test evaluates how the American option price responds to stressed market conditions\n", + " by varying key model parameters. It's designed to:\n", + " 1. Identify potential model vulnerabilities\n", + " 2. Understand price behavior under extreme scenarios\n", + " 3. Support risk management decisions\n", + " 4. 
Validate model stability across parameter ranges\n", + "\n", + "    Args:\n", + "        model (HestonModel): Initialized Heston model instance wrapped in ValidMind model object\n", + "        strike_price (float): Option strike price\n", + "        maturity_date (ql.Date): Option expiration date in QuantLib format\n", + "        spot_price (float): Current price of the underlying asset\n", + "        v0_opt (float): Initial variance parameter under stress testing\n", + "        theta_opt (float): Long-term variance parameter under stress testing\n", + "        kappa_opt (float): Mean reversion rate parameter under stress testing\n", + "        sigma_opt (float): Volatility of variance parameter under stress testing\n", + "        rho_opt (float): Correlation parameter under stress testing\n", + "    \"\"\"\n", + "    price = model.model.predict_american_option_price(\n", + "        strike_price,\n", + "        maturity_date,\n", + "        spot_price,\n", + "        v0_opt,\n", + "        theta_opt,\n", + "        kappa_opt,\n", + "        sigma_opt,\n", + "        rho_opt,\n", + "    )\n", + "\n", + "    return price\n" + ] + }, + { + "cell_type": "markdown", + "id": "31fcbe9c", + "metadata": {}, + "source": [ + "##### Rho (correlation) and Theta (long-term variance) stress test\n", + "Next, let's evaluate the sensitivity of the model's output to changes in the correlation parameter (rho) and the long-term variance parameter (theta) within a stochastic volatility framework." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6119b5d9", + "metadata": {}, + "outputs": [], + "source": [ + "result = run_test(\n", + "    \"my_custom_tests.Stressing:TheRhoAndThetaParameters\",\n", + "    inputs = {\n", + "        \"model\": hm_model,\n", + "    },\n", + "    param_grid={\n", + "        \"strike_price\": [460],\n", + "        \"maturity_date\": [maturity_date],\n", + "        \"spot_price\": [option_params[\"spot_price\"]],\n", + "        \"v0_opt\": [v0_opt],\n", + "        \"theta_opt\": list(np.linspace(0.1, theta_opt+0.4, 5)),\n", + "        \"kappa_opt\": [kappa_opt],\n", + "        \"sigma_opt\": [sigma_opt],\n", + "        \"rho_opt\":list(np.linspace(rho_opt-0.2, rho_opt+0.2, 5))\n", + "    },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "id": "be39cb3a", + "metadata": {}, + "source": [ + "##### Sigma stress test\n", + "Let's evaluate the sensitivity of the model's output to changes in sigma, the volatility-of-variance parameter. This test is crucial for understanding how variations in the volatility of variance impact the model's valuation of financial instruments, particularly options." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0dc189b7", + "metadata": {}, + "outputs": [], + "source": [ + "result = run_test(\n", + "    \"my_custom_tests.Stressing:TheSigmaParameter\",\n", + "    inputs = {\n", + "        \"model\": hm_model,\n", + "    },\n", + "    param_grid={\n", + "        \"strike_price\": [460],\n", + "        \"maturity_date\": [maturity_date],\n", + "        \"spot_price\": [option_params[\"spot_price\"]],\n", + "        \"v0_opt\": [v0_opt],\n", + "        \"theta_opt\": [theta_opt],\n", + "        \"kappa_opt\": [kappa_opt],\n", + "        \"sigma_opt\": list(np.linspace(0.1, sigma_opt+0.6, 5)),\n", + "        \"rho_opt\": [rho_opt]\n", + "    },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "id": "173a5294", + "metadata": {}, + "source": [ + "##### Kappa stress test\n", + "Let's evaluate the sensitivity of the model's output to changes in the kappa parameter, the mean reversion rate of variance in stochastic volatility models."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dae9714f", + "metadata": {}, + "outputs": [], + "source": [ + "result = run_test(\n", + " \"my_custom_tests.Stressing:TheKappaParameter\",\n", + " inputs = {\n", + " \"model\": hm_model,\n", + " },\n", + " param_grid={\n", + " \"strike_price\": [460],\n", + " \"maturity_date\": [maturity_date],\n", + " \"spot_price\": [option_params[\"spot_price\"]],\n", + " \"v0_opt\": [v0_opt],\n", + " \"theta_opt\": [theta_opt],\n", + " \"kappa_opt\": list(np.linspace(kappa_opt, kappa_opt+0.2, 5)),\n", + " \"sigma_opt\": [sigma_opt],\n", + " \"rho_opt\": [rho_opt]\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "id": "b4d1d968", + "metadata": {}, + "source": [ + "##### Stress theta\n", + "Let's evaluates the sensitivity of a model's output to changes in the parameter theta, which represents the long-term variance in a stochastic volatility model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e68df3db", + "metadata": {}, + "outputs": [], + "source": [ + "result = run_test(\n", + " \"my_custom_tests.Stressing:TheThetaParameter\",\n", + " inputs = {\n", + " \"model\": hm_model,\n", + " },\n", + " param_grid={\n", + " \"strike_price\": [460],\n", + " \"maturity_date\": [maturity_date],\n", + " \"spot_price\": [option_params[\"spot_price\"]],\n", + " \"v0_opt\": [v0_opt],\n", + " \"theta_opt\": list(np.linspace(0.1, theta_opt+0.9, 5)),\n", + " \"kappa_opt\": [kappa_opt],\n", + " \"sigma_opt\": [sigma_opt],\n", + " \"rho_opt\": [rho_opt]\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "id": "32e70456", + "metadata": {}, + "source": [ + "##### Stress rho\n", + "Let's evaluates the sensitivity of a model's output to changes in the correlation parameter, rho, within a stochastic volatility (SV) model framework. This test is crucial for understanding how variations in rho, which represents the correlation between the asset price and its volatility, impact the model's valuation output." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5ca3fc2", + "metadata": {}, + "outputs": [], + "source": [ + "result = run_test(\n", + " \"my_custom_tests.Stressing:TheRhoParameter\",\n", + " inputs = {\n", + " \"model\": hm_model,\n", + " },\n", + " param_grid={\n", + " \"strike_price\": [460],\n", + " \"maturity_date\": [maturity_date],\n", + " \"spot_price\": [option_params[\"spot_price\"]],\n", + " \"v0_opt\": [v0_opt],\n", + " \"theta_opt\": [theta_opt],\n", + " \"kappa_opt\": [kappa_opt],\n", + " \"sigma_opt\": [sigma_opt],\n", + " \"rho_opt\": list(np.linspace(rho_opt-0.2, rho_opt+0.2, 5))\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "id": "892c5347", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Next steps\n", + "\n", + "You can look at the results of this test suite right in the notebook where you ran the code, as you would expect. But there is a better way — use the ValidMind Platform to work with your model documentation.\n", + "\n", + "\n", + "\n", + "### Work with your model documentation\n", + "\n", + "1. From the **Model Inventory** in the ValidMind Platform, go to the model you registered earlier. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/working-with-model-inventory.html))\n", + "\n", + "2. Click and expand the **Model Development** section.\n", + "\n", + "What you see is the full draft of your model documentation in a more easily consumable version. 
From here, you can make qualitative edits to model documentation, view guidelines, collaborate with validators, and submit your model documentation for approval when it's ready. [Learn more ...](https://docs.validmind.ai/guide/model-documentation/working-with-model-documentation.html)\n", + "\n", + "\n", + "\n", + "### Discover more learning resources\n", + "\n", + "We offer many interactive notebooks to help you automate testing, documenting, validating, and more:\n", + "\n", + "- [Run tests & test suites](https://docs.validmind.ai/developer/how-to/testing-overview.html)\n", + "- [Use ValidMind Library features](https://docs.validmind.ai/developer/how-to/feature-overview.html)\n", + "- [Code samples by use case](https://docs.validmind.ai/guide/samples-jupyter-notebooks.html)\n", + "\n", + "Or, visit our [documentation](https://docs.validmind.ai/) to learn more about ValidMind." + ] + }, + { + "cell_type": "markdown", + "id": "copyright-de5d1e182b09403abddabc2850f2dd05", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "***\n", + "\n", + "Copyright © 2023-2026 ValidMind Inc. All rights reserved.
\n", + "Refer to [LICENSE](https://github.com/validmind/validmind-library/blob/main/LICENSE) for details.
\n", + "SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "validmind-1QuffXMV-py3.10", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/notebooks/use_cases/code_explainer/quickstart_code_explainer_demo.ipynb b/notebooks/use_cases/code_explainer/quickstart_code_explainer_demo.ipynb index 99ee1abf6..afd5e19f6 100644 --- a/notebooks/use_cases/code_explainer/quickstart_code_explainer_demo.ipynb +++ b/notebooks/use_cases/code_explainer/quickstart_code_explainer_demo.ipynb @@ -1,874 +1,875 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Quickstart for model code documentation\n", - "\n", - "Welcome! This notebook demonstrates how to use the ValidMind code explainer to automatically generate comprehensive documentation for your codebase. The code explainer analyzes your source code and provides detailed explanations across various aspects of your implementation.\n", - "\n", - "\n", - "\n", - "## About Code Explainer\n", - "The ValidMind code explainer is a powerful tool that automatically analyzes your source code and generates comprehensive documentation. It helps you:\n", - "\n", - "- Understand the structure and organization of your codebase\n", - "- Document dependencies and environment setup\n", - "- Explain data processing and model implementation details\n", - "- Document training, evaluation, and inference pipelines\n", - "- Track configuration, testing, and security measures\n", - "\n", - "This tool is particularly useful for:\n", - "- Onboarding new team members\n", - "- Maintaining up-to-date documentation\n", - "- Ensuring code quality and best practices\n", - "- Facilitating code reviews and audits" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "::: {.content-hidden when-format=\"html\"}\n", - "## Contents \n", - "- [About Code Explainer](#toc1__) \n", - "- [About ValidMind](#toc2__) \n", - " - [Before you begin](#toc2_1__) \n", - " - [New to ValidMind?](#toc2_2__) \n", - " - [Key concepts](#toc2_3__) \n", - "- [Setting up](#toc3__) \n", - " - [Install the ValidMind Library](#toc3_1__) \n", - " - [Initialize the ValidMind Library](#toc3_2__) \n", - " - [Register sample model](#toc3_2_1__) \n", - " - [Apply documentation template](#toc3_2_2__) \n", - " - [Get your code snippet](#toc3_2_3__) \n", - " - [Preview the documentation template](#toc3_3__) \n", - "- [Common function](#toc4__) \n", - "- [Default Behavior](#toc5__) \n", - "- [Codebase Overview](#toc6__) \n", - "- [Environment and Dependencies ('environment_setup')](#toc7__) \n", - "- [Data Ingestion and Preprocessing](#toc8__) \n", - "- [Model Implementation Details](#toc9__) \n", - "- [Model Training Pipeline](#toc10__) \n", - "- [Evaluation and Validation Code](#toc11__) \n", - "- [Inference and Scoring Logic](#toc12__) \n", - "- [Configuration and Parameters](#toc13__) \n", - "- [Unit and Integration Testing](#toc14__) \n", - "- [Logging and Monitoring Hooks](#toc15__) \n", - "- [Code and Model Versioning](#toc16__) \n", - "- [Security and Access Control](#toc17__) \n", - "- [Example Runs and Scripts](#toc18__) \n", - "- [Known Issues and Future Improvements](#toc19__) \n", - "\n", - ":::\n", - "\n", - "" - ] - }, - { - "cell_type": 
"markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## About ValidMind\n", - "\n", - "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models.\n", - "\n", - "You use the ValidMind Library to automate documentation and validation tests, and then use the ValidMind Platform to collaborate on model documentation. Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators.\n", - "\n", - "\n", - "\n", - "### Before you begin\n", - "\n", - "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language. \n", - "\n", - "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html).\n", - "\n", - "\n", - "\n", - "### New to ValidMind?\n", - "\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", - "\n", - "
For access to all features available in this notebook, you'll need access to a ValidMind account.\n", - "

\n", - "Register with ValidMind
\n", - "\n", - "\n", - "\n", - "### Key concepts\n", - "\n", - "**Model documentation**: A structured and detailed record pertaining to a model, encompassing key components such as its underlying assumptions, methodologies, data sources, inputs, performance metrics, evaluations, limitations, and intended uses. It serves to ensure transparency, adherence to regulatory requirements, and a clear understanding of potential risks associated with the model’s application.\n", - "\n", - "**Documentation template**: Functions as a test suite and lays out the structure of model documentation, segmented into various sections and sub-sections. Documentation templates define the structure of your model documentation, specifying the tests that should be run, and how the results should be displayed.\n", - "\n", - "**Tests**: A function contained in the ValidMind Library, designed to run a specific quantitative test on the dataset or model. Tests are the building blocks of ValidMind, used to evaluate and document models and datasets, and can be run individually or as part of a suite defined by your model documentation template.\n", - "\n", - "**Custom tests**: Custom tests are functions that you define to evaluate your model or dataset. These functions can be registered via the ValidMind Library to be used with the ValidMind Platform.\n", - "\n", - "**Inputs**: Objects to be evaluated and documented in the ValidMind Library. They can be any of the following:\n", - "\n", - " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", - " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", - " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", - " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html) for more information.\n", - "\n", - "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", - "\n", - "**Outputs**: Custom tests can return elements like tables or plots. Tables may be a list of dictionaries (each representing a row) or a pandas DataFrame. Plots may be matplotlib or plotly figures.\n", - "\n", - "**Test suites**: Collections of tests designed to run together to automate and generate model documentation end-to-end for specific use-cases.\n", - "\n", - "Example: the [`classifier_full_suite`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html#ClassifierFullSuite) test suite runs tests from the [`tabular_dataset`](https://docs.validmind.ai/validmind/validmind/test_suites/tabular_datasets.html) and [`classifier`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html) test suites to fully document the data and model sections for binary classification model use-cases." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Setting up" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Install the ValidMind Library\n", - "\n", - "To install the library:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -q validmind" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Initialize the ValidMind Library" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Register sample model\n", - "\n", - "Let's first register a sample model for use with this notebook:\n", - "\n", - "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", - "\n", - "2. In the left sidebar, navigate to **Inventory** and click **+ Register Model**.\n", - "\n", - "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", - "\n", - "4. Select your own name under the **MODEL OWNER** drop-down.\n", - "\n", - "5. Click **Register Model** to add the model to your inventory." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Apply documentation template\n", - "\n", - "Once you've registered your model, let's select a documentation template. A template predefines sections for your model documentation and provides a general outline to follow, making the documentation process much easier.\n", - "\n", - "1. In the left sidebar that appears for your model, click **Documents** and select **Documentation**.\n", - "\n", - "2. Under **TEMPLATE**, select `Model Source Code Documentation`.\n", - "\n", - "3. Click **Use Template** to apply the template." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
Can't select this template?\n", - "

\n", - "Your organization administrators may need to add it to your template library:\n", - "\n", - "
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Get your code snippet\n", - "\n", - "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", - "\n", - "1. On the left sidebar that appears for your model, select **Getting Started** and click **Copy snippet to clipboard**.\n", - "2. Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Load your model identifier credentials from an `.env` file\n", - "\n", - "%load_ext dotenv\n", - "%dotenv .env\n", - "\n", - "# Or replace with your code snippet\n", - "\n", - "import validmind as vm\n", - "\n", - "vm.init(\n", - " # api_host=\"...\",\n", - " # api_key=\"...\",\n", - " # api_secret=\"...\",\n", - " # model=\"...\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Preview the documentation template\n", - "\n", - "Let's verify that you have connected the ValidMind Library to the ValidMind Platform and that the appropriate *template* is selected for your model.\n", - "\n", - "You will upload documentation and test results unique to your model based on this template later on. For now, **take a look at the default structure that the template provides with [the `vm.preview_template()` function](https://docs.validmind.ai/validmind/validmind.html#preview_template)** from the ValidMind library and note the empty sections:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm.preview_template()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Common function\n", - "The code above defines two key functions:\n", - "1. A function to read source code from 'customer_churn_full_suite.py' file\n", - "2. An 'explain_code' function that uses ValidMind's experimental agents to analyze and explain code." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "source_code=\"\"\n", - "with open(\"customer_churn_full_suite.py\", \"r\") as f:\n", - " source_code = f.read()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `vm.experimental.agents.run_task` function is used to execute AI agent tasks.\n", - "\n", - "It requires:\n", - "- task: The type of task to run (e.g. 
`code_explainer`)\n", - "- input: A dictionary containing task-specific parameters\n", - " - For `code_explainer`, this includes:\n", - " - **source_code** (str): The code to be analyzed\n", - " - **user_instructions** (str): Instructions for how to analyze the code" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def explain_code(content_id: str, user_instructions: str):\n", - " \"\"\"Run code explanation task and log the results.\n", - " By default, the code explainer includes sections for:\n", - " - Main Purpose and Overall Functionality\n", - " - Breakdown of Key Functions or Components\n", - " - Potential Risks or Failure Points \n", - " - Assumptions or Limitations\n", - " If you want default sections, specify user_instructions as an empty string.\n", - " \n", - " Args:\n", - " user_instructions (str): Instructions for how to analyze the code\n", - " content_id (str): ID to use when logging the results\n", - " \n", - " Returns:\n", - " The result object from running the code explanation task\n", - " \"\"\"\n", - " result = vm.experimental.agents.run_task(\n", - " task=\"code_explainer\",\n", - " input={\n", - " \"source_code\": source_code,\n", - " \"user_instructions\": user_instructions\n", - " }\n", - " )\n", - " result.log(content_id=content_id)\n", - " return result" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "## Default Behavior" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "By default, the code explainer includes sections for:\n", - "- Main Purpose and Overall Functionality\n", - "- Breakdown of Key Functions or Components\n", - "- Potential Risks or Failure Points \n", - "- Assumptions or Limitations\n", - "\n", - "If you want default sections, specify `user_instructions` as an empty string. For example:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result = vm.experimental.agents.run_task(\n", - " task=\"code_explainer\",\n", - " input={\n", - " \"source_code\": source_code,\n", - " \"user_instructions\": \"\"\n", - " }\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "## Codebase Overview\n", - "\n", - "Let's analyze your codebase structure to understand the main modules, components, entry points and their relationships. We'll also examine the technology stack and frameworks that are being utilized in the implementation." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result = explain_code(\n", - " user_instructions=\"\"\"\n", - " Please provide a summary of the following bullet points only.\n", - " - Describe the overall structure of the source code repository.\n", - " - Identify main modules, folders, and scripts.\n", - " - Highlight entry points for training, inference, and evaluation.\n", - " - State the main programming languages and frameworks used.\n", - " \"\"\",\n", - " content_id=\"code_structure_summary\"\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result = explain_code(\n", - " user_instructions=\"\",\n", - " content_id=\"code_structure_summary\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "## Environment and Dependencies ('environment_setup')\n", - "Let's document the technical requirements and setup needed to run your code, including Python packages, system dependencies, and environment configuration files. Understanding these requirements is essential for proper development environment setup and consistent deployments across different environments." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result = explain_code(\n", - " user_instructions=\"\"\"\n", - " Please provide a summary of the following bullet points only.\n", - " - List Python packages and system dependencies (OS, compilers, etc.).\n", - " - Reference environment files (requirements.txt, environment.yml, Dockerfile).\n", - " - Include setup instructions using Conda, virtualenv, or containers.\n", - " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections.\n", - " \"\"\",\n", - " content_id=\"setup_instructions\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "## Data Ingestion and Preprocessing\n", - "Let's document how your code handles data, including data sources, validation procedures, and preprocessing steps. We'll examine the data pipeline architecture, covering everything from initial data loading through feature engineering and quality checks." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result = explain_code(\n", - " user_instructions=\"\"\"\n", - " Please provide a summary of the following bullet points only.\n", - " - Specify data input formats and sources.\n", - " - Document ingestion, validation, and transformation logic.\n", - " - Explain how raw data is preprocessed and features are generated.\n", - " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections. \"\"\",\n", - " content_id=\"data_handling_notes\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " \n", - "\n", - "\n", - "\n", - "## Model Implementation Details\n", - "Let's document the core implementation details of your model, including its architecture, components, and key algorithms. Understanding the technical implementation is crucial for maintenance, debugging, and future improvements to the codebase. We'll examine how theoretical concepts are translated into working code." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result = explain_code(\n", - " user_instructions=\"\"\"\n", - " Please provide a summary of the following bullet points only.\n", - " - Describe the core model code structure (classes, functions).\n", - " - Link code to theoretical models or equations when applicable.\n", - " - Note custom components like loss functions or feature selectors.\n", - " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections.\n", - " \"\"\",\n", - " content_id=\"model_code_description\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "## Model Training Pipeline\n", - "\n", - "Let's document the training pipeline implementation, including how models are trained, optimized and evaluated. We'll examine the training process workflow, hyperparameter tuning approach, and model checkpointing mechanisms. This section provides insights into how the model learns from data and achieves optimal performance." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result = explain_code(\n", - " user_instructions=\"\"\"\n", - " Please provide a summary of the following bullet points only.\n", - " - Explain the training process, optimization strategy, and hyperparameters.\n", - " - Describe logging, checkpointing, and early stopping mechanisms.\n", - " - Include references to training config files or tuning logic.\n", - " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections.\n", - " \"\"\",\n", - " content_id=\"training_logic_details\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "## Evaluation and Validation Code\n", - "Let's examine how the model's validation and evaluation code is implemented, including the metrics calculation and validation processes. We'll explore the diagnostic tools and visualization methods used to assess model performance. This section will also cover how validation results are logged and stored for future reference." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result = explain_code(\n", - " user_instructions=\"\"\"\n", - " Please provide a summary of the following bullet points only.\n", - " - Describe how validation is implemented and metrics are calculated.\n", - " - Include plots and diagnostic tools (e.g., ROC, SHAP, confusion matrix).\n", - " - State how outputs are logged and persisted.\n", - " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections.\n", - " \"\"\",\n", - " content_id=\"evaluation_logic_notes\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "## Inference and Scoring Logic\n", - "Let's examine how the model performs inference and scoring on new data. This section will cover the implementation details of loading trained models, making predictions, and any required pre/post-processing steps. We'll also look at the APIs and interfaces available for both real-time serving and batch scoring scenarios." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result = explain_code(\n", - " user_instructions=\"\"\"\n", - " Please provide a summary of the following bullet points only.\n", - " - Detail how the trained model is loaded and used for predictions.\n", - " - Explain I/O formats and APIs for serving or batch scoring.\n", - " - Include any preprocessing/postprocessing logic required.\n", - " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections.\n", - " \"\"\",\n", - " content_id=\"inference_mechanism\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "## Configuration and Parameters\n", - "Let's explore how configuration and parameters are managed in the codebase. We'll examine the configuration files, command-line arguments, environment variables, and other mechanisms used to control model behavior. This section will also cover parameter versioning and how different configurations are tracked across model iterations." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result = explain_code(\n", - " user_instructions=\"\"\"\n", - " Please provide a summary of the following bullet points only.\n", - " - Describe configuration management (files, CLI args, env vars).\n", - " - Highlight default parameters and override mechanisms.\n", - " - Reference versioning practices for config files.\n", - " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections.\n", - " \"\"\",\n", - " content_id=\"config_control_notes\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "## Unit and Integration Testing\n", - "Let's examine the testing strategy and implementation in the codebase. We'll analyze the unit tests, integration tests, and testing frameworks used to ensure code quality and reliability. This section will also cover test coverage metrics and continuous integration practices." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result = explain_code(\n", - " user_instructions=\"\"\"\n", - " Please provide a summary of the following bullet points only.\n", - " - List unit and integration tests and what they cover.\n", - " - Mention testing frameworks and coverage tools used.\n", - " - Explain testing strategy for production-readiness.\n", - " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections.\n", - " \"\"\",\n", - " content_id=\"test_strategy_overview\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "## Logging and Monitoring Hooks\n", - "Let's analyze how logging and monitoring are implemented in the codebase. We'll examine the logging configuration, monitoring hooks, and key metrics being tracked. This section will also cover any real-time observability integrations and alerting mechanisms in place." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result = explain_code(\n", - " user_instructions=\"\"\"\n", - " Please provide a summary of the following bullet points only.\n", - " - Describe logging configuration and structure.\n", - " - Highlight real-time monitoring or observability integrations.\n", - " - List key events, metrics, or alerts tracked.\n", - " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections.\n", - " \"\"\",\n", - " content_id=\"logging_monitoring_notes\"\n", - ")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "## Code and Model Versioning\n", - "Let's examine how code and model versioning is managed in the codebase. This section will cover version control practices, including Git workflows and model artifact versioning tools like DVC or MLflow. We'll also look at how versioning integrates with the CI/CD pipeline." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result = explain_code(\n", - " user_instructions=\"\"\"\n", - " Please provide a summary of the following bullet points only.\n", - " - Describe Git usage, branching, tagging, and commit standards.\n", - " - Include model artifact versioning practices (e.g., DVC, MLflow).\n", - " - Reference any automation in CI/CD.\n", - " Please remove the following sections: \n", - " - Potential Risks or Failure Points\n", - " - Assumptions or Limitations\n", - " - Breakdown of Key Functions or Components\n", - " Please don't add any other sections.\n", - " \"\"\",\n", - " content_id=\"version_tracking_description\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "## Security and Access Control\n", - "Let's analyze the security and access control measures implemented in the codebase. We'll examine how sensitive data and code are protected through access controls, encryption, and compliance measures. Additionally, we'll review secure deployment practices and any specific handling of PII data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result = explain_code(\n", - " user_instructions=\"\"\"\n", - " Please provide a summary of the following bullet points only.\n", - " - Document access controls for source code and data.\n", - " - Include any encryption, PII handling, or compliance measures.\n", - " - Mention secure deployment practices.\n", - " Please remove the following sections: \n", - " - Potential Risks or Failure Points\n", - " - Assumptions or Limitations\n", - " - Breakdown of Key Functions or Components\n", - " Please don't add any other sections.\n", - " \"\"\",\n", - " content_id=\"security_policies_notes\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "## Example Runs and Scripts\n", - "Let's explore example runs and scripts that demonstrate how to use this codebase in practice. We'll look at working examples, command-line usage, and sample notebooks that showcase the core functionality. This section will also point to demo datasets and test scenarios that can help new users get started quickly." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result = explain_code(\n", - " user_instructions=\"\"\"\n", - " Please provide a summary of the following bullet points only.\n", - " - Provide working script examples.\n", - " - Include CLI usage instructions or sample notebooks.\n", - " - Link to demo datasets or test scenarios.\n", - " Please remove the following sections: \n", - " - Potential Risks or Failure Points\n", - " - Assumptions or Limitations\n", - " - Breakdown of Key Functions or Components\n", - " Please don't add any other sections.\n", - " \"\"\",\n", - " content_id=\"runnable_examples\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "## Known Issues and Future Improvements\n", - "Let's examine the current limitations and areas for improvement in the codebase. This section will document known technical debt, bugs, and feature gaps that need to be addressed. We'll also outline proposed enhancements and reference any existing tickets or GitHub issues tracking these improvements." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result = explain_code(\n", - " user_instructions=\"\"\"\n", - " Please provide a summary of the following bullet points only.\n", - " - List current limitations or technical debt.\n", - " - Outline proposed enhancements or refactors.\n", - " - Reference relevant tickets, GitHub issues, or roadmap items.\n", - " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections.\n", - " \"\"\",\n", - " content_id=\"issues_and_improvements_log\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "copyright-ccbede139a26452183291a108b791513", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "***\n", - "\n", - "Copyright © 2023-2026 ValidMind Inc. All rights reserved.
\n", - "Refer to [LICENSE](https://github.com/validmind/validmind-library/blob/main/LICENSE) for details.
\n", - "SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "validmind-1QuffXMV-py3.11", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" - } - }, - "nbformat": 4, - "nbformat_minor": 4 + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Quickstart for model code documentation\n", + "\n", + "Welcome! This notebook demonstrates how to use the ValidMind code explainer to automatically generate comprehensive documentation for your codebase. The code explainer analyzes your source code and provides detailed explanations across various aspects of your implementation.\n", + "\n", + "\n", + "\n", + "## About Code Explainer\n", + "The ValidMind code explainer is a powerful tool that automatically analyzes your source code and generates comprehensive documentation. It helps you:\n", + "\n", + "- Understand the structure and organization of your codebase\n", + "- Document dependencies and environment setup\n", + "- Explain data processing and model implementation details\n", + "- Document training, evaluation, and inference pipelines\n", + "- Track configuration, testing, and security measures\n", + "\n", + "This tool is particularly useful for:\n", + "- Onboarding new team members\n", + "- Maintaining up-to-date documentation\n", + "- Ensuring code quality and best practices\n", + "- Facilitating code reviews and audits" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "::: {.content-hidden when-format=\"html\"}\n", + "## Contents \n", + "- [About Code Explainer](#toc1__) \n", + "- [About ValidMind](#toc2__) \n", + " - [Before you begin](#toc2_1__) \n", + " - [New to ValidMind?](#toc2_2__) \n", + " - [Key concepts](#toc2_3__) \n", + "- [Setting up](#toc3__) \n", + " - [Install the ValidMind Library](#toc3_1__) \n", + " - [Initialize the ValidMind Library](#toc3_2__) \n", + " - [Register sample model](#toc3_2_1__) \n", + " - [Apply documentation template](#toc3_2_2__) \n", + " - [Get your code snippet](#toc3_2_3__) \n", + " - [Preview the documentation template](#toc3_3__) \n", + "- [Common function](#toc4__) \n", + "- [Default Behavior](#toc5__) \n", + "- [Codebase Overview](#toc6__) \n", + "- [Environment and Dependencies ('environment_setup')](#toc7__) \n", + "- [Data Ingestion and Preprocessing](#toc8__) \n", + "- [Model Implementation Details](#toc9__) \n", + "- [Model Training Pipeline](#toc10__) \n", + "- [Evaluation and Validation Code](#toc11__) \n", + "- [Inference and Scoring Logic](#toc12__) \n", + "- [Configuration and Parameters](#toc13__) \n", + "- [Unit and Integration Testing](#toc14__) \n", + "- [Logging and Monitoring Hooks](#toc15__) \n", + "- [Code and Model Versioning](#toc16__) \n", + "- [Security and Access Control](#toc17__) \n", + "- [Example Runs and Scripts](#toc18__) \n", + "- [Known Issues and Future Improvements](#toc19__) \n", + "\n", + ":::\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## About ValidMind\n", + "\n", + "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models.\n", + "\n", + "You use the ValidMind Library to automate documentation and validation tests, and then use the ValidMind Platform to collaborate on model documentation. 
Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators.\n", + "\n", + "\n", + "\n", + "### Before you begin\n", + "\n", + "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language. \n", + "\n", + "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html).\n", + "\n", + "\n", + "\n", + "### New to ValidMind?\n", + "\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", + "\n", + "
For access to all features available in this notebook, you'll need access to a ValidMind account.\n", + "

\n", + "Register with ValidMind
\n", + "\n", + "\n", + "\n", + "### Key concepts\n", + "\n", + "**Model documentation**: A structured and detailed record pertaining to a model, encompassing key components such as its underlying assumptions, methodologies, data sources, inputs, performance metrics, evaluations, limitations, and intended uses. It serves to ensure transparency, adherence to regulatory requirements, and a clear understanding of potential risks associated with the model’s application.\n", + "\n", + "**Documentation template**: Functions as a test suite and lays out the structure of model documentation, segmented into various sections and sub-sections. Documentation templates define the structure of your model documentation, specifying the tests that should be run, and how the results should be displayed.\n", + "\n", + "**Tests**: A function contained in the ValidMind Library, designed to run a specific quantitative test on the dataset or model. Tests are the building blocks of ValidMind, used to evaluate and document models and datasets, and can be run individually or as part of a suite defined by your model documentation template.\n", + "\n", + "**Custom tests**: Custom tests are functions that you define to evaluate your model or dataset. These functions can be registered via the ValidMind Library to be used with the ValidMind Platform.\n", + "\n", + "**Inputs**: Objects to be evaluated and documented in the ValidMind Library. They can be any of the following:\n", + "\n", + " - **model**: A single model that has been initialized in ValidMind with [`vm.init_model()`](https://docs.validmind.ai/validmind/validmind.html#init_model).\n", + " - **dataset**: Single dataset that has been initialized in ValidMind with [`vm.init_dataset()`](https://docs.validmind.ai/validmind/validmind.html#init_dataset).\n", + " - **models**: A list of ValidMind models - usually this is used when you want to compare multiple models in your custom test.\n", + " - **datasets**: A list of ValidMind datasets - usually this is used when you want to compare multiple datasets in your custom test. See this [example](https://docs.validmind.ai/notebooks/how_to/tests/run_tests/configure_tests/run_tests_that_require_multiple_datasets.html) for more information.\n", + "\n", + "**Parameters**: Additional arguments that can be passed when running a ValidMind test, used to pass additional information to a test, customize its behavior, or provide additional context.\n", + "\n", + "**Outputs**: Custom tests can return elements like tables or plots. Tables may be a list of dictionaries (each representing a row) or a pandas DataFrame. Plots may be matplotlib or plotly figures.\n", + "\n", + "**Test suites**: Collections of tests designed to run together to automate and generate model documentation end-to-end for specific use-cases.\n", + "\n", + "Example: the [`classifier_full_suite`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html#ClassifierFullSuite) test suite runs tests from the [`tabular_dataset`](https://docs.validmind.ai/validmind/validmind/test_suites/tabular_datasets.html) and [`classifier`](https://docs.validmind.ai/validmind/validmind/test_suites/classifier.html) test suites to fully document the data and model sections for binary classification model use-cases." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Setting up" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Install the ValidMind Library\n", + "\n", + "To install the library:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q validmind" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Initialize the ValidMind Library" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Register sample model\n", + "\n", + "Let's first register a sample model for use with this notebook:\n", + "\n", + "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", + "\n", + "2. In the left sidebar, navigate to **Inventory** and click **+ Register Model**.\n", + "\n", + "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", + "\n", + "4. Select your own name under the **MODEL OWNER** drop-down.\n", + "\n", + "5. Click **Register Model** to add the model to your inventory." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Apply documentation template\n", + "\n", + "Once you've registered your model, let's select a documentation template. A template predefines sections for your model documentation and provides a general outline to follow, making the documentation process much easier.\n", + "\n", + "1. In the left sidebar that appears for your model, click **Documents** and select **Documentation**.\n", + "\n", + "2. Under **TEMPLATE**, select `Model Source Code Documentation`.\n", + "\n", + "3. Click **Use Template** to apply the template." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Can't select this template?\n", + "

\n", + "Your organization administrators may need to add it to your template library:\n", + "\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Get your code snippet\n", + "\n", + "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", + "\n", + "1. On the left sidebar that appears for your model, select **Getting Started** and click **Copy snippet to clipboard**.\n", + "2. Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load your model identifier credentials from an `.env` file\n", + "\n", + "%load_ext dotenv\n", + "%dotenv .env\n", + "\n", + "# Or replace with your code snippet\n", + "\n", + "import validmind as vm\n", + "\n", + "vm.init(\n", + " # api_host=\"...\",\n", + " # api_key=\"...\",\n", + " # api_secret=\"...\",\n", + " # model=\"...\",\n", + " # document=\"documentation\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Preview the documentation template\n", + "\n", + "Let's verify that you have connected the ValidMind Library to the ValidMind Platform and that the appropriate *template* is selected for your model.\n", + "\n", + "You will upload documentation and test results unique to your model based on this template later on. For now, **take a look at the default structure that the template provides with [the `vm.preview_template()` function](https://docs.validmind.ai/validmind/validmind.html#preview_template)** from the ValidMind library and note the empty sections:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm.preview_template()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Common function\n", + "The code above defines two key functions:\n", + "1. A function to read source code from 'customer_churn_full_suite.py' file\n", + "2. An 'explain_code' function that uses ValidMind's experimental agents to analyze and explain code." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "source_code=\"\"\n", + "with open(\"customer_churn_full_suite.py\", \"r\") as f:\n", + " source_code = f.read()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `vm.experimental.agents.run_task` function is used to execute AI agent tasks.\n", + "\n", + "It requires:\n", + "- task: The type of task to run (e.g. 
`code_explainer`)\n", + "- input: A dictionary containing task-specific parameters\n", + " - For `code_explainer`, this includes:\n", + " - **source_code** (str): The code to be analyzed\n", + " - **user_instructions** (str): Instructions for how to analyze the code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def explain_code(content_id: str, user_instructions: str):\n", + " \"\"\"Run code explanation task and log the results.\n", + " By default, the code explainer includes sections for:\n", + " - Main Purpose and Overall Functionality\n", + " - Breakdown of Key Functions or Components\n", + " - Potential Risks or Failure Points \n", + " - Assumptions or Limitations\n", + " If you want default sections, specify user_instructions as an empty string.\n", + " \n", + " Args:\n", + " user_instructions (str): Instructions for how to analyze the code\n", + " content_id (str): ID to use when logging the results\n", + " \n", + " Returns:\n", + " The result object from running the code explanation task\n", + " \"\"\"\n", + " result = vm.experimental.agents.run_task(\n", + " task=\"code_explainer\",\n", + " input={\n", + " \"source_code\": source_code,\n", + " \"user_instructions\": user_instructions\n", + " }\n", + " )\n", + " result.log(content_id=content_id)\n", + " return result" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## Default Behavior" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By default, the code explainer includes sections for:\n", + "- Main Purpose and Overall Functionality\n", + "- Breakdown of Key Functions or Components\n", + "- Potential Risks or Failure Points \n", + "- Assumptions or Limitations\n", + "\n", + "If you want default sections, specify `user_instructions` as an empty string. For example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = vm.experimental.agents.run_task(\n", + " task=\"code_explainer\",\n", + " input={\n", + " \"source_code\": source_code,\n", + " \"user_instructions\": \"\"\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## Codebase Overview\n", + "\n", + "Let's analyze your codebase structure to understand the main modules, components, entry points and their relationships. We'll also examine the technology stack and frameworks that are being utilized in the implementation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = explain_code(\n", + " user_instructions=\"\"\"\n", + " Please provide a summary of the following bullet points only.\n", + " - Describe the overall structure of the source code repository.\n", + " - Identify main modules, folders, and scripts.\n", + " - Highlight entry points for training, inference, and evaluation.\n", + " - State the main programming languages and frameworks used.\n", + " \"\"\",\n", + " content_id=\"code_structure_summary\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = explain_code(\n", + " user_instructions=\"\",\n", + " content_id=\"code_structure_summary\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## Environment and Dependencies ('environment_setup')\n", + "Let's document the technical requirements and setup needed to run your code, including Python packages, system dependencies, and environment configuration files. Understanding these requirements is essential for proper development environment setup and consistent deployments across different environments." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = explain_code(\n", + " user_instructions=\"\"\"\n", + " Please provide a summary of the following bullet points only.\n", + " - List Python packages and system dependencies (OS, compilers, etc.).\n", + " - Reference environment files (requirements.txt, environment.yml, Dockerfile).\n", + " - Include setup instructions using Conda, virtualenv, or containers.\n", + " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections.\n", + " \"\"\",\n", + " content_id=\"setup_instructions\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## Data Ingestion and Preprocessing\n", + "Let's document how your code handles data, including data sources, validation procedures, and preprocessing steps. We'll examine the data pipeline architecture, covering everything from initial data loading through feature engineering and quality checks." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = explain_code(\n", + " user_instructions=\"\"\"\n", + " Please provide a summary of the following bullet points only.\n", + " - Specify data input formats and sources.\n", + " - Document ingestion, validation, and transformation logic.\n", + " - Explain how raw data is preprocessed and features are generated.\n", + " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections. \"\"\",\n", + " content_id=\"data_handling_notes\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " \n", + "\n", + "\n", + "\n", + "## Model Implementation Details\n", + "Let's document the core implementation details of your model, including its architecture, components, and key algorithms. Understanding the technical implementation is crucial for maintenance, debugging, and future improvements to the codebase. We'll examine how theoretical concepts are translated into working code." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = explain_code(\n", + " user_instructions=\"\"\"\n", + " Please provide a summary of the following bullet points only.\n", + " - Describe the core model code structure (classes, functions).\n", + " - Link code to theoretical models or equations when applicable.\n", + " - Note custom components like loss functions or feature selectors.\n", + " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections.\n", + " \"\"\",\n", + " content_id=\"model_code_description\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## Model Training Pipeline\n", + "\n", + "Let's document the training pipeline implementation, including how models are trained, optimized and evaluated. We'll examine the training process workflow, hyperparameter tuning approach, and model checkpointing mechanisms. This section provides insights into how the model learns from data and achieves optimal performance." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = explain_code(\n", + " user_instructions=\"\"\"\n", + " Please provide a summary of the following bullet points only.\n", + " - Explain the training process, optimization strategy, and hyperparameters.\n", + " - Describe logging, checkpointing, and early stopping mechanisms.\n", + " - Include references to training config files or tuning logic.\n", + " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections.\n", + " \"\"\",\n", + " content_id=\"training_logic_details\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## Evaluation and Validation Code\n", + "Let's examine how the model's validation and evaluation code is implemented, including the metrics calculation and validation processes. We'll explore the diagnostic tools and visualization methods used to assess model performance. This section will also cover how validation results are logged and stored for future reference." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = explain_code(\n", + " user_instructions=\"\"\"\n", + " Please provide a summary of the following bullet points only.\n", + " - Describe how validation is implemented and metrics are calculated.\n", + " - Include plots and diagnostic tools (e.g., ROC, SHAP, confusion matrix).\n", + " - State how outputs are logged and persisted.\n", + " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections.\n", + " \"\"\",\n", + " content_id=\"evaluation_logic_notes\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## Inference and Scoring Logic\n", + "Let's examine how the model performs inference and scoring on new data. This section will cover the implementation details of loading trained models, making predictions, and any required pre/post-processing steps. We'll also look at the APIs and interfaces available for both real-time serving and batch scoring scenarios." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = explain_code(\n", + " user_instructions=\"\"\"\n", + " Please provide a summary of the following bullet points only.\n", + " - Detail how the trained model is loaded and used for predictions.\n", + " - Explain I/O formats and APIs for serving or batch scoring.\n", + " - Include any preprocessing/postprocessing logic required.\n", + " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections.\n", + " \"\"\",\n", + " content_id=\"inference_mechanism\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## Configuration and Parameters\n", + "Let's explore how configuration and parameters are managed in the codebase. We'll examine the configuration files, command-line arguments, environment variables, and other mechanisms used to control model behavior. This section will also cover parameter versioning and how different configurations are tracked across model iterations." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = explain_code(\n", + " user_instructions=\"\"\"\n", + " Please provide a summary of the following bullet points only.\n", + " - Describe configuration management (files, CLI args, env vars).\n", + " - Highlight default parameters and override mechanisms.\n", + " - Reference versioning practices for config files.\n", + " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections.\n", + " \"\"\",\n", + " content_id=\"config_control_notes\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## Unit and Integration Testing\n", + "Let's examine the testing strategy and implementation in the codebase. We'll analyze the unit tests, integration tests, and testing frameworks used to ensure code quality and reliability. This section will also cover test coverage metrics and continuous integration practices." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = explain_code(\n", + " user_instructions=\"\"\"\n", + " Please provide a summary of the following bullet points only.\n", + " - List unit and integration tests and what they cover.\n", + " - Mention testing frameworks and coverage tools used.\n", + " - Explain testing strategy for production-readiness.\n", + " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections.\n", + " \"\"\",\n", + " content_id=\"test_strategy_overview\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## Logging and Monitoring Hooks\n", + "Let's analyze how logging and monitoring are implemented in the codebase. We'll examine the logging configuration, monitoring hooks, and key metrics being tracked. This section will also cover any real-time observability integrations and alerting mechanisms in place." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = explain_code(\n", + " user_instructions=\"\"\"\n", + " Please provide a summary of the following bullet points only.\n", + " - Describe logging configuration and structure.\n", + " - Highlight real-time monitoring or observability integrations.\n", + " - List key events, metrics, or alerts tracked.\n", + " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections.\n", + " \"\"\",\n", + " content_id=\"logging_monitoring_notes\"\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## Code and Model Versioning\n", + "Let's examine how code and model versioning is managed in the codebase. This section will cover version control practices, including Git workflows and model artifact versioning tools like DVC or MLflow. We'll also look at how versioning integrates with the CI/CD pipeline." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = explain_code(\n", + " user_instructions=\"\"\"\n", + " Please provide a summary of the following bullet points only.\n", + " - Describe Git usage, branching, tagging, and commit standards.\n", + " - Include model artifact versioning practices (e.g., DVC, MLflow).\n", + " - Reference any automation in CI/CD.\n", + " Please remove the following sections: \n", + " - Potential Risks or Failure Points\n", + " - Assumptions or Limitations\n", + " - Breakdown of Key Functions or Components\n", + " Please don't add any other sections.\n", + " \"\"\",\n", + " content_id=\"version_tracking_description\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## Security and Access Control\n", + "Let's analyze the security and access control measures implemented in the codebase. We'll examine how sensitive data and code are protected through access controls, encryption, and compliance measures. Additionally, we'll review secure deployment practices and any specific handling of PII data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = explain_code(\n", + " user_instructions=\"\"\"\n", + " Please provide a summary of the following bullet points only.\n", + " - Document access controls for source code and data.\n", + " - Include any encryption, PII handling, or compliance measures.\n", + " - Mention secure deployment practices.\n", + " Please remove the following sections: \n", + " - Potential Risks or Failure Points\n", + " - Assumptions or Limitations\n", + " - Breakdown of Key Functions or Components\n", + " Please don't add any other sections.\n", + " \"\"\",\n", + " content_id=\"security_policies_notes\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## Example Runs and Scripts\n", + "Let's explore example runs and scripts that demonstrate how to use this codebase in practice. We'll look at working examples, command-line usage, and sample notebooks that showcase the core functionality. This section will also point to demo datasets and test scenarios that can help new users get started quickly." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = explain_code(\n", + " user_instructions=\"\"\"\n", + " Please provide a summary of the following bullet points only.\n", + " - Provide working script examples.\n", + " - Include CLI usage instructions or sample notebooks.\n", + " - Link to demo datasets or test scenarios.\n", + " Please remove the following sections: \n", + " - Potential Risks or Failure Points\n", + " - Assumptions or Limitations\n", + " - Breakdown of Key Functions or Components\n", + " Please don't add any other sections.\n", + " \"\"\",\n", + " content_id=\"runnable_examples\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## Known Issues and Future Improvements\n", + "Let's examine the current limitations and areas for improvement in the codebase. This section will document known technical debt, bugs, and feature gaps that need to be addressed. We'll also outline proposed enhancements and reference any existing tickets or GitHub issues tracking these improvements." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = explain_code(\n", + " user_instructions=\"\"\"\n", + " Please provide a summary of the following bullet points only.\n", + " - List current limitations or technical debt.\n", + " - Outline proposed enhancements or refactors.\n", + " - Reference relevant tickets, GitHub issues, or roadmap items.\n", + " Please remove Potential Risks or Failure Points and Assumptions or Limitations sections. Please don't add any other sections.\n", + " \"\"\",\n", + " content_id=\"issues_and_improvements_log\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "copyright-72ed6e2a48984af3aca5888b96d1f6b6", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "***\n", + "\n", + "Copyright © 2023-2026 ValidMind Inc. All rights reserved.
\n", + "Refer to [LICENSE](https://github.com/validmind/validmind-library/blob/main/LICENSE) for details.
\n", + "SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "validmind-1QuffXMV-py3.11", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 } diff --git a/notebooks/use_cases/credit_risk/application_scorecard_executive.ipynb b/notebooks/use_cases/credit_risk/application_scorecard_executive.ipynb index 15d96737b..40388556c 100644 --- a/notebooks/use_cases/credit_risk/application_scorecard_executive.ipynb +++ b/notebooks/use_cases/credit_risk/application_scorecard_executive.ipynb @@ -211,7 +211,8 @@ " api_host = \"https://api.prod.validmind.ai/api/v1/tracking\",\n", " api_key = \"...\",\n", " api_secret = \"...\",\n", - " model = \"...\"\n", + " model = \"...\",\n", + " document=\"documentation\",\n", ")" ] }, @@ -342,7 +343,7 @@ }, { "cell_type": "markdown", - "id": "copyright-97f68fa25e694a059b7028ce3ec374cc", + "id": "copyright-382e83e3fe1d4928ae90c3917480d27d", "metadata": {}, "source": [ "\n", diff --git a/notebooks/use_cases/credit_risk/application_scorecard_full_suite.ipynb b/notebooks/use_cases/credit_risk/application_scorecard_full_suite.ipynb index aef424482..e64a206e4 100644 --- a/notebooks/use_cases/credit_risk/application_scorecard_full_suite.ipynb +++ b/notebooks/use_cases/credit_risk/application_scorecard_full_suite.ipynb @@ -232,7 +232,8 @@ " api_host = \"https://api.prod.validmind.ai/api/v1/tracking\",\n", " api_key = \"...\",\n", " api_secret = \"...\",\n", - " model = \"...\"\n", + " model = \"...\",\n", + " document=\"documentation\",\n", ")" ] }, @@ -862,7 +863,7 @@ }, { "cell_type": "markdown", - "id": "copyright-ce253f0d12144a08847d4a65a250a85f", + "id": "copyright-11ff9f7bf7724930b1bc81a5585f4a94", "metadata": {}, "source": [ "\n", diff --git a/notebooks/use_cases/credit_risk/application_scorecard_with_bias.ipynb b/notebooks/use_cases/credit_risk/application_scorecard_with_bias.ipynb index d13352768..7c2f6a6f3 100644 --- a/notebooks/use_cases/credit_risk/application_scorecard_with_bias.ipynb +++ b/notebooks/use_cases/credit_risk/application_scorecard_with_bias.ipynb @@ -234,6 +234,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " document=\"documentation\",\n", ")" ] }, @@ -1507,7 +1508,7 @@ }, { "cell_type": "markdown", - "id": "copyright-6ee1a1ce0bd74036a8890be21965bfd2", + "id": "copyright-f527b18ca3b94779bf6194547accab31", "metadata": {}, "source": [ "\n", diff --git a/notebooks/use_cases/credit_risk/application_scorecard_with_ml.ipynb b/notebooks/use_cases/credit_risk/application_scorecard_with_ml.ipynb index c69ae1207..7d85be4fb 100644 --- a/notebooks/use_cases/credit_risk/application_scorecard_with_ml.ipynb +++ b/notebooks/use_cases/credit_risk/application_scorecard_with_ml.ipynb @@ -245,7 +245,8 @@ " api_host = \"https://api.prod.validmind.ai/api/v1/tracking\",\n", " api_key = \"...\",\n", " api_secret = \"...\",\n", - " model = \"...\"\n", + " model = \"...\",\n", + " document=\"documentation\",\n", ")" ] }, @@ -1957,7 +1958,7 @@ }, { "cell_type": "markdown", - "id": "copyright-00e4b240625f4af29adb179235912142", + "id": "copyright-ced6c79ffe99424fa8a48ed9ce2be9a5", "metadata": {}, "source": [ "\n", diff --git a/notebooks/use_cases/credit_risk/document_excel_application_scorecard.ipynb 
b/notebooks/use_cases/credit_risk/document_excel_application_scorecard.ipynb index 9ab901278..22336ad18 100644 --- a/notebooks/use_cases/credit_risk/document_excel_application_scorecard.ipynb +++ b/notebooks/use_cases/credit_risk/document_excel_application_scorecard.ipynb @@ -235,6 +235,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"documentation\",\n", ")" ] }, @@ -966,6 +967,7 @@ }, { "cell_type": "markdown", + "id": "copyright-d5682d445de64cada526101fd53872e2", "metadata": {}, "source": [ "\n", diff --git a/notebooks/use_cases/model_validation/validate_application_scorecard.ipynb b/notebooks/use_cases/model_validation/validate_application_scorecard.ipynb index 77e48bb90..82b692998 100644 --- a/notebooks/use_cases/model_validation/validate_application_scorecard.ipynb +++ b/notebooks/use_cases/model_validation/validate_application_scorecard.ipynb @@ -309,6 +309,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"validation-report\",\n", ")" ] }, @@ -1846,6 +1847,7 @@ }, { "cell_type": "markdown", + "id": "copyright-bb563dd58ddf40f49499ddf7f72b21a1", "metadata": {}, "source": [ "\n", diff --git a/notebooks/use_cases/nlp_and_llm/foundation_models_integration_demo.ipynb b/notebooks/use_cases/nlp_and_llm/foundation_models_integration_demo.ipynb index 346ecb6e1..813acd5d9 100644 --- a/notebooks/use_cases/nlp_and_llm/foundation_models_integration_demo.ipynb +++ b/notebooks/use_cases/nlp_and_llm/foundation_models_integration_demo.ipynb @@ -186,6 +186,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"documentation\",\n", ")" ] }, @@ -457,7 +458,7 @@ }, { "cell_type": "markdown", - "id": "copyright-114aa3e23e7e44318b66971760526b6f", + "id": "copyright-5ab032f1745c4c7cab699f02156c7ff4", "metadata": {}, "source": [ "\n", diff --git a/notebooks/use_cases/nlp_and_llm/foundation_models_summarization_demo.ipynb b/notebooks/use_cases/nlp_and_llm/foundation_models_summarization_demo.ipynb index a7f06ab15..cfb50ce82 100644 --- a/notebooks/use_cases/nlp_and_llm/foundation_models_summarization_demo.ipynb +++ b/notebooks/use_cases/nlp_and_llm/foundation_models_summarization_demo.ipynb @@ -187,6 +187,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"documentation\",\n", ")" ] }, @@ -506,7 +507,7 @@ }, { "cell_type": "markdown", - "id": "copyright-b34e4189be964082bb87b10aa94dcf6a", + "id": "copyright-efc3f6aeaef44ebebe6833e093c660f8", "metadata": {}, "source": [ "\n", diff --git a/notebooks/use_cases/nlp_and_llm/hugging_face_integration_demo.ipynb b/notebooks/use_cases/nlp_and_llm/hugging_face_integration_demo.ipynb index c424355de..082b4fb6e 100644 --- a/notebooks/use_cases/nlp_and_llm/hugging_face_integration_demo.ipynb +++ b/notebooks/use_cases/nlp_and_llm/hugging_face_integration_demo.ipynb @@ -188,6 +188,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"documentation\",\n", ")" ] }, @@ -441,7 +442,7 @@ }, { "cell_type": "markdown", - "id": "copyright-8e54f1f3ff334b529685ef57073abb3e", + "id": "copyright-5cf8dfbd26c24ae79e9f222e70152f95", "metadata": {}, "source": [ "\n", diff --git a/notebooks/use_cases/nlp_and_llm/hugging_face_summarization_demo.ipynb b/notebooks/use_cases/nlp_and_llm/hugging_face_summarization_demo.ipynb index 14816ce33..fc4eefd75 100644 --- a/notebooks/use_cases/nlp_and_llm/hugging_face_summarization_demo.ipynb +++ 
b/notebooks/use_cases/nlp_and_llm/hugging_face_summarization_demo.ipynb @@ -185,6 +185,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"documentation\",\n", ")" ] }, @@ -462,7 +463,7 @@ }, { "cell_type": "markdown", - "id": "copyright-14c6f04dcd164deb8a1db44fde050729", + "id": "copyright-8bfb7d59fd9f4116804faf34ac4fe1fa", "metadata": {}, "source": [ "\n", diff --git a/notebooks/use_cases/nlp_and_llm/llm_summarization_demo.ipynb b/notebooks/use_cases/nlp_and_llm/llm_summarization_demo.ipynb index fe6e87956..69f5dcd4c 100644 --- a/notebooks/use_cases/nlp_and_llm/llm_summarization_demo.ipynb +++ b/notebooks/use_cases/nlp_and_llm/llm_summarization_demo.ipynb @@ -219,6 +219,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"documentation\",\n", ")" ] }, @@ -979,7 +980,7 @@ }, { "cell_type": "markdown", - "id": "copyright-f38869c3ab0c4de7989f536d06b51773", + "id": "copyright-f65649d1aec9483c95ad944f3fc55c73", "metadata": {}, "source": [ "\n", diff --git a/notebooks/use_cases/nlp_and_llm/prompt_validation_demo.ipynb b/notebooks/use_cases/nlp_and_llm/prompt_validation_demo.ipynb index 7cea4fff1..1f0114a3c 100644 --- a/notebooks/use_cases/nlp_and_llm/prompt_validation_demo.ipynb +++ b/notebooks/use_cases/nlp_and_llm/prompt_validation_demo.ipynb @@ -215,6 +215,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"documentation\",\n", ")" ] }, @@ -513,7 +514,7 @@ }, { "cell_type": "markdown", - "id": "copyright-4101a5c77a954664ba8d8a682bee1a1c", + "id": "copyright-da0317263ddc4a119cb7b306ac1b39c1", "metadata": {}, "source": [ "\n", diff --git a/notebooks/use_cases/nlp_and_llm/rag_benchmark_demo.ipynb b/notebooks/use_cases/nlp_and_llm/rag_benchmark_demo.ipynb index 51515ad7b..bb0a89b5b 100644 --- a/notebooks/use_cases/nlp_and_llm/rag_benchmark_demo.ipynb +++ b/notebooks/use_cases/nlp_and_llm/rag_benchmark_demo.ipynb @@ -1,1869 +1,1870 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# RAG Model Benchmarking Demo\n", - "\n", - "In this notebook, we are going to implement a simple RAG Model for automating the process of answering RFP questions using GenAI. We will see how we can initialize an embedding model, a retrieval model and a generator model with LangChain components and use them within the ValidMind Library to run tests against them. We'll demonstrate how to set up multiple models for benchmarking at each stage of the RAG pipeline - specifically two embedding models, two retrieval models with different parameters, and two LLM models (GPT-3.5 and GPT-4o) - allowing for comparison of performance across different configurations. Finally, we will see how we can put them together in a Pipeline and run that to get e2e results and run tests against that." 
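Before the step-by-step build below, here is a minimal sketch of the pipeline idea described in this introduction, assuming each stage has already been wrapped as a ValidMind model with `vm.init_model(..., predict_fn=...)`. The stage functions and names are placeholders rather than the notebook's actual code; how the combined pipeline is registered and evaluated end-to-end is covered later in the notebook.

```python
import validmind as vm

# Placeholder stage functions -- stand-ins for the real embedding, retrieval,
# and generation logic that this notebook builds with LangChain components.
def embed(input):
    return [0.0, 0.0, 0.0]          # would return the question embedding

def retrieve(input):
    return ["(retrieved context)"]  # would return similar Q&A pairs from the vector store

def generate(input):
    return "(generated answer)"     # would return the LLM-generated answer

vm_embedder = vm.init_model(input_id="embedder", predict_fn=embed)
vm_retriever = vm.init_model(input_id="retriever", predict_fn=retrieve)
vm_generator = vm.init_model(input_id="generator", predict_fn=generate)

# Stages compose end-to-end with the `|` operator into a single pipeline model,
# so the RAG system can be tested both per component and as a whole.
rag_pipeline = vm_embedder | vm_retriever | vm_generator
```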
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "::: {.content-hidden when-format=\"html\"}\n", - "## Contents \n", - "- [About ValidMind](#toc1__) \n", - " - [Before you begin](#toc1_1__) \n", - " - [New to ValidMind?](#toc1_2__) \n", - " - [Key concepts](#toc1_3__) \n", - "- [Setting up](#toc2__) \n", - " - [Install the ValidMind Library](#toc2_1__) \n", - " - [Initialize the ValidMind Library](#toc2_2__) \n", - " - [Register sample model](#toc2_2_1__) \n", - " - [Apply documentation template](#toc2_2_2__) \n", - " - [Get your code snippet](#toc2_2_3__) \n", - "- [Read Open AI API Key](#toc3__) \n", - "- [Dataset Loader](#toc4__) \n", - "- [Data validation](#toc5__) \n", - " - [Duplicates](#toc5_1__) \n", - " - [Stop Words](#toc5_2__) \n", - " - [Punctuations](#toc5_3__) \n", - " - [Common Words](#toc5_4__) \n", - " - [Language Detection](#toc5_5__) \n", - " - [Toxicity Score](#toc5_6__) \n", - " - [Polarity and Subjectivity](#toc5_7__) \n", - " - [Sentiment](#toc5_8__) \n", - " - [Assign Predictions](#toc5_9__) \n", - " - [Run tests](#toc5_10__) \n", - " - [Generate embeddings for the Train Set](#toc5_11__) \n", - " - [Insert embeddings and questions into Vector DB](#toc5_12__) \n", - "- [Prompt Evaluation](#toc6__) \n", - "- [RAGAS evaluation](#toc7__) \n", - " - [Semantic Similarity](#toc7_1__) \n", - " - [Context Entity Recall](#toc7_2__) \n", - " - [Context Precision](#toc7_3__) \n", - " - [Context Precision Without Reference](#toc7_4__) \n", - " - [Faithfulness](#toc7_5__) \n", - " - [Response Relevancy](#toc7_6__) \n", - " - [Context Recall](#toc7_7__) \n", - " - [Answer Correctness](#toc7_8__) \n", - " - [Aspect Critic](#toc7_9__) \n", - " - [Noise Sensitivity](#toc7_10__) \n", - "- [Generation quality](#toc8__) \n", - " - [Token Disparity](#toc8_1__) \n", - " - [ROUGE Score](#toc8_2__) \n", - " - [BLEU Score](#toc8_3__) \n", - " - [BERT Score](#toc8_4__) \n", - " - [METEOR Score](#toc8_5__) \n", - "- [Bias and Toxicity](#toc9__) \n", - " - [Toxicity Score](#toc9_1__) \n", - " - [Regard Score](#toc9_2__) \n", - "- [Upgrade ValidMind](#toc10__) \n", - "\n", - ":::\n", - "\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## About ValidMind\n", - "\n", - "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models.\n", - "\n", - "You use the ValidMind Library to automate documentation and validation tests, and then use the ValidMind Platform to collaborate on model documentation. Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators.\n", - "\n", - "\n", - "\n", - "### Before you begin\n", - "\n", - "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language. \n", - "\n", - "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. 
For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html).\n", - "\n", - "\n", - "\n", - "### New to ValidMind?\n", - "\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", - "\n", - "
For access to all features available in this notebook, you'll need access to a ValidMind account.\n", - "

\n", - "Register with ValidMind
\n", - "\n", - "\n", - "\n", - "### Key concepts\n", - "\n", - "- **FunctionModels**: ValidMind offers support for creating `VMModel` instances from Python functions. This enables us to support any \"model\" by simply using the provided function as the model's `predict` method.\n", - "- **PipelineModels**: ValidMind models (`VMModel` instances) of any type can be piped together to create a model pipeline. This allows model components to be created and tested/documented independently, and then combined into a single model for end-to-end testing and documentation. We use the `|` operator to pipe models together.\n", - "- **RAG**: RAG stands for Retrieval Augmented Generation and refers to a wide range of GenAI applications where some form of retrieval is used to add context to the prompt so that the LLM that generates content can refer to it when creating its output. In this notebook, we are going to implement a simple RAG setup using LangChain components." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Prerequisites\n", - "\n", - "Let's go ahead and install the `validmind` library if its not already installed... Then we can install the `qdrant-client` library for our vector store and `langchain` for everything else:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -q \"validmind[llm]\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -q qdrant-client langchain langchain-openai sentencepiece" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Setting up" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Install the ValidMind Library\n", - "\n", - "
Recommended Python versions\n", - "

\n", - "Python 3.8 <= x <= 3.11
\n", - "\n", - "To install the library:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -q validmind" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Initialize the ValidMind Library" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Register sample model\n", - "\n", - "Let's first register a sample model for use with this notebook:\n", - "\n", - "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", - "\n", - "2. In the left sidebar, navigate to **Inventory** and click **+ Register Model**.\n", - "\n", - "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", - "\n", - "4. Select your own name under the **MODEL OWNER** drop-down.\n", - "\n", - "5. Click **Register Model** to add the model to your inventory." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Apply documentation template\n", - "\n", - "Once you've registered your model, let's select a documentation template. A template predefines sections for your model documentation and provides a general outline to follow, making the documentation process much easier.\n", - "\n", - "1. In the left sidebar that appears for your model, click **Documents** and select **Documentation**.\n", - "\n", - "2. Under **TEMPLATE**, select `Gen AI RAG`.\n", - "\n", - "3. Click **Use Template** to apply the template." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
Can't select this template?\n", - "

\n", - "Your organization administrators may need to add it to your template library:\n", - "\n", - "
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Get your code snippet\n", - "\n", - "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", - "\n", - "1. On the left sidebar that appears for your model, select **Getting Started** and click **Copy snippet to clipboard**.\n", - "2. Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Load your model identifier credentials from an `.env` file\n", - "\n", - "%load_ext dotenv\n", - "%dotenv .env\n", - "\n", - "# Or replace with your code snippet\n", - "\n", - "import validmind as vm\n", - "\n", - "vm.init(\n", - " api_host = \"https://api.prod.validmind.ai/api/v1/tracking\",\n", - " api_key = \"...\",\n", - " api_secret = \"...\",\n", - " model = \"...\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Read Open AI API Key\n", - "\n", - "We will need to have an OpenAI API key to be able to use their `text-embedding-3-small` and `text-embedding-3-large` models for our embeddings, `gpt-3.5-turbo` and `gpt-4o` models for our generator and `gpt-4o` model for our LLM-as-Judge tests. If you don't have an OpenAI API key, you can get one by signing up at [OpenAI](https://platform.openai.com/signup). Then you can create a `.env` file in the root of your project and the following cell will load it from there. Alternatively, you can just uncomment the line below to directly set the key (not recommended for security reasons)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# load openai api key\n", - "import os\n", - "\n", - "import dotenv\n", - "import nltk\n", - "\n", - "dotenv.load_dotenv()\n", - "nltk.download('stopwords')\n", - "nltk.download('punkt_tab')\n", - "\n", - "# os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"\n", - "\n", - "if not \"OPENAI_API_KEY\" in os.environ:\n", - " raise ValueError(\"OPENAI_API_KEY is not set\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Dataset Loader\n", - "\n", - "Great, now that we have all of our dependencies installed, the ValidMind Library initialized and connected to our model and our OpenAI API key setup, we can go ahead and load our datasets. We will use the synthetic `RFP` dataset included with ValidMind for this notebook. This dataset contains a variety of RFP questions and ground truth answers that we can use both as the source where our Retriever will search for similar question-answer pairs as well as our test set for evaluating the performance of our RAG model. To do this, we just have to load it and call the preprocess function to get a split of the data into train and test sets." 
- ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# Import the sample dataset from the library\n", - "from validmind.datasets.llm.rag import rfp\n", - "\n", - "raw_df = rfp.load_data()\n", - "train_df, test_df = rfp.preprocess(raw_df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_train_ds = vm.init_dataset(\n", - " train_df,\n", - " text_column=\"question\",\n", - " target_column=\"ground_truth\",\n", - ")\n", - "\n", - "vm_test_ds = vm.init_dataset(\n", - " test_df,\n", - " text_column=\"question\",\n", - " target_column=\"ground_truth\",\n", - ")\n", - "\n", - "vm_test_ds.df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Data validation\n", - "\n", - "Now that we have loaded our dataset, we can go ahead and run some data validation tests right away to start assessing and documenting the quality of our data. Since we are using a text dataset, we can use ValidMind's built-in array of text data quality tests to check that things like number of duplicates, missing values, and other common text data issues are not present in our dataset. We can also run some tests to check the sentiment and toxicity of our data." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Duplicates\n", - "\n", - "First, let's check for duplicates in our dataset. We can use the `validmind.data_validation.Duplicates` test and pass our dataset:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from validmind.tests import run_test\n", - "\n", - "run_test(\n", - " test_id=\"validmind.data_validation.Duplicates\",\n", - " inputs={\"dataset\": vm_train_ds},\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Stop Words\n", - "\n", - "Next, let's check for stop words in our dataset. We can use the `validmind.data_validation.StopWords` test and pass our dataset:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " test_id=\"validmind.data_validation.nlp.StopWords\",\n", - " inputs={\n", - " \"dataset\": vm_train_ds,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Punctuations\n", - "\n", - "Next, let's check for punctuations in our dataset. We can use the `validmind.data_validation.Punctuations` test:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " test_id=\"validmind.data_validation.nlp.Punctuations\",\n", - " inputs={\n", - " \"dataset\": vm_train_ds,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Common Words\n", - "\n", - "Next, let's check for common words in our dataset. 
We can use the `validmind.data_validation.nlp.CommonWords` test:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " test_id=\"validmind.data_validation.nlp.CommonWords\",\n", - " inputs={\n", - " \"dataset\": vm_train_ds,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Language Detection\n", - "\n", - "For documentation purposes, we can detect and log the languages used in the dataset with the `validmind.data_validation.nlp.LanguageDetection` test:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " test_id=\"validmind.data_validation.nlp.LanguageDetection\",\n", - " inputs={\n", - " \"dataset\": vm_train_ds,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Toxicity Score\n", - "\n", - "Now, let's go ahead and run the `validmind.data_validation.nlp.Toxicity` test to compute a toxicity score for our dataset:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.data_validation.nlp.Toxicity\",\n", - " inputs={\n", - " \"dataset\": vm_train_ds,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Polarity and Subjectivity\n", - "\n", - "We can also run the `validmind.data_validation.nlp.PolarityAndSubjectivity` test to compute the polarity and subjectivity of our dataset:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.data_validation.nlp.PolarityAndSubjectivity\",\n", - " inputs={\n", - " \"dataset\": vm_train_ds,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Sentiment\n", - "\n", - "Finally, we can run the `validmind.data_validation.nlp.Sentiment` test to plot the sentiment of our dataset:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.data_validation.nlp.Sentiment\",\n", - " inputs={\n", - " \"dataset\": vm_train_ds,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Embedding Model\n", - "\n", - "Now that we have our dataset loaded and have run some data validation tests to assess and document the quality of our data, we can go ahead and initialize our embedding models. We will use the `text-embedding-3-small` and `text-embedding-3-large` models from OpenAI for this purpose, wrapped in the `OpenAIEmbeddings` class from LangChain. These models will be used to \"embed\" our questions both for inserting the question-answer pairs from the \"train\" set into the vector store and for embedding the input question when making predictions with our RAG model." 
 - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "from langchain_openai import OpenAIEmbeddings\n", - "\n", - "embedding_small_client = OpenAIEmbeddings(model=\"text-embedding-3-small\")\n", - "\n", - "\n", - "def embed_small(input):\n", - " \"\"\"Returns a text embedding for the given text\"\"\"\n", - " return embedding_small_client.embed_query(input[\"question\"])\n", - "\n", - "\n", - "vm_embedder_small = vm.init_model(input_id=\"embedding_small_model\", predict_fn=embed_small)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "embedding_large_client = OpenAIEmbeddings(model=\"text-embedding-3-large\")\n", - "\n", - "\n", - "def embed_large(input):\n", - " \"\"\"Returns a text embedding for the given text\"\"\"\n", - " return embedding_large_client.embed_query(input[\"question\"])\n", - "\n", - "\n", - "vm_embedder_large = vm.init_model(input_id=\"embedding_large_model\", predict_fn=embed_large)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "What we have done here is to initialize the `OpenAIEmbeddings` class so it uses OpenAI's `text-embedding-3-small` and `text-embedding-3-large` models. We then created the `embed_small` and `embed_large` functions, each of which takes in an `input` dictionary and uses the `embed_query` method of its embedding client to compute the embedding of the `question`. We use prediction functions like these since that is how ValidMind supports any custom model. We will use this strategy for the retrieval and generator models as well, but you could also use, say, a HuggingFace model directly. See the documentation for more information on which model types are directly supported: [ValidMind Documentation](https://docs.validmind.ai/validmind/validmind.html). Finally, we use the `init_model` function from the ValidMind Library to create a `VMModel` object that can be used in ValidMind tests. This also logs the model to our model documentation, and any test that uses the model will be linked to the logged model and its metadata." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Assign Predictions\n", - "\n", - "To precompute the embeddings for our test set, we can call the `assign_predictions` method of our `vm_test_ds` object we created above. This will compute the embeddings for each question in the test set and store them in a special prediction column of the test set that's linked to the corresponding embedding model. This will allow us to use these embeddings later when we run tests against our embedding models." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_test_ds.assign_predictions(vm_embedder_small)\n", - "vm_test_ds.assign_predictions(vm_embedder_large)\n", - "print(vm_test_ds)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Run tests\n", - "\n", - "Now that everything is set up for the embedding models, we can go ahead and run some tests to assess and document the quality of our embeddings. We will use the `validmind.model_validation.embeddings.*` tests to compute a variety of metrics against our models." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.embeddings.StabilityAnalysisRandomNoise\",\n", - " input_grid={\n", - " \"model\": [vm_embedder_small, vm_embedder_large],\n", - " \"dataset\": [vm_test_ds],\n", - " },\n", - " params={\n", - " \"probability\": 0.3,\n", - " \"mean_similarity_threshold\": 0.7,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.embeddings.StabilityAnalysisSynonyms\",\n", - " input_grid={\n", - " \"model\": [vm_embedder_small, vm_embedder_large],\n", - " \"dataset\": [vm_test_ds],\n", - " },\n", - " params={\n", - " \"probability\": 0.3,\n", - " \"mean_similarity_threshold\": 0.7,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.embeddings.StabilityAnalysisTranslation\",\n", - " input_grid={\n", - " \"model\": [vm_embedder_small, vm_embedder_large],\n", - " \"dataset\": [vm_test_ds],\n", - " },\n", - " params={\n", - " \"source_lang\": \"en\",\n", - " \"target_lang\": \"fr\",\n", - " \"mean_similarity_threshold\": 0.7,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.embeddings.CosineSimilarityHeatmap\",\n", - " input_grid={\n", - " \"model\": [vm_embedder_small, vm_embedder_large],\n", - " \"dataset\": [vm_test_ds],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.embeddings.CosineSimilarityDistribution\",\n", - " input_grid={\n", - " \"model\": [vm_embedder_small, vm_embedder_large],\n", - " \"dataset\": [vm_test_ds],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.embeddings.PCAComponentsPairwisePlots\",\n", - " input_grid={\n", - " \"model\": [vm_embedder_small, vm_embedder_large],\n", - " \"dataset\": [vm_test_ds],\n", - " },\n", - " params={\n", - " \"n_components\": 3,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Setup Vector Store\n", - "\n", - "Great, so now that we have assessed our embedding model and verified that it is performing well, we can go ahead and use it to compute embeddings for our question-answer pairs in the \"train\" set. We will then use these embeddings to insert the question-answer pairs into a vector store. We will use an in-memory `qdrant` vector database for demo purposes but any option would work just as well here. We will use the `QdrantClient` class from LangChain to interact with the vector store. This class will allow us to insert and search for embeddings in the vector store." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Generate embeddings for the Train Set\n", - "\n", - "We can use the same `assign_predictions` method from earlier except this time we will use the `vm_train_ds` object to compute the embeddings for the question-answer pairs in the \"train\" set." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_train_ds.assign_predictions(vm_embedder_small)\n", - "print(vm_train_ds)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Insert embeddings and questions into Vector DB\n", - "\n", - "Now that we have computed the embeddings for our question-answer pairs in the \"train\" set, we can go ahead and insert them into the vector store:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from langchain_community.vectorstores import Qdrant\n", - "from langchain_community.document_loaders import DataFrameLoader\n", - "\n", - "# load documents from dataframe\n", - "loader = DataFrameLoader(train_df, page_content_column=\"question\")\n", - "docs = loader.load()\n", - "\n", - "# setup vector datastore\n", - "qdrant = Qdrant.from_documents(\n", - " docs,\n", - " embedding_small_client,\n", - " location=\":memory:\", # Local mode with in-memory storage only\n", - " collection_name=\"rfp_rag_collection\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Retrieval Model\n", - "\n", - "Now that we have an embedding model and a vector database setup and loaded with our data, we need a Retrieval model that can search for similar question-answer pairs for a given input question. Once created, we can initialize this as a ValidMind model and `assign_predictions` to it just like our embedding model. In this example, we'll create two retrieval models with different `k` parameters (the number of documents retrieved) to benchmark and compare their performance. This approach allows us to evaluate how retrieval depth affects the overall system quality." 
- ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [], - "source": [ - "def retrieve(input):\n", - " contexts = []\n", - "\n", - " for result in qdrant.similarity_search_with_score(input[\"question\"], k=5):\n", - " document, score = result\n", - " context = f\"Q: {document.page_content}\\n\"\n", - " context += f\"A: {document.metadata['ground_truth']}\\n\"\n", - "\n", - " contexts.append(context)\n", - "\n", - " return contexts\n", - "\n", - "\n", - "vm_retriever_k5 = vm.init_model(input_id=\"retrieval_k5_model\", predict_fn=retrieve)" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "def retrieve(input):\n", - " contexts = []\n", - "\n", - " for result in qdrant.similarity_search_with_score(input[\"question\"], k=10):\n", - " document, score = result\n", - " context = f\"Q: {document.page_content}\\n\"\n", - " context += f\"A: {document.metadata['ground_truth']}\\n\"\n", - "\n", - " contexts.append(context)\n", - "\n", - " return contexts\n", - "\n", - "\n", - "vm_retriever_k10 = vm.init_model(input_id=\"retrieval_k10_model\", predict_fn=retrieve)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_test_ds.assign_predictions(model=vm_retriever_k5)\n", - "vm_test_ds.assign_predictions(model=vm_retriever_k10)\n", - "print(vm_test_ds)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_test_ds._df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Generation Model\n", - "\n", - "As the final piece of this simple RAG pipeline, we can create and initialize a generation model that will use the retrieved context to generate an answer to the input question. We will use the `gpt-3.5-turbo` and `gpt-4o` models from OpenAI. Since we have two retrieval models (with different `k` values) and want to test two different LLMs, we'll create a total of four generator models - pairing each retrieval configuration with each LLM to comprehensively evaluate how both retrieval depth and model capability affect response quality." 
- ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [], - "source": [ - "from openai import OpenAI\n", - "\n", - "from validmind.models import Prompt\n", - "\n", - "\n", - "system_prompt = \"\"\"\n", - "You are an expert RFP AI assistant.\n", - "You are tasked with answering new RFP questions based on existing RFP questions and answers.\n", - "You will be provided with the existing RFP questions and answer pairs that are the most relevant to the new RFP question.\n", - "After that you will be provided with a new RFP question.\n", - "You will generate an answer and respond only with the answer.\n", - "Ignore your pre-existing knowledge and answer the question based on the provided context.\n", - "\"\"\".strip()\n", - "\n", - "openai_client = OpenAI()" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [], - "source": [ - "def generate(input):\n", - " \n", - " response = openai_client.chat.completions.create(\n", - " model=\"gpt-3.5-turbo\",\n", - " messages=[\n", - " {\"role\": \"system\", \"content\": system_prompt},\n", - " {\"role\": \"user\", \"content\": \"\\n\\n\".join(input[\"retrieval_k5_model\"])},\n", - " {\"role\": \"user\", \"content\": input[\"question\"]},\n", - " ],\n", - " )\n", - " \n", - " return response.choices[0].message.content\n", - "\n", - "\n", - "vm_generator_k5_gpt35 = vm.init_model(\n", - " input_id=\"generation_k5_gpt35_model\",\n", - " predict_fn=generate,\n", - " prompt=Prompt(template=system_prompt),\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [], - "source": [ - "def generate(input):\n", - " response = openai_client.chat.completions.create(\n", - " model=\"gpt-3.5-turbo\",\n", - " messages=[\n", - " {\"role\": \"system\", \"content\": system_prompt},\n", - " {\"role\": \"user\", \"content\": \"\\n\\n\".join(input[\"retrieval_k10_model\"])},\n", - " {\"role\": \"user\", \"content\": input[\"question\"]},\n", - " ],\n", - " )\n", - "\n", - " return response.choices[0].message.content\n", - "\n", - "\n", - "vm_generator_k10_gpt35 = vm.init_model(\n", - " input_id=\"generation_k10_gpt35_model\",\n", - " predict_fn=generate,\n", - " prompt=Prompt(template=system_prompt),\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [], - "source": [ - "def generate(input):\n", - " \n", - " response = openai_client.chat.completions.create(\n", - " model=\"gpt-4o\",\n", - " messages=[\n", - " {\"role\": \"system\", \"content\": system_prompt},\n", - " {\"role\": \"user\", \"content\": \"\\n\\n\".join(input[\"retrieval_k5_model\"])},\n", - " {\"role\": \"user\", \"content\": input[\"question\"]},\n", - " ],\n", - " )\n", - " \n", - " return response.choices[0].message.content\n", - "\n", - "\n", - "vm_generator_k5_gpt4o = vm.init_model(\n", - " input_id=\"generation_k5_gpt4o_model\",\n", - " predict_fn=generate,\n", - " prompt=Prompt(template=system_prompt),\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def generate(input):\n", - " response = openai_client.chat.completions.create(\n", - " model=\"gpt-4o\",\n", - " messages=[\n", - " {\"role\": \"system\", \"content\": system_prompt},\n", - " {\"role\": \"user\", \"content\": \"\\n\\n\".join(input[\"retrieval_k10_model\"])},\n", - " {\"role\": \"user\", \"content\": input[\"question\"]},\n", - " ],\n", - " )\n", - "\n", - " return 
response.choices[0].message.content\n", - "\n", - "\n", - "vm_generator_k10_gpt4o = vm.init_model(\n", - " input_id=\"generation_k10_gpt4o_model\",\n", - " predict_fn=generate,\n", - " prompt=Prompt(template=system_prompt),\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's test it out real quick:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "vm_generator_k5_gpt35.predict(\n", - " pd.DataFrame(\n", - " {\"retrieval_k5_model\": [[\"My name is anil\"]], \"question\": [\"what is my name\"]}\n", - " )\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_generator_k5_gpt4o.predict(\n", - " pd.DataFrame(\n", - " {\"retrieval_k5_model\": [[\"My name is anil\"]], \"question\": [\"what is my name\"]}\n", - " )\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Prompt Evaluation\n", - "\n", - "Now that we have our generator models initialized, we can run some LLM-as-Judge tests to evaluate the system prompt. This will allow us to get an initial sense of how well the prompt meets a few best practices for prompt engineering. These tests use an LLM to rate the prompt on a scale of 1-10 against the following criteria:\n", - "\n", - "- **Exemplar Bias**: When using multi-shot prompting, does the prompt contain an unbiased distribution of examples?\n", - "- **Delimitation**: When using complex prompts containing examples, contextual information, or other elements, is the prompt formatted in such a way that each element is clearly separated?\n", - "- **Clarity**: How clearly the prompt states the task.\n", - "- **Conciseness**: How succinctly the prompt states the task.\n", - "- **Instruction Framing**: Whether the prompt contains negative instructions.\n", - "- **Specificity**: How specifically the prompt defines the task." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.prompt_validation.Bias\",\n", - " inputs={\n", - " \"model\": vm_generator_k5_gpt4o,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.prompt_validation.Clarity\",\n", - " inputs={\n", - " \"model\": vm_generator_k5_gpt4o,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.prompt_validation.Conciseness\",\n", - " inputs={\n", - " \"model\": vm_generator_k5_gpt4o,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.prompt_validation.Delimitation\",\n", - " inputs={\n", - " \"model\": vm_generator_k5_gpt4o,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.prompt_validation.NegativeInstruction\",\n", - " inputs={\n", - " \"model\": vm_generator_k5_gpt4o,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.prompt_validation.Specificity\",\n", - " inputs={\n", - " \"model\": vm_generator_k5_gpt4o,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Setup RAG Pipeline Model\n", - "\n", - "Now that we have all of our individual \"component\" models setup and initialized we need some way to put them all together in a single \"pipeline\". We can use the `PipelineModel` class to do this. This ValidMind model type simply wraps any number of other ValidMind models and runs them in sequence. We can use a pipe(`|`) operator - in Python this is normally an `or` operator but we have overloaded it for easy pipeline creation - to chain together our models. We can then initialize this pipeline model and assign predictions to it just like any other model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_rag_k5_gpt35_model = vm.init_model(vm_retriever_k5 | vm_generator_k5_gpt35, input_id=\"rag_k5_gpt35_model\")\n", - "vm_rag_k10_gpt35_model = vm.init_model(vm_retriever_k10 | vm_generator_k10_gpt35, input_id=\"rag_k10_gpt35_model\")\n", - "vm_rag_k5_gpt4o_model = vm.init_model(vm_retriever_k5 | vm_generator_k5_gpt4o, input_id=\"rag_k5_gpt4o_model\")\n", - "vm_rag_k10_gpt4o_model = vm.init_model(vm_retriever_k10 | vm_generator_k10_gpt4o, input_id=\"rag_k10_gpt4o_model\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can `assign_predictions` to the pipeline model just like we did with the individual models. This will run the pipeline on the test set and store the results in the test set for later use." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_test_ds.assign_predictions(model=vm_rag_k5_gpt35_model)\n", - "vm_test_ds.assign_predictions(model=vm_rag_k10_gpt35_model)\n", - "vm_test_ds.assign_predictions(model=vm_rag_k5_gpt4o_model)\n", - "vm_test_ds.assign_predictions(model=vm_rag_k10_gpt4o_model)\n", - "print(vm_test_ds)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_test_ds._df.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Run tests" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## RAGAS evaluation\n", - "\n", - "Let's go ahead and run some of our new RAG tests against our model...\n", - "\n", - "> Note: these tests are still being developed and are not yet in a stable state. We are using advanced tests here that use LLM-as-Judge and other strategies to assess things like the relevancy of the retrieved context to the input question and the correctness of the generated answer when compared to the ground truth. There is more to come in this area so stay tuned!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import warnings\n", - "\n", - "warnings.filterwarnings(\"ignore\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Semantic Similarity\n", - "\n", - "The concept of Answer Semantic Similarity pertains to the assessment of the semantic resemblance between the generated answer and the ground truth. This evaluation is based on the ground truth and the answer, with values falling within the range of 0 to 1. A higher score signifies a better alignment between the generated answer and the ground truth.\n", - "\n", - "Measuring the semantic similarity between answers can offer valuable insights into the quality of the generated response. This evaluation utilizes a cross-encoder model to calculate the semantic similarity score." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.SemanticSimilarity\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " param_grid={\n", - " \"response_column\": [\"rag_k5_gpt35_model_prediction\", \"rag_k5_gpt4o_model_prediction\"],\n", - " \"reference_column\": [\"ground_truth\"],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Context Entity Recall\n", - "\n", - "This test gives the measure of recall of the retrieved context, based on the number of entities present in both ground_truths and contexts relative to the number of entities present in the ground_truths alone. Simply put, it is a measure of what fraction of entities are recalled from ground_truths. This test is useful in fact-based use cases like tourism help desk, historical QA, etc. This test can help evaluate the retrieval mechanism for entities, based on comparison with entities present in ground_truths, because in cases where entities matter, we need the contexts which cover them." 
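 - "\n", - "To make the idea concrete, here is a minimal, illustrative sketch of entity-based recall using plain Python sets. The entity names below are made up for illustration, and this is not the actual RAGAS implementation, which extracts entities with an LLM:\n", - "\n", - "```python\n", - "# Toy illustration of context entity recall (not the RAGAS implementation)\n", - "ground_truth_entities = {\"ValidMind\", \"SOC 2\", \"2021\"}  # entities in the ground truth answer\n", - "context_entities = {\"ValidMind\", \"SOC 2\", \"ISO 27001\"}  # entities in the retrieved contexts\n", - "\n", - "recall = len(ground_truth_entities & context_entities) / len(ground_truth_entities)\n", - "print(recall)  # 2 of 3 ground truth entities are covered, so the score is about 0.67\n", - "```"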
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.ContextEntityRecall\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " param_grid={\n", - " \"reference_column\": [\"ground_truth\"],\n", - " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\", \"retrieval_k10_model_prediction\"],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Context Precision\n", - "\n", - "Context Precision is a test that evaluates whether all of the ground-truth relevant items present in the contexts are ranked higher or not. Ideally all the relevant chunks must appear at the top ranks. This test is computed using the question, ground_truth and the contexts, with values ranging between 0 and 1, where higher scores indicate better precision." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.ContextPrecision\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " param_grid={\n", - " \"user_input_column\": [\"question\"],\n", - " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\", \"retrieval_k10_model_prediction\"],\n", - " \"reference_column\": [\"ground_truth\"],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Context Precision Without Reference\n", - "\n", - "This test evaluates whether retrieved contexts align well with the expected response for a given user input, without requiring a ground-truth reference. This test assesses the relevance of each retrieved context chunk by comparing it directly to the response." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.ContextPrecisionWithoutReference\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " param_grid=[\n", - " {\"user_input_column\": \"question\",\n", - " \"retrieved_contexts_column\": \"retrieval_k5_model_prediction\",\n", - " \"response_column\": \"rag_k5_gpt4o_model_prediction\"\n", - " },\n", - " {\"user_input_column\": \"question\",\n", - " \"retrieved_contexts_column\": \"retrieval_k10_model_prediction\",\n", - " \"response_column\": \"rag_k10_gpt4o_model_prediction\"\n", - " },\n", - " ],\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.ContextPrecisionWithoutReference\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " param_grid={\n", - " \"user_input_column\": [\"question\"],\n", - " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\"],\n", - " \"response_column\": [\"rag_k5_gpt35_model_prediction\", \"rag_k5_gpt4o_model_prediction\"],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Faithfulness\n", - "\n", - "This measures the factual consistency of the generated answer against the given context. It is calculated from answer and retrieved context. The answer is scaled to (0,1) range. Higher the better.\n", - "\n", - "The generated answer is regarded as faithful if all the claims that are made in the answer can be inferred from the given context. 
To calculate this, a set of claims from the generated answer is first identified. Each of these claims is then cross-checked against the given context to determine whether it can be inferred from that context." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.Faithfulness\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " param_grid={\n", - " \"user_input_column\": [\"question\"],\n", - " \"response_column\": [\"rag_k5_gpt35_model_prediction\", \"rag_k5_gpt4o_model_prediction\"],\n", - " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\"],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Response Relevancy\n", - "\n", - "The Response Relevancy test focuses on assessing how pertinent the generated answer is to the given prompt. A lower score is assigned to answers that are incomplete or contain redundant information, and higher scores indicate better relevancy. This test is computed using the question, the context and the answer.\n", - "\n", - "The Response Relevancy is defined as the mean cosine similarity of the original question to a number of artificial questions, which were generated (reverse engineered) based on the answer.\n", - "\n", - "Note that even though in practice the score will range between 0 and 1 most of the time, this is not mathematically guaranteed, due to the nature of the cosine similarity ranging from -1 to 1.\n", - "\n", - "> Note: This is a reference-free test. If you’re looking to compare the ground truth answer with the generated answer, refer to Answer Correctness.\n", - "\n", - "An answer is deemed relevant when it directly and appropriately addresses the original question. Importantly, our assessment of answer relevance does not consider factuality but instead penalizes cases where the answer lacks completeness or contains redundant details. To calculate this score, the LLM is prompted to generate an appropriate question for the generated answer multiple times, and the mean cosine similarity between these generated questions and the original question is measured. The underlying idea is that if the generated answer accurately addresses the initial question, the LLM should be able to generate questions from the answer that align with the original question." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.ResponseRelevancy\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " param_grid={\n", - " \"user_input_column\": [\"question\"],\n", - " \"response_column\": [\"rag_k5_gpt35_model_prediction\", \"rag_k5_gpt4o_model_prediction\"],\n", - " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\"],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Context Recall\n", - "\n", - "Context recall measures the extent to which the retrieved context aligns with the annotated answer, treated as the ground truth. It is computed based on the ground truth and the retrieved context, and the values range between 0 and 1, with higher values indicating better performance.\n", - "\n", - "To estimate context recall from the ground truth answer, each sentence in the ground truth answer is analyzed to determine whether it can be attributed to the retrieved context or not. 
In an ideal scenario, all sentences in the ground truth answer should be attributable to the retrieved context." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.ContextRecall\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " param_grid={\n", - " \"user_input_column\": [\"question\"],\n", - " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\", \"retrieval_k10_model_prediction\"],\n", - " \"reference_column\": [\"ground_truth\"],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Answer Correctness\n", - "\n", - "The assessment of Answer Correctness involves gauging the accuracy of the generated answer when compared to the ground truth. This evaluation relies on the ground truth and the answer, with scores ranging from 0 to 1. A higher score indicates a closer alignment between the generated answer and the ground truth, signifying better correctness.\n", - "\n", - "Answer correctness encompasses two critical aspects: semantic similarity between the generated answer and the ground truth, as well as factual similarity. These aspects are combined using a weighted scheme to formulate the answer correctness score.\n", - "\n", - "Factual correctness quantifies the factual overlap between the generated answer and the ground truth answer. This is done using the concepts of:\n", - "\n", - "- TP (True Positive): Facts or statements that are present in both the ground truth and the generated answer.\n", - "- FP (False Positive): Facts or statements that are present in the generated answer but not in the ground truth.\n", - "- FN (False Negative): Facts or statements that are present in the ground truth but not in the generated answer." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.AnswerCorrectness\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " param_grid={\n", - " \"user_input_column\": [\"question\"],\n", - " \"response_column\": [\"rag_k5_gpt35_model_prediction\", \"rag_k5_gpt4o_model_prediction\"],\n", - " \"reference_column\": [\"ground_truth\"],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Aspect Critic\n", - "\n", - "This is designed to assess submissions based on predefined aspects such as harmlessness and correctness. Additionally, users have the flexibility to define their own aspects for evaluating submissions according to their specific criteria. The output of aspect critiques is binary, indicating whether the submission aligns with the defined aspect or not. This evaluation is performed using the ‘answer’ as input.\n", - "\n", - "Critiques within the LLM evaluators evaluate submissions based on the provided aspect. Ragas Critiques offers a range of predefined aspects like correctness, harmfulness, etc. Users can also define their own aspects for evaluating submissions based on their specific criteria. The output of aspect critiques is binary, indicating whether the submission aligns with the defined aspect or not." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.AspectCritic\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " param_grid={\n", - " \"user_input_column\": [\"question\"],\n", - " \"response_column\": [\"rag_k5_gpt35_model_prediction\", \"rag_k5_gpt4o_model_prediction\"],\n", - " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\"],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Noise Sensitivity\n", - "\n", - "This test is designed to evaluate the robustness of the RAG pipeline model against noise in the retrieved context. It works by checking how well the \"claims\" in the generated answer match up with the \"claims\" in the ground truth answer. If the generated answer contains \"claims\" from the contexts that the ground truth answer does not contain, those claims are considered incorrect. The score for each answer is the number of incorrect claims divided by the total number of claims. This *can* be interpreted as a measure of how sensitive the LLM is to \"noise\" in the context where \"noise\" is information that is relevant but should not be included in the answer since the ground truth answer does not contain it." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.NoiseSensitivity\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " param_grid={\n", - " \"user_input_column\": [\"question\"],\n", - " \"response_column\": [\"rag_k5_gpt35_model_prediction\", \"rag_k5_gpt4o_model_prediction\"],\n", - " \"reference_column\": [\"ground_truth\"],\n", - " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\"],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Generation quality\n", - "\n", - "In this section, we evaluate the alignment and relevance of generated responses to reference outputs within our retrieval-augmented generation (RAG) application. We use metrics that assess various quality dimensions of the generated responses, including semantic similarity, structural alignment, and phrasing overlap. Semantic similarity metrics compare embeddings of generated and reference text to capture deeper contextual alignment, while overlap and alignment measures quantify how well the phrasing and structure of generated responses match the intended outputs." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Token Disparity\n", - "\n", - "This test assesses the difference in token counts between the reference texts (ground truth) and the answers generated by the RAG model. It helps evaluate how well the model's outputs align with the expected length and level of detail in the reference texts. A significant disparity in token counts could signal issues with generation quality, such as excessive verbosity or insufficient detail. Consistently low token counts in generated answers compared to references might suggest that the model’s outputs are incomplete or overly concise, missing important contextual information." 
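 - "\n", - "As a rough, self-contained illustration of what this test measures, the sketch below compares token counts of a made-up reference and generated answer using a simple whitespace split; the actual ValidMind test performs its own tokenization and plots the distributions for the whole dataset:\n", - "\n", - "```python\n", - "# Toy illustration of token disparity (the ValidMind test tokenizes and plots this per dataset)\n", - "reference = \"We encrypt all customer data at rest using AES-256 and in transit using TLS 1.2 or higher.\"\n", - "generated = \"Data is encrypted at rest and in transit.\"\n", - "\n", - "ref_tokens, gen_tokens = reference.split(), generated.split()\n", - "print(len(ref_tokens), len(gen_tokens))  # a much shorter generated answer may be missing detail\n", - "```"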
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.TokenDisparity\",\n", - " input_grid={\n", - " \"dataset\": [vm_test_ds],\n", - " \"model\": [vm_rag_k5_gpt35_model, vm_rag_k5_gpt4o_model],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### ROUGE Score\n", - "\n", - "This test evaluates the quality of answers generated by the RAG model by measuring overlaps in n-grams, word sequences, and word pairs between the model output and the reference (ground truth) text. ROUGE, short for Recall-Oriented Understudy for Gisting Evaluation, assesses both precision and recall, providing a balanced view of how well the generated response captures the reference content. ROUGE precision measures the proportion of n-grams in the generated text that match the reference, highlighting relevance and conciseness, while ROUGE recall assesses the proportion of reference n-grams present in the generated text, indicating completeness and thoroughness. \n", - "\n", - "Low precision scores might reveal that the generated text includes redundant or irrelevant information, while low recall scores suggest omissions of essential details from the reference. Consistently low ROUGE scores could indicate poor overall alignment with the ground truth, suggesting the model may be missing key content or failing to capture the intended meaning." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.RougeScore\",\n", - " input_grid={\n", - " \"dataset\": [vm_test_ds],\n", - " \"model\": [vm_rag_k5_gpt35_model, vm_rag_k5_gpt4o_model],\n", - " },\n", - " params={\n", - " \"metric\": \"rouge-1\",\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### BLEU Score\n", - "\n", - "The BLEU Score test evaluates the quality of answers generated by the RAG model by measuring n-gram overlap between the generated text and the reference (ground truth) text, with a specific focus on exact precision in phrasing. While ROUGE precision also assesses overlap, BLEU differs in two main ways: first, it applies a geometric average across multiple n-gram levels, capturing precise phrase alignment, and second, it includes a brevity penalty to prevent overly short outputs from inflating scores artificially. This added precision focus is valuable in RAG applications where strict adherence to reference language is essential, as BLEU emphasizes the match to exact phrasing. In contrast, ROUGE precision evaluates general content overlap without penalizing brevity, offering a broader sense of content alignment." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.BleuScore\",\n", - " input_grid={\n", - " \"dataset\": [vm_test_ds],\n", - " \"model\": [vm_rag_k5_gpt35_model, vm_rag_k5_gpt4o_model],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### BERT Score\n", - "\n", - "This test evaluates the quality of the RAG generated answers using BERT embeddings to measure precision, recall, and F1 scores based on semantic similarity, rather than exact n-gram matches as in BLEU and ROUGE. 
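 - "\n", - "To get a feel for the score outside of the ValidMind test, here is a small sentence-level sketch using NLTK (already a dependency of this notebook); the example sentences are made up, and the `BleuScore` test itself computes its results over the whole dataset:\n", - "\n", - "```python\n", - "# Toy sentence-level BLEU illustration; the ValidMind BleuScore test handles this per row\n", - "from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction\n", - "\n", - "reference = \"our platform encrypts all data at rest and in transit\".split()\n", - "candidate = \"all data is encrypted at rest and in transit\".split()\n", - "\n", - "# Smoothing avoids zero scores when a higher-order n-gram has no overlap in short sentences\n", - "score = sentence_bleu([reference], candidate, smoothing_function=SmoothingFunction().method1)\n", - "print(round(score, 3))  # closer to 1.0 means closer phrasing overlap with the reference\n", - "```"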
This approach captures contextual meaning, making it valuable when wording differs but the intended message closely aligns with the reference. In RAG applications, the BERT score is especially useful for ensuring that generated answers convey the reference text’s meaning, even if phrasing varies. Consistently low scores indicate a lack of semantic alignment, suggesting the model may miss or misrepresent key content. Low precision may reflect irrelevant or redundant details, while low recall can indicate omissions." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.BertScore\",\n", - " input_grid={\n", - " \"dataset\": [vm_test_ds],\n", - " \"model\": [vm_rag_k5_gpt35_model, vm_rag_k5_gpt4o_model],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### METEOR Score\n", - "\n", - "This test evaluates the quality of the generated answers by measuring alignment with the ground truth, emphasizing both accuracy and fluency. Unlike BLEU and ROUGE, which focus on n-gram matches, METEOR combines precision, recall, synonym matching, and word order, focusing on how well the generated text conveys meaning and reads naturally. This metric is especially useful for RAG applications where sentence structure and natural flow are crucial for clear communication. Lower scores may suggest alignment issues, indicating that the answers may lack fluency or key content. Discrepancies in word order or high fragmentation penalties can reveal problems with how the model constructs sentences, potentially affecting readability." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.MeteorScore\",\n", - " input_grid={\n", - " \"dataset\": [vm_test_ds],\n", - " \"model\": [vm_rag_k5_gpt35_model, vm_rag_k5_gpt4o_model],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Bias and Toxicity\n", - "\n", - "In this section, we use metrics like Toxicity Score and Regard Score to evaluate both the generated responses and the ground truth. These tests help us detect any harmful, offensive, or inappropriate language and evaluate the level of bias and neutrality, enabling us to assess and mitigate potential biases in both the model's responses and the original dataset." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Toxicity Score\n", - "\n", - "This test measures the level of harmful or offensive content in the generated answers. The test uses a preloaded toxicity detection tool from Hugging Face, which identifies language that may be inappropriate, aggressive, or derogatory. High toxicity scores indicate potentially toxic content, while consistently elevated scores across multiple outputs may signal underlying issues in the model’s generation process that require attention to prevent the spread of harmful language." 
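 - "\n", - "If you want to probe the underlying scoring directly, the sketch below uses the Hugging Face `evaluate` package's `toxicity` measurement; this assumes `evaluate` and its model dependencies are installed, and it is not necessarily the exact tool the ValidMind test loads under the hood:\n", - "\n", - "```python\n", - "# Illustrative only: score a few strings with Hugging Face's toxicity measurement\n", - "# (assumes `pip install evaluate` plus its model dependencies)\n", - "import evaluate\n", - "\n", - "toxicity = evaluate.load(\"toxicity\", module_type=\"measurement\")\n", - "results = toxicity.compute(predictions=[\n", - "    \"Thank you for your question, we are happy to help.\",\n", - "    \"That is a stupid question.\",\n", - "])\n", - "print(results[\"toxicity\"])  # one score per string; values closer to 1 indicate more toxic language\n", - "```"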
 - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ToxicityScore\",\n", - " input_grid={\n", - " \"dataset\": [vm_test_ds],\n", - " \"model\": [vm_rag_k5_gpt35_model, vm_rag_k5_gpt4o_model],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Regard Score\n", - "\n", - "This test evaluates the sentiment and perceived regard—categorized as positive, negative, neutral, or other—in answers generated by the RAG model. This is important for identifying any biases or sentiment tendencies in responses, ensuring that generated answers are balanced and appropriate for the context. The test uses a preloaded regard evaluation tool from Hugging Face to compute scores for each response. High skewness in regard scores, especially if the generated responses consistently diverge from expected sentiments in the reference texts, may reveal biases in the model’s generation, such as overly positive or negative tones where neutrality is expected." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.RegardScore\",\n", - " input_grid={\n", - " \"dataset\": [vm_test_ds],\n", - " \"model\": [vm_rag_k5_gpt35_model, vm_rag_k5_gpt4o_model],\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Conclusion\n", - "\n", - "In this notebook, we have seen how we can use LangChain and ValidMind together to build, evaluate, and document a simple RAG Model as it's developed. This is a great example of the interactive development experience that ValidMind is designed to support. We can quickly iterate on our model and document as we go... We have seen how ValidMind supports non-traditional \"models\" using a functional interface and how we can build pipelines of many models to support complex GenAI workflows.\n", - "\n", - "This is still a work in progress and we are actively developing new tests to support more advanced GenAI workflows. We are also keeping an eye on the most popular GenAI models and libraries to explore direct integrations. Stay tuned for more updates and new features in this area!" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Upgrade ValidMind\n", - "\n", - "
After installing ValidMind, you’ll want to periodically make sure you are on the latest version to access any new features and other enhancements.
\n", - "\n", - "Retrieve the information for the currently installed version of ValidMind:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip show validmind" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If the version returned is lower than the version indicated in our [production open-source code](https://github.com/validmind/validmind-library/blob/prod/validmind/__version__.py), restart your notebook and run:\n", - "\n", - "```bash\n", - "%pip install --upgrade validmind\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You may need to restart your kernel after running the upgrade package for changes to be applied." - ] - }, - { - "cell_type": "markdown", - "id": "copyright-09e315440ca84258abe1aaefaca3a3d0", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "***\n", - "\n", - "Copyright © 2023-2026 ValidMind Inc. All rights reserved.
\n", - "Refer to [LICENSE](https://github.com/validmind/validmind-library/blob/main/LICENSE) for details.
\n", - "SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "ValidMind Library", - "language": "python", - "name": "validmind" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.15" - } - }, - "nbformat": 4, - "nbformat_minor": 2 + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# RAG Model Benchmarking Demo\n", + "\n", + "In this notebook, we are going to implement a simple RAG Model for automating the process of answering RFP questions using GenAI. We will see how we can initialize an embedding model, a retrieval model and a generator model with LangChain components and use them within the ValidMind Library to run tests against them. We'll demonstrate how to set up multiple models for benchmarking at each stage of the RAG pipeline - specifically two embedding models, two retrieval models with different parameters, and two LLM models (GPT-3.5 and GPT-4o) - allowing for comparison of performance across different configurations. Finally, we will see how we can put them together in a Pipeline and run that to get e2e results and run tests against that." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "::: {.content-hidden when-format=\"html\"}\n", + "## Contents \n", + "- [About ValidMind](#toc1__) \n", + " - [Before you begin](#toc1_1__) \n", + " - [New to ValidMind?](#toc1_2__) \n", + " - [Key concepts](#toc1_3__) \n", + "- [Setting up](#toc2__) \n", + " - [Install the ValidMind Library](#toc2_1__) \n", + " - [Initialize the ValidMind Library](#toc2_2__) \n", + " - [Register sample model](#toc2_2_1__) \n", + " - [Apply documentation template](#toc2_2_2__) \n", + " - [Get your code snippet](#toc2_2_3__) \n", + "- [Read Open AI API Key](#toc3__) \n", + "- [Dataset Loader](#toc4__) \n", + "- [Data validation](#toc5__) \n", + " - [Duplicates](#toc5_1__) \n", + " - [Stop Words](#toc5_2__) \n", + " - [Punctuations](#toc5_3__) \n", + " - [Common Words](#toc5_4__) \n", + " - [Language Detection](#toc5_5__) \n", + " - [Toxicity Score](#toc5_6__) \n", + " - [Polarity and Subjectivity](#toc5_7__) \n", + " - [Sentiment](#toc5_8__) \n", + " - [Assign Predictions](#toc5_9__) \n", + " - [Run tests](#toc5_10__) \n", + " - [Generate embeddings for the Train Set](#toc5_11__) \n", + " - [Insert embeddings and questions into Vector DB](#toc5_12__) \n", + "- [Prompt Evaluation](#toc6__) \n", + "- [RAGAS evaluation](#toc7__) \n", + " - [Semantic Similarity](#toc7_1__) \n", + " - [Context Entity Recall](#toc7_2__) \n", + " - [Context Precision](#toc7_3__) \n", + " - [Context Precision Without Reference](#toc7_4__) \n", + " - [Faithfulness](#toc7_5__) \n", + " - [Response Relevancy](#toc7_6__) \n", + " - [Context Recall](#toc7_7__) \n", + " - [Answer Correctness](#toc7_8__) \n", + " - [Aspect Critic](#toc7_9__) \n", + " - [Noise Sensitivity](#toc7_10__) \n", + "- [Generation quality](#toc8__) \n", + " - [Token Disparity](#toc8_1__) \n", + " - [ROUGE Score](#toc8_2__) \n", + " - [BLEU Score](#toc8_3__) \n", + " - [BERT Score](#toc8_4__) \n", + " - [METEOR Score](#toc8_5__) \n", + "- [Bias and Toxicity](#toc9__) \n", + " - [Toxicity Score](#toc9_1__) \n", + " - [Regard Score](#toc9_2__) \n", + "- [Upgrade ValidMind](#toc10__) \n", + "\n", + ":::\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## 
About ValidMind\n", + "\n", + "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models.\n", + "\n", + "You use the ValidMind Library to automate documentation and validation tests, and then use the ValidMind Platform to collaborate on model documentation. Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators.\n", + "\n", + "\n", + "\n", + "### Before you begin\n", + "\n", + "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language. \n", + "\n", + "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html).\n", + "\n", + "\n", + "\n", + "### New to ValidMind?\n", + "\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", + "\n", + "
For access to all features available in this notebook, you'll need access to a ValidMind account.\n", + "

\n", + "Register with ValidMind
\n", + "\n", + "\n", + "\n", + "### Key concepts\n", + "\n", + "- **FunctionModels**: ValidMind offers support for creating `VMModel` instances from Python functions. This enables us to support any \"model\" by simply using the provided function as the model's `predict` method.\n", + "- **PipelineModels**: ValidMind models (`VMModel` instances) of any type can be piped together to create a model pipeline. This allows model components to be created and tested/documented independently, and then combined into a single model for end-to-end testing and documentation. We use the `|` operator to pipe models together.\n", + "- **RAG**: RAG stands for Retrieval Augmented Generation and refers to a wide range of GenAI applications where some form of retrieval is used to add context to the prompt so that the LLM that generates content can refer to it when creating its output. In this notebook, we are going to implement a simple RAG setup using LangChain components." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prerequisites\n", + "\n", + "Let's go ahead and install the `validmind` library if its not already installed... Then we can install the `qdrant-client` library for our vector store and `langchain` for everything else:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q \"validmind[llm]\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q qdrant-client langchain langchain-openai sentencepiece" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Setting up" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Install the ValidMind Library\n", + "\n", + "
Recommended Python versions\n", + "\n", + "Python 3.8 <= x <= 3.11
\n", + "\n", + "To install the library:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q validmind" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Initialize the ValidMind Library" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Register sample model\n", + "\n", + "Let's first register a sample model for use with this notebook:\n", + "\n", + "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", + "\n", + "2. In the left sidebar, navigate to **Inventory** and click **+ Register Model**.\n", + "\n", + "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", + "\n", + "4. Select your own name under the **MODEL OWNER** drop-down.\n", + "\n", + "5. Click **Register Model** to add the model to your inventory." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Apply documentation template\n", + "\n", + "Once you've registered your model, let's select a documentation template. A template predefines sections for your model documentation and provides a general outline to follow, making the documentation process much easier.\n", + "\n", + "1. In the left sidebar that appears for your model, click **Documents** and select **Documentation**.\n", + "\n", + "2. Under **TEMPLATE**, select `Gen AI RAG`.\n", + "\n", + "3. Click **Use Template** to apply the template." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Can't select this template?\n", + "\n", + "Your organization administrators may need to add it to your template library:\n", + "\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Get your code snippet\n", + "\n", + "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", + "\n", + "1. On the left sidebar that appears for your model, select **Getting Started** and click **Copy snippet to clipboard**.\n", + "2. Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load your model identifier credentials from an `.env` file\n", + "\n", + "%load_ext dotenv\n", + "%dotenv .env\n", + "\n", + "# Or replace with your code snippet\n", + "\n", + "import validmind as vm\n", + "\n", + "vm.init(\n", + " api_host = \"https://api.prod.validmind.ai/api/v1/tracking\",\n", + " api_key = \"...\",\n", + " api_secret = \"...\",\n", + " model = \"...\",\n", + " document=\"documentation\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Read Open AI API Key\n", + "\n", + "We will need to have an OpenAI API key to be able to use their `text-embedding-3-small` and `text-embedding-3-large` models for our embeddings, `gpt-3.5-turbo` and `gpt-4o` models for our generator and `gpt-4o` model for our LLM-as-Judge tests. If you don't have an OpenAI API key, you can get one by signing up at [OpenAI](https://platform.openai.com/signup). Then you can create a `.env` file in the root of your project and the following cell will load it from there. Alternatively, you can just uncomment the line below to directly set the key (not recommended for security reasons)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# load openai api key\n", + "import os\n", + "\n", + "import dotenv\n", + "import nltk\n", + "\n", + "dotenv.load_dotenv()\n", + "nltk.download('stopwords')\n", + "nltk.download('punkt_tab')\n", + "\n", + "# os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"\n", + "\n", + "if not \"OPENAI_API_KEY\" in os.environ:\n", + " raise ValueError(\"OPENAI_API_KEY is not set\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Dataset Loader\n", + "\n", + "Great, now that we have all of our dependencies installed, the ValidMind Library initialized and connected to our model and our OpenAI API key setup, we can go ahead and load our datasets. We will use the synthetic `RFP` dataset included with ValidMind for this notebook. This dataset contains a variety of RFP questions and ground truth answers that we can use both as the source where our Retriever will search for similar question-answer pairs as well as our test set for evaluating the performance of our RAG model. To do this, we just have to load it and call the preprocess function to get a split of the data into train and test sets." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Import the sample dataset from the library\n", + "from validmind.datasets.llm.rag import rfp\n", + "\n", + "raw_df = rfp.load_data()\n", + "train_df, test_df = rfp.preprocess(raw_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_train_ds = vm.init_dataset(\n", + " train_df,\n", + " text_column=\"question\",\n", + " target_column=\"ground_truth\",\n", + ")\n", + "\n", + "vm_test_ds = vm.init_dataset(\n", + " test_df,\n", + " text_column=\"question\",\n", + " target_column=\"ground_truth\",\n", + ")\n", + "\n", + "vm_test_ds.df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Data validation\n", + "\n", + "Now that we have loaded our dataset, we can go ahead and run some data validation tests right away to start assessing and documenting the quality of our data. Since we are using a text dataset, we can use ValidMind's built-in array of text data quality tests to check that things like number of duplicates, missing values, and other common text data issues are not present in our dataset. We can also run some tests to check the sentiment and toxicity of our data." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Duplicates\n", + "\n", + "First, let's check for duplicates in our dataset. We can use the `validmind.data_validation.Duplicates` test and pass our dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from validmind.tests import run_test\n", + "\n", + "run_test(\n", + " test_id=\"validmind.data_validation.Duplicates\",\n", + " inputs={\"dataset\": vm_train_ds},\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Stop Words\n", + "\n", + "Next, let's check for stop words in our dataset. We can use the `validmind.data_validation.StopWords` test and pass our dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " test_id=\"validmind.data_validation.nlp.StopWords\",\n", + " inputs={\n", + " \"dataset\": vm_train_ds,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Punctuations\n", + "\n", + "Next, let's check for punctuations in our dataset. We can use the `validmind.data_validation.Punctuations` test:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " test_id=\"validmind.data_validation.nlp.Punctuations\",\n", + " inputs={\n", + " \"dataset\": vm_train_ds,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Common Words\n", + "\n", + "Next, let's check for common words in our dataset. 
We can use the `validmind.data_validation.CommonWord` test:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " test_id=\"validmind.data_validation.nlp.CommonWords\",\n", + " inputs={\n", + " \"dataset\": vm_train_ds,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Language Detection\n", + "\n", + "For documentation purposes, we can detect and log the languages used in the dataset with the `validmind.data_validation.LanguageDetection` test:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " test_id=\"validmind.data_validation.nlp.LanguageDetection\",\n", + " inputs={\n", + " \"dataset\": vm_train_ds,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Toxicity Score\n", + "\n", + "Now, let's go ahead and run the `validmind.data_validation.nlp.Toxicity` test to compute a toxicity score for our dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.data_validation.nlp.Toxicity\",\n", + " inputs={\n", + " \"dataset\": vm_train_ds,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Polarity and Subjectivity\n", + "\n", + "We can also run the `validmind.data_validation.nlp.PolarityAndSubjectivity` test to compute the polarity and subjectivity of our dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.data_validation.nlp.PolarityAndSubjectivity\",\n", + " inputs={\n", + " \"dataset\": vm_train_ds,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Sentiment\n", + "\n", + "Finally, we can run the `validmind.data_validation.nlp.Sentiment` test to plot the sentiment of our dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.data_validation.nlp.Sentiment\",\n", + " inputs={\n", + " \"dataset\": vm_train_ds,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Embedding Model\n", + "\n", + "Now that we have our dataset loaded and have run some data validation tests to assess and document the quality of our data, we can go ahead and initialize our embedding model. We will use `text-embedding-3-small` and `text-embedding-3-large` models from OpenAI for this purpose wrapped in the `OpenAIEmbeddings` class from LangChain. This model will be used to \"embed\" our questions both for inserting the question-answer pairs from the \"train\" set into the vector store and for embedding the question from inputs when making predictions with our RAG model." 
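Before wiring these clients into ValidMind models, it can help to see what a single embedding call returns (a quick sketch; it assumes `OPENAI_API_KEY` is set, and the vector length mentioned is the published dimensionality of `text-embedding-3-small`):

```python
# Sketch: what the raw embedding client returns for one question
from langchain_openai import OpenAIEmbeddings

client = OpenAIEmbeddings(model="text-embedding-3-small")
vector = client.embed_query("What is your data retention policy?")
print(type(vector), len(vector))  # a plain list of floats, e.g. 1536 values for this model
```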
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_openai import OpenAIEmbeddings\n", + "\n", + "embedding_small_client = OpenAIEmbeddings(model=\"text-embedding-3-small\")\n", + "\n", + "\n", + "def embed_small(input):\n", + " \"\"\"Returns a text embedding for the given text\"\"\"\n", + " return embedding_small_client.embed_query(input[\"question\"])\n", + "\n", + "\n", + "vm_embedder_small = vm.init_model(input_id=\"embedding_small_model\", predict_fn=embed_small)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "embedding_large_client = OpenAIEmbeddings(model=\"text-embedding-3-large\")\n", + "\n", + "\n", + "def embed_large(input):\n", + " \"\"\"Returns a text embedding for the given text\"\"\"\n", + " return embedding_large_client.embed_query(input[\"question\"])\n", + "\n", + "\n", + "vm_embedder_large = vm.init_model(input_id=\"embedding_large_model\", predict_fn=embed_large)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "What we have done here is to initialize the `OpenAIEmbeddings` class so it uses OpenAI's `text-embedding-3-small` and `text-embedding-3-large` models. We then created an `embed` function that takes in an `input` dictionary and uses the `embed_query` method of the embedding client to compute the embeddings of the `question`. We use an `embed` function since that is how ValidMind supports any custom model. We will use this strategy for the retrieval and generator models as well but you could also use, say, a HuggingFace model directly. See the documentation for more information on which model types are directly supported - [ValidMind Documentation](https://docs.validmind.ai/validmind/validmind.html)... Finally, we use the `init_model` function from the ValidMind Library to create a `VMModel` object that can be used in ValidMind tests. This also logs the model to our model documentation and any test that uses the model will be linked to the logged model and its metadata." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Assign Predictions\n", + "\n", + "To precompute the embeddings for our test set, we can call the `assign_predictions` method of our `vm_test_ds` object we created above. This will compute the embeddings for each question in the test set and store them in the a special prediction column of the test set thats linked to our `vm_embedder` model. This will allow us to use these embeddings later when we run tests against our embedding model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_ds.assign_predictions(vm_embedder_small)\n", + "vm_test_ds.assign_predictions(vm_embedder_large)\n", + "print(vm_test_ds)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Run tests\n", + "\n", + "Now that everything is setup for the embedding model, we can go ahead and run some tests to assess and document the quality of our embeddings. We will use the `validmind.model_validation.embeddings.*` tests to compute a variety of metrics against our model." 
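If you'd like to browse the available embedding tests before deciding which ones to run, the library exposes a searchable test catalogue (a sketch; the `filter` argument matches against the test ID string):

```python
# Sketch: list the built-in embedding tests by filtering on their test IDs
import validmind as vm

vm.tests.list_tests(filter="embeddings")
```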
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.embeddings.StabilityAnalysisRandomNoise\",\n", + " input_grid={\n", + " \"model\": [vm_embedder_small, vm_embedder_large],\n", + " \"dataset\": [vm_test_ds],\n", + " },\n", + " params={\n", + " \"probability\": 0.3,\n", + " \"mean_similarity_threshold\": 0.7,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.embeddings.StabilityAnalysisSynonyms\",\n", + " input_grid={\n", + " \"model\": [vm_embedder_small, vm_embedder_large],\n", + " \"dataset\": [vm_test_ds],\n", + " },\n", + " params={\n", + " \"probability\": 0.3,\n", + " \"mean_similarity_threshold\": 0.7,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.embeddings.StabilityAnalysisTranslation\",\n", + " input_grid={\n", + " \"model\": [vm_embedder_small, vm_embedder_large],\n", + " \"dataset\": [vm_test_ds],\n", + " },\n", + " params={\n", + " \"source_lang\": \"en\",\n", + " \"target_lang\": \"fr\",\n", + " \"mean_similarity_threshold\": 0.7,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.embeddings.CosineSimilarityHeatmap\",\n", + " input_grid={\n", + " \"model\": [vm_embedder_small, vm_embedder_large],\n", + " \"dataset\": [vm_test_ds],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.embeddings.CosineSimilarityDistribution\",\n", + " input_grid={\n", + " \"model\": [vm_embedder_small, vm_embedder_large],\n", + " \"dataset\": [vm_test_ds],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.embeddings.PCAComponentsPairwisePlots\",\n", + " input_grid={\n", + " \"model\": [vm_embedder_small, vm_embedder_large],\n", + " \"dataset\": [vm_test_ds],\n", + " },\n", + " params={\n", + " \"n_components\": 3,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Setup Vector Store\n", + "\n", + "Great, so now that we have assessed our embedding model and verified that it is performing well, we can go ahead and use it to compute embeddings for our question-answer pairs in the \"train\" set. We will then use these embeddings to insert the question-answer pairs into a vector store. We will use an in-memory `qdrant` vector database for demo purposes but any option would work just as well here. We will use the `QdrantClient` class from LangChain to interact with the vector store. This class will allow us to insert and search for embeddings in the vector store." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Generate embeddings for the Train Set\n", + "\n", + "We can use the same `assign_predictions` method from earlier except this time we will use the `vm_train_ds` object to compute the embeddings for the question-answer pairs in the \"train\" set." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_train_ds.assign_predictions(vm_embedder_small)\n", + "print(vm_train_ds)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Insert embeddings and questions into Vector DB\n", + "\n", + "Now that we have computed the embeddings for our question-answer pairs in the \"train\" set, we can go ahead and insert them into the vector store:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_community.vectorstores import Qdrant\n", + "from langchain_community.document_loaders import DataFrameLoader\n", + "\n", + "# load documents from dataframe\n", + "loader = DataFrameLoader(train_df, page_content_column=\"question\")\n", + "docs = loader.load()\n", + "\n", + "# setup vector datastore\n", + "qdrant = Qdrant.from_documents(\n", + " docs,\n", + " embedding_small_client,\n", + " location=\":memory:\", # Local mode with in-memory storage only\n", + " collection_name=\"rfp_rag_collection\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Retrieval Model\n", + "\n", + "Now that we have an embedding model and a vector database setup and loaded with our data, we need a Retrieval model that can search for similar question-answer pairs for a given input question. Once created, we can initialize this as a ValidMind model and `assign_predictions` to it just like our embedding model. In this example, we'll create two retrieval models with different `k` parameters (the number of documents retrieved) to benchmark and compare their performance. This approach allows us to evaluate how retrieval depth affects the overall system quality." 
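The next two cells define nearly identical `retrieve` functions that differ only in `k`. If you'd rather avoid the duplication, a small factory over the same `qdrant` store works just as well (a sketch of an equivalent setup):

```python
# Sketch: parameterize the retriever over k instead of duplicating the function
def make_retriever(k):
    def retrieve(input):
        contexts = []
        for document, score in qdrant.similarity_search_with_score(input["question"], k=k):
            contexts.append(f"Q: {document.page_content}\nA: {document.metadata['ground_truth']}\n")
        return contexts

    return retrieve


# Equivalent to the two cells below:
# vm_retriever_k5 = vm.init_model(input_id="retrieval_k5_model", predict_fn=make_retriever(5))
# vm_retriever_k10 = vm.init_model(input_id="retrieval_k10_model", predict_fn=make_retriever(10))
```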
+ ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "def retrieve(input):\n", + " contexts = []\n", + "\n", + " for result in qdrant.similarity_search_with_score(input[\"question\"], k=5):\n", + " document, score = result\n", + " context = f\"Q: {document.page_content}\\n\"\n", + " context += f\"A: {document.metadata['ground_truth']}\\n\"\n", + "\n", + " contexts.append(context)\n", + "\n", + " return contexts\n", + "\n", + "\n", + "vm_retriever_k5 = vm.init_model(input_id=\"retrieval_k5_model\", predict_fn=retrieve)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "def retrieve(input):\n", + " contexts = []\n", + "\n", + " for result in qdrant.similarity_search_with_score(input[\"question\"], k=10):\n", + " document, score = result\n", + " context = f\"Q: {document.page_content}\\n\"\n", + " context += f\"A: {document.metadata['ground_truth']}\\n\"\n", + "\n", + " contexts.append(context)\n", + "\n", + " return contexts\n", + "\n", + "\n", + "vm_retriever_k10 = vm.init_model(input_id=\"retrieval_k10_model\", predict_fn=retrieve)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_ds.assign_predictions(model=vm_retriever_k5)\n", + "vm_test_ds.assign_predictions(model=vm_retriever_k10)\n", + "print(vm_test_ds)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_ds._df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Generation Model\n", + "\n", + "As the final piece of this simple RAG pipeline, we can create and initialize a generation model that will use the retrieved context to generate an answer to the input question. We will use the `gpt-3.5-turbo` and `gpt-4o` models from OpenAI. Since we have two retrieval models (with different `k` values) and want to test two different LLMs, we'll create a total of four generator models - pairing each retrieval configuration with each LLM to comprehensively evaluate how both retrieval depth and model capability affect response quality." 
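One detail worth calling out: the `generate` functions below read the retrieved contexts from `input["retrieval_k5_model"]` or `input["retrieval_k10_model"]`, that is, from a key named after the corresponding retriever's `input_id`. When the models are piped together later, the retriever's output reaches the generator under that key, so a generator input row looks roughly like this (illustrative values only):

```python
# Illustrative shape of the input a generator sees when run after the k=5 retriever
example_input = {
    "question": "Do you support single sign-on?",
    "retrieval_k5_model": [  # filled from the retriever's output / prediction column
        "Q: Do you offer SSO integration?\nA: Yes, via SAML 2.0.\n",
        "Q: How is user authentication handled?\nA: ...\n",
    ],
}
```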
+ ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "from openai import OpenAI\n", + "\n", + "from validmind.models import Prompt\n", + "\n", + "\n", + "system_prompt = \"\"\"\n", + "You are an expert RFP AI assistant.\n", + "You are tasked with answering new RFP questions based on existing RFP questions and answers.\n", + "You will be provided with the existing RFP questions and answer pairs that are the most relevant to the new RFP question.\n", + "After that you will be provided with a new RFP question.\n", + "You will generate an answer and respond only with the answer.\n", + "Ignore your pre-existing knowledge and answer the question based on the provided context.\n", + "\"\"\".strip()\n", + "\n", + "openai_client = OpenAI()" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "def generate(input):\n", + " \n", + " response = openai_client.chat.completions.create(\n", + " model=\"gpt-3.5-turbo\",\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": system_prompt},\n", + " {\"role\": \"user\", \"content\": \"\\n\\n\".join(input[\"retrieval_k5_model\"])},\n", + " {\"role\": \"user\", \"content\": input[\"question\"]},\n", + " ],\n", + " )\n", + " \n", + " return response.choices[0].message.content\n", + "\n", + "\n", + "vm_generator_k5_gpt35 = vm.init_model(\n", + " input_id=\"generation_k5_gpt35_model\",\n", + " predict_fn=generate,\n", + " prompt=Prompt(template=system_prompt),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "def generate(input):\n", + " response = openai_client.chat.completions.create(\n", + " model=\"gpt-3.5-turbo\",\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": system_prompt},\n", + " {\"role\": \"user\", \"content\": \"\\n\\n\".join(input[\"retrieval_k10_model\"])},\n", + " {\"role\": \"user\", \"content\": input[\"question\"]},\n", + " ],\n", + " )\n", + "\n", + " return response.choices[0].message.content\n", + "\n", + "\n", + "vm_generator_k10_gpt35 = vm.init_model(\n", + " input_id=\"generation_k10_gpt35_model\",\n", + " predict_fn=generate,\n", + " prompt=Prompt(template=system_prompt),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "def generate(input):\n", + " \n", + " response = openai_client.chat.completions.create(\n", + " model=\"gpt-4o\",\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": system_prompt},\n", + " {\"role\": \"user\", \"content\": \"\\n\\n\".join(input[\"retrieval_k5_model\"])},\n", + " {\"role\": \"user\", \"content\": input[\"question\"]},\n", + " ],\n", + " )\n", + " \n", + " return response.choices[0].message.content\n", + "\n", + "\n", + "vm_generator_k5_gpt4o = vm.init_model(\n", + " input_id=\"generation_k5_gpt4o_model\",\n", + " predict_fn=generate,\n", + " prompt=Prompt(template=system_prompt),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def generate(input):\n", + " response = openai_client.chat.completions.create(\n", + " model=\"gpt-4o\",\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": system_prompt},\n", + " {\"role\": \"user\", \"content\": \"\\n\\n\".join(input[\"retrieval_k10_model\"])},\n", + " {\"role\": \"user\", \"content\": input[\"question\"]},\n", + " ],\n", + " )\n", + "\n", + " return 
response.choices[0].message.content\n", + "\n", + "\n", + "vm_generator_k10_gpt4o = vm.init_model(\n", + " input_id=\"generation_k10_gpt4o_model\",\n", + " predict_fn=generate,\n", + " prompt=Prompt(template=system_prompt),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's quickly test a couple of them out:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "vm_generator_k5_gpt35.predict(\n", + " pd.DataFrame(\n", + " {\"retrieval_k5_model\": [[\"My name is anil\"]], \"question\": [\"what is my name\"]}\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_generator_k5_gpt4o.predict(\n", + " pd.DataFrame(\n", + " {\"retrieval_k5_model\": [[\"My name is anil\"]], \"question\": [\"what is my name\"]}\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Prompt Evaluation\n", + "\n", + "Now that we have our generator models initialized, we can run some LLM-as-Judge tests to evaluate the system prompt. This will allow us to get an initial sense of how well the prompt meets a few best practices for prompt engineering. These tests use an LLM to rate the prompt on a scale of 1-10 against the following criteria:\n", + "\n", + "- **Exemplar Bias**: When using multi-shot prompting, does the prompt contain an unbiased distribution of examples?\n", + "- **Delimitation**: When using complex prompts containing examples, contextual information, or other elements, is the prompt formatted in such a way that each element is clearly separated?\n", + "- **Clarity**: How clearly the prompt states the task.\n", + "- **Conciseness**: How succinctly the prompt states the task.\n", + "- **Instruction Framing**: Whether the prompt contains negative instructions.\n", + "- **Specificity**: How specifically the prompt defines the task."
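The next cells run each of these checks one at a time against the GPT-4o generator's prompt. An equivalent, more compact form is to loop over the test IDs (a sketch using the same tests shown below):

```python
# Sketch: run all six prompt-validation checks in a loop
prompt_tests = ["Bias", "Clarity", "Conciseness", "Delimitation", "NegativeInstruction", "Specificity"]

for name in prompt_tests:
    run_test(
        f"validmind.prompt_validation.{name}",
        inputs={"model": vm_generator_k5_gpt4o},
    ).log()
```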
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.prompt_validation.Bias\",\n", + " inputs={\n", + " \"model\": vm_generator_k5_gpt4o,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.prompt_validation.Clarity\",\n", + " inputs={\n", + " \"model\": vm_generator_k5_gpt4o,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.prompt_validation.Conciseness\",\n", + " inputs={\n", + " \"model\": vm_generator_k5_gpt4o,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.prompt_validation.Delimitation\",\n", + " inputs={\n", + " \"model\": vm_generator_k5_gpt4o,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.prompt_validation.NegativeInstruction\",\n", + " inputs={\n", + " \"model\": vm_generator_k5_gpt4o,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.prompt_validation.Specificity\",\n", + " inputs={\n", + " \"model\": vm_generator_k5_gpt4o,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Setup RAG Pipeline Model\n", + "\n", + "Now that we have all of our individual \"component\" models setup and initialized we need some way to put them all together in a single \"pipeline\". We can use the `PipelineModel` class to do this. This ValidMind model type simply wraps any number of other ValidMind models and runs them in sequence. We can use a pipe(`|`) operator - in Python this is normally an `or` operator but we have overloaded it for easy pipeline creation - to chain together our models. We can then initialize this pipeline model and assign predictions to it just like any other model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_rag_k5_gpt35_model = vm.init_model(vm_retriever_k5 | vm_generator_k5_gpt35, input_id=\"rag_k5_gpt35_model\")\n", + "vm_rag_k10_gpt35_model = vm.init_model(vm_retriever_k10 | vm_generator_k10_gpt35, input_id=\"rag_k10_gpt35_model\")\n", + "vm_rag_k5_gpt4o_model = vm.init_model(vm_retriever_k5 | vm_generator_k5_gpt4o, input_id=\"rag_k5_gpt4o_model\")\n", + "vm_rag_k10_gpt4o_model = vm.init_model(vm_retriever_k10 | vm_generator_k10_gpt4o, input_id=\"rag_k10_gpt4o_model\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can `assign_predictions` to the pipeline model just like we did with the individual models. This will run the pipeline on the test set and store the results in the test set for later use." 
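Before assigning predictions over the whole test set, it can be useful to spot-check a single question end to end (a sketch; it assumes the pipeline models expose the same `predict` interface used for the component models earlier in this notebook):

```python
# Sketch: push one question through a full RAG pipeline
import pandas as pd

sample = pd.DataFrame({"question": ["What certifications does your platform hold?"]})
print(vm_rag_k5_gpt4o_model.predict(sample))
```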
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_ds.assign_predictions(model=vm_rag_k5_gpt35_model)\n", + "vm_test_ds.assign_predictions(model=vm_rag_k10_gpt35_model)\n", + "vm_test_ds.assign_predictions(model=vm_rag_k5_gpt4o_model)\n", + "vm_test_ds.assign_predictions(model=vm_rag_k10_gpt4o_model)\n", + "print(vm_test_ds)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_ds._df.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Run tests" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## RAGAS evaluation\n", + "\n", + "Let's go ahead and run some of our new RAG tests against our model...\n", + "\n", + "> Note: these tests are still being developed and are not yet in a stable state. We are using advanced tests here that use LLM-as-Judge and other strategies to assess things like the relevancy of the retrieved context to the input question and the correctness of the generated answer when compared to the ground truth. There is more to come in this area so stay tuned!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Semantic Similarity\n", + "\n", + "The concept of Answer Semantic Similarity pertains to the assessment of the semantic resemblance between the generated answer and the ground truth. This evaluation is based on the ground truth and the answer, with values falling within the range of 0 to 1. A higher score signifies a better alignment between the generated answer and the ground truth.\n", + "\n", + "Measuring the semantic similarity between answers can offer valuable insights into the quality of the generated response. This evaluation utilizes a cross-encoder model to calculate the semantic similarity score." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.SemanticSimilarity\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " param_grid={\n", + " \"response_column\": [\"rag_k5_gpt35_model_prediction\", \"rag_k5_gpt4o_model_prediction\"],\n", + " \"reference_column\": [\"ground_truth\"],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Context Entity Recall\n", + "\n", + "This test gives the measure of recall of the retrieved context, based on the number of entities present in both ground_truths and contexts relative to the number of entities present in the ground_truths alone. Simply put, it is a measure of what fraction of entities are recalled from ground_truths. This test is useful in fact-based use cases like tourism help desk, historical QA, etc. This test can help evaluate the retrieval mechanism for entities, based on comparison with entities present in ground_truths, because in cases where entities matter, we need the contexts which cover them." 
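Put differently, the score is the fraction of reference entities that also appear in the retrieved contexts. A toy illustration of the quantity being computed (the actual test uses an LLM to extract the entities):

```python
# Toy illustration of context entity recall; entity extraction is reduced to hand-written sets
reference_entities = {"ISO 27001", "SOC 2", "2019"}  # entities in the ground truth answer
context_entities = {"ISO 27001", "SOC 2", "AWS"}     # entities found in the retrieved contexts

recall = len(reference_entities & context_entities) / len(reference_entities)
print(recall)  # 2 of the 3 reference entities are recalled -> roughly 0.67
```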
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.ContextEntityRecall\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " param_grid={\n", + " \"reference_column\": [\"ground_truth\"],\n", + " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\", \"retrieval_k10_model_prediction\"],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Context Precision\n", + "\n", + "Context Precision is a test that evaluates whether all of the ground-truth relevant items present in the contexts are ranked higher or not. Ideally all the relevant chunks must appear at the top ranks. This test is computed using the question, ground_truth and the contexts, with values ranging between 0 and 1, where higher scores indicate better precision." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.ContextPrecision\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " param_grid={\n", + " \"user_input_column\": [\"question\"],\n", + " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\", \"retrieval_k10_model_prediction\"],\n", + " \"reference_column\": [\"ground_truth\"],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Context Precision Without Reference\n", + "\n", + "This test evaluates whether retrieved contexts align well with the expected response for a given user input, without requiring a ground-truth reference. This test assesses the relevance of each retrieved context chunk by comparing it directly to the response." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.ContextPrecisionWithoutReference\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " param_grid=[\n", + " {\"user_input_column\": \"question\",\n", + " \"retrieved_contexts_column\": \"retrieval_k5_model_prediction\",\n", + " \"response_column\": \"rag_k5_gpt4o_model_prediction\"\n", + " },\n", + " {\"user_input_column\": \"question\",\n", + " \"retrieved_contexts_column\": \"retrieval_k10_model_prediction\",\n", + " \"response_column\": \"rag_k10_gpt4o_model_prediction\"\n", + " },\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.ContextPrecisionWithoutReference\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " param_grid={\n", + " \"user_input_column\": [\"question\"],\n", + " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\"],\n", + " \"response_column\": [\"rag_k5_gpt35_model_prediction\", \"rag_k5_gpt4o_model_prediction\"],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Faithfulness\n", + "\n", + "This measures the factual consistency of the generated answer against the given context. It is calculated from answer and retrieved context. The answer is scaled to (0,1) range. Higher the better.\n", + "\n", + "The generated answer is regarded as faithful if all the claims that are made in the answer can be inferred from the given context. 
To calculate this, a set of claims is first identified in the generated answer. Each of these claims is then cross-checked against the given context to determine whether it can be inferred from that context." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.Faithfulness\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " param_grid={\n", + " \"user_input_column\": [\"question\"],\n", + " \"response_column\": [\"rag_k5_gpt35_model_prediction\", \"rag_k5_gpt4o_model_prediction\"],\n", + " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\"],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Response Relevancy\n", + "\n", + "The Response Relevancy test focuses on assessing how pertinent the generated answer is to the given prompt. A lower score is assigned to answers that are incomplete or contain redundant information, and higher scores indicate better relevancy. This test is computed using the question, the context, and the answer.\n", + "\n", + "Response Relevancy is defined as the mean cosine similarity between the original question and a number of artificial questions that were generated (reverse engineered) from the answer.\n", + "\n", + "Note that even though in practice the score will range between 0 and 1 most of the time, this is not mathematically guaranteed, because cosine similarity ranges from -1 to 1.\n", + "\n", + "> Note: This is a reference-free test. If you're looking to compare the ground truth answer with the generated answer, refer to Answer Correctness.\n", + "\n", + "An answer is deemed relevant when it directly and appropriately addresses the original question. Importantly, our assessment of answer relevance does not consider factuality but instead penalizes cases where the answer lacks completeness or contains redundant details. To calculate this score, the LLM is prompted to generate an appropriate question for the generated answer multiple times, and the mean cosine similarity between these generated questions and the original question is measured. The underlying idea is that if the generated answer accurately addresses the initial question, the LLM should be able to generate questions from the answer that align with the original question." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.ResponseRelevancy\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " param_grid={\n", + " \"user_input_column\": [\"question\"],\n", + " \"response_column\": [\"rag_k5_gpt35_model_prediction\", \"rag_k5_gpt4o_model_prediction\"],\n", + " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\"],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Context Recall\n", + "\n", + "Context recall measures the extent to which the retrieved context aligns with the annotated answer, treated as the ground truth. It is computed based on the ground truth and the retrieved context, and the values range between 0 and 1, with higher values indicating better performance.\n", + "\n", + "To estimate context recall from the ground truth answer, each sentence in the ground truth answer is analyzed to determine whether it can be attributed to the retrieved context or not. 
In an ideal scenario, all sentences in the ground truth answer should be attributable to the retrieved context." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.ContextRecall\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " param_grid={\n", + " \"user_input_column\": [\"question\"],\n", + " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\", \"retrieval_k10_model_prediction\"],\n", + " \"reference_column\": [\"ground_truth\"],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Answer Correctness\n", + "\n", + "The assessment of Answer Correctness involves gauging the accuracy of the generated answer when compared to the ground truth. This evaluation relies on the ground truth and the answer, with scores ranging from 0 to 1. A higher score indicates a closer alignment between the generated answer and the ground truth, signifying better correctness.\n", + "\n", + "Answer correctness encompasses two critical aspects: semantic similarity between the generated answer and the ground truth, as well as factual similarity. These aspects are combined using a weighted scheme to formulate the answer correctness score.\n", + "\n", + "Factual correctness quantifies the factual overlap between the generated answer and the ground truth answer. This is done using the concepts of:\n", + "\n", + "- TP (True Positive): Facts or statements that are present in both the ground truth and the generated answer.\n", + "- FP (False Positive): Facts or statements that are present in the generated answer but not in the ground truth.\n", + "- FN (False Negative): Facts or statements that are present in the ground truth but not in the generated answer." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.AnswerCorrectness\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " param_grid={\n", + " \"user_input_column\": [\"question\"],\n", + " \"response_column\": [\"rag_k5_gpt35_model_prediction\", \"rag_k5_gpt4o_model_prediction\"],\n", + " \"reference_column\": [\"ground_truth\"],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Aspect Critic\n", + "\n", + "This is designed to assess submissions based on predefined aspects such as harmlessness and correctness. Additionally, users have the flexibility to define their own aspects for evaluating submissions according to their specific criteria. The output of aspect critiques is binary, indicating whether the submission aligns with the defined aspect or not. This evaluation is performed using the ‘answer’ as input.\n", + "\n", + "Critiques within the LLM evaluators evaluate submissions based on the provided aspect. Ragas Critiques offers a range of predefined aspects like correctness, harmfulness, etc. Users can also define their own aspects for evaluating submissions based on their specific criteria. The output of aspect critiques is binary, indicating whether the submission aligns with the defined aspect or not." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.AspectCritic\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " param_grid={\n", + " \"user_input_column\": [\"question\"],\n", + " \"response_column\": [\"rag_k5_gpt35_model_prediction\", \"rag_k5_gpt4o_model_prediction\"],\n", + " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\"],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Noise Sensitivity\n", + "\n", + "This test is designed to evaluate the robustness of the RAG pipeline model against noise in the retrieved context. It works by checking how well the \"claims\" in the generated answer match up with the \"claims\" in the ground truth answer. If the generated answer contains \"claims\" from the contexts that the ground truth answer does not contain, those claims are considered incorrect. The score for each answer is the number of incorrect claims divided by the total number of claims. This *can* be interpreted as a measure of how sensitive the LLM is to \"noise\" in the context where \"noise\" is information that is relevant but should not be included in the answer since the ground truth answer does not contain it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.NoiseSensitivity\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " param_grid={\n", + " \"user_input_column\": [\"question\"],\n", + " \"response_column\": [\"rag_k5_gpt35_model_prediction\", \"rag_k5_gpt4o_model_prediction\"],\n", + " \"reference_column\": [\"ground_truth\"],\n", + " \"retrieved_contexts_column\": [\"retrieval_k5_model_prediction\"],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Generation quality\n", + "\n", + "In this section, we evaluate the alignment and relevance of generated responses to reference outputs within our retrieval-augmented generation (RAG) application. We use metrics that assess various quality dimensions of the generated responses, including semantic similarity, structural alignment, and phrasing overlap. Semantic similarity metrics compare embeddings of generated and reference text to capture deeper contextual alignment, while overlap and alignment measures quantify how well the phrasing and structure of generated responses match the intended outputs." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Token Disparity\n", + "\n", + "This test assesses the difference in token counts between the reference texts (ground truth) and the answers generated by the RAG model. It helps evaluate how well the model's outputs align with the expected length and level of detail in the reference texts. A significant disparity in token counts could signal issues with generation quality, such as excessive verbosity or insufficient detail. Consistently low token counts in generated answers compared to references might suggest that the model’s outputs are incomplete or overly concise, missing important contextual information." 
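A rough illustration of the quantity being compared (the test itself handles tokenization and plots the distributions across the whole dataset):

```python
# Toy illustration: token counts of a reference answer vs. a generated answer
reference = "We hold ISO 27001 and SOC 2 Type II certifications, which are renewed annually."
generated = "Yes, we are certified."

print(len(reference.split()), "reference tokens vs.", len(generated.split()), "generated tokens")
# A much shorter generated answer can signal missing detail; a much longer one, verbosity.
```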
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.TokenDisparity\",\n", + " input_grid={\n", + " \"dataset\": [vm_test_ds],\n", + " \"model\": [vm_rag_k5_gpt35_model, vm_rag_k5_gpt4o_model],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### ROUGE Score\n", + "\n", + "This test evaluates the quality of answers generated by the RAG model by measuring overlaps in n-grams, word sequences, and word pairs between the model output and the reference (ground truth) text. ROUGE, short for Recall-Oriented Understudy for Gisting Evaluation, assesses both precision and recall, providing a balanced view of how well the generated response captures the reference content. ROUGE precision measures the proportion of n-grams in the generated text that match the reference, highlighting relevance and conciseness, while ROUGE recall assesses the proportion of reference n-grams present in the generated text, indicating completeness and thoroughness. \n", + "\n", + "Low precision scores might reveal that the generated text includes redundant or irrelevant information, while low recall scores suggest omissions of essential details from the reference. Consistently low ROUGE scores could indicate poor overall alignment with the ground truth, suggesting the model may be missing key content or failing to capture the intended meaning." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.RougeScore\",\n", + " input_grid={\n", + " \"dataset\": [vm_test_ds],\n", + " \"model\": [vm_rag_k5_gpt35_model, vm_rag_k5_gpt4o_model],\n", + " },\n", + " params={\n", + " \"metric\": \"rouge-1\",\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### BLEU Score\n", + "\n", + "The BLEU Score test evaluates the quality of answers generated by the RAG model by measuring n-gram overlap between the generated text and the reference (ground truth) text, with a specific focus on exact precision in phrasing. While ROUGE precision also assesses overlap, BLEU differs in two main ways: first, it applies a geometric average across multiple n-gram levels, capturing precise phrase alignment, and second, it includes a brevity penalty to prevent overly short outputs from inflating scores artificially. This added precision focus is valuable in RAG applications where strict adherence to reference language is essential, as BLEU emphasizes the match to exact phrasing. In contrast, ROUGE precision evaluates general content overlap without penalizing brevity, offering a broader sense of content alignment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.BleuScore\",\n", + " input_grid={\n", + " \"dataset\": [vm_test_ds],\n", + " \"model\": [vm_rag_k5_gpt35_model, vm_rag_k5_gpt4o_model],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### BERT Score\n", + "\n", + "This test evaluates the quality of the RAG generated answers using BERT embeddings to measure precision, recall, and F1 scores based on semantic similarity, rather than exact n-gram matches as in BLEU and ROUGE. 
This approach captures contextual meaning, making it valuable when wording differs but the intended message closely aligns with the reference. In RAG applications, the BERT score is especially useful for ensuring that generated answers convey the reference text’s meaning, even if phrasing varies. Consistently low scores indicate a lack of semantic alignment, suggesting the model may miss or misrepresent key content. Low precision may reflect irrelevant or redundant details, while low recall can indicate omissions." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.BertScore\",\n", + " input_grid={\n", + " \"dataset\": [vm_test_ds],\n", + " \"model\": [vm_rag_k5_gpt35_model, vm_rag_k5_gpt4o_model],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### METEOR Score\n", + "\n", + "This test evaluates the quality of the generated answers by measuring alignment with the ground truth, emphasizing both accuracy and fluency. Unlike BLEU and ROUGE, which focus on n-gram matches, METEOR combines precision, recall, synonym matching, and word order, focusing at how well the generated text conveys meaning and reads naturally. This metric is especially useful for RAG applications where sentence structure and natural flow are crucial for clear communication. Lower scores may suggest alignment issues, indicating that the answers may lack fluency or key content. Discrepancies in word order or high fragmentation penalties can reveal problems with how the model constructs sentences, potentially affecting readability." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.MeteorScore\",\n", + " input_grid={\n", + " \"dataset\": [vm_test_ds],\n", + " \"model\": [vm_rag_k5_gpt35_model, vm_rag_k5_gpt4o_model],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Bias and Toxicity\n", + "\n", + "In this section, we use metrics like Toxicity Score and Regard Score to evaluate both the generated responses and the ground truth. These tests helps us detect any harmful, offensive, or inappropriate language and evaluate the level of bias and neutrality enabling us to assess and mitigate potential biases in both the model's responses and the original dataset." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Toxicity Score\n", + "\n", + "This test measures the level of harmful or offensive content in the generated answers. The test uses a preloaded toxicity detection tool from Hugging Face, which identifies language that may be inappropriate, aggressive, or derogatory. High toxicity scores indicate potentially toxic content, while consistently elevated scores across multiple outputs may signal underlying issues in the model’s generation process that require attention to prevent the spread of harmful language." 
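For a feel of what the underlying measurement looks like on its own, the Hugging Face `evaluate` package exposes a toxicity scorer (a sketch; it assumes `evaluate` and its model dependencies are installed, and is not necessarily the exact model the ValidMind test loads):

```python
# Sketch: standalone toxicity scoring with the Hugging Face evaluate package
import evaluate

toxicity = evaluate.load("toxicity")
scores = toxicity.compute(predictions=["Thank you for your question.", "That is a terrible idea."])
print(scores["toxicity"])  # one score per input; higher values indicate more toxic language
```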
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ToxicityScore\",\n", + " input_grid={\n", + " \"dataset\": [vm_test_ds],\n", + " \"model\": [vm_rag_k5_gpt35_model, vm_rag_k5_gpt4o_model],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Regard Score\n", + "\n", + "This test evaluates the sentiment and perceived regard—categorized as positive, negative, neutral, or other—in answers generated by the RAG model. This is important for identifying any biases or sentiment tendencies in responses, ensuring that generated answers are balanced and appropriate for the context. The uses a preloaded regard evaluation tool from Hugging Face to compute scores for each response. High skewness in regard scores, especially if the generated responses consistently diverge from expected sentiments in the reference texts, may reveal biases in the model’s generation, such as overly positive or negative tones where neutrality is expected." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.RegardScore\",\n", + " input_grid={\n", + " \"dataset\": [vm_test_ds],\n", + " \"model\": [vm_rag_k5_gpt35_model, vm_rag_k5_gpt4o_model],\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Conclusion\n", + "\n", + "In this notebook, we have seen how we can use LangChain and ValidMind together to build, evaluate and document a simple RAG Model as its developed. This is a great example of the interactive development experience that ValidMind is designed to support. We can quickly iterate on our model and document as we go... We have seen how ValidMind supports non-traditional \"models\" using a functional interface and how we can build pipelines of many models to support complex GenAI workflows.\n", + "\n", + "This is still a work in progress and we are actively developing new tests to support more advanced GenAI workflows. We are also keeping an eye on the most popular GenAI models and libraries to explore direct integrations. Stay tuned for more updates and new features in this area!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Upgrade ValidMind\n", + "\n", + "
After installing ValidMind, you’ll want to periodically make sure you are on the latest version to access any new features and other enhancements.
\n", + "\n", + "Retrieve the information for the currently installed version of ValidMind:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip show validmind" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If the version returned is lower than the version indicated in our [production open-source code](https://github.com/validmind/validmind-library/blob/prod/validmind/__version__.py), restart your notebook and run:\n", + "\n", + "```bash\n", + "%pip install --upgrade validmind\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You may need to restart your kernel after running the upgrade package for changes to be applied." + ] + }, + { + "cell_type": "markdown", + "id": "copyright-81adcc0373ee4729a477d2ccab3b93d8", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "***\n", + "\n", + "Copyright © 2023-2026 ValidMind Inc. All rights reserved.
\n", + "Refer to [LICENSE](https://github.com/validmind/validmind-library/blob/main/LICENSE) for details.
\n", + "SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ValidMind Library", + "language": "python", + "name": "validmind" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 } diff --git a/notebooks/use_cases/nlp_and_llm/rag_documentation_demo.ipynb b/notebooks/use_cases/nlp_and_llm/rag_documentation_demo.ipynb index aac1876d5..7e606f031 100644 --- a/notebooks/use_cases/nlp_and_llm/rag_documentation_demo.ipynb +++ b/notebooks/use_cases/nlp_and_llm/rag_documentation_demo.ipynb @@ -1,1692 +1,1693 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# RAG Model Documentation Demo\n", - "\n", - "In this notebook, we are going to implement a simple RAG Model for automating the process of answering RFP questions using GenAI. We will see how we can initialize an embedding model, a retrieval model and a generator model with LangChain components and use them within the ValidMind Library to run tests against them. Finally, we will see how we can put them together in a Pipeline and run that to get e2e results and run tests against that." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "::: {.content-hidden when-format=\"html\"}\n", - "## Contents \n", - "- [About ValidMind](#toc1__) \n", - " - [Before you begin](#toc1_1__) \n", - " - [New to ValidMind?](#toc1_2__) \n", - " - [Key concepts](#toc1_3__) \n", - "- [Setting up](#toc2__) \n", - " - [Initialize the ValidMind Library](#toc2_1__) \n", - " - [Register sample model](#toc2_1_1__) \n", - " - [Apply documentation template](#toc2_1_2__) \n", - " - [Get your code snippet](#toc2_1_3__) \n", - "- [Read Open AI API Key](#toc3__) \n", - "- [Dataset Loader](#toc4__) \n", - "- [Data validation](#toc5__) \n", - " - [Duplicates](#toc5_1__) \n", - " - [Stop Words](#toc5_2__) \n", - " - [Punctuations](#toc5_3__) \n", - " - [Common Words](#toc5_4__) \n", - " - [Language Detection](#toc5_5__) \n", - " - [Toxicity Score](#toc5_6__) \n", - " - [Polarity and Subjectivity](#toc5_7__) \n", - " - [Sentiment](#toc5_8__) \n", - " - [Assign Predictions](#toc5_9__) \n", - " - [Run tests](#toc5_10__) \n", - " - [Generate embeddings for the Train Set](#toc5_11__) \n", - " - [Insert embeddings and questions into Vector DB](#toc5_12__) \n", - "- [Prompt Evaluation](#toc6__) \n", - "- [RAGAS evaluation](#toc7__) \n", - " - [Semantic Similarity](#toc7_1__) \n", - " - [Context Entity Recall](#toc7_2__) \n", - " - [Context Precision](#toc7_3__) \n", - " - [Context Precision Without Reference](#toc7_4__) \n", - " - [Faithfulness](#toc7_5__) \n", - " - [Response Relevancy](#toc7_6__) \n", - " - [Context Recall](#toc7_7__) \n", - " - [Answer Correctness](#toc7_8__) \n", - " - [Aspect Critic](#toc7_9__) \n", - " - [Noise Sensitivity](#toc7_10__) \n", - "- [Generation quality](#toc8__) \n", - " - [Token Disparity](#toc8_1__) \n", - " - [ROUGE Score](#toc8_2__) \n", - " - [BLEU Score](#toc8_3__) \n", - " - [BERT Score](#toc8_4__) \n", - " - [METEOR Score](#toc8_5__) \n", - "- [Bias and Toxicity](#toc9__) \n", - " - [Toxicity Score](#toc9_1__) \n", - " - [Regard Score](#toc9_2__) \n", - "- [Upgrade ValidMind](#toc10__) \n", - "\n", - ":::\n", - "\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## About ValidMind\n", - 
"\n", - "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models.\n", - "\n", - "You use the ValidMind Library to automate documentation and validation tests, and then use the ValidMind Platform to collaborate on model documentation. Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators.\n", - "\n", - "\n", - "\n", - "### Before you begin\n", - "\n", - "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language. \n", - "\n", - "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html).\n", - "\n", - "\n", - "\n", - "### New to ValidMind?\n", - "\n", - "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", - "\n", - "
To access all features available in this notebook, you'll need a ValidMind account.\n", - "

\n", - "Register with ValidMind
\n", - "\n", - "\n", - "\n", - "### Key concepts\n", - "\n", - "- **FunctionModels**: ValidMind offers support for creating `VMModel` instances from Python functions. This enables us to support any \"model\" by simply using the provided function as the model's `predict` method.\n", - "- **PipelineModels**: ValidMind models (`VMModel` instances) of any type can be piped together to create a model pipeline. This allows model components to be created and tested/documented independently, and then combined into a single model for end-to-end testing and documentation. We use the `|` operator to pipe models together.\n", - "- **RAG**: RAG stands for Retrieval Augmented Generation and refers to a wide range of GenAI applications where some form of retrieval is used to add context to the prompt so that the LLM that generates content can refer to it when creating its output. In this notebook, we are going to implement a simple RAG setup using LangChain components." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Prerequisites\n", - "\n", - "Let's go ahead and install the `validmind` library if its not already installed... Then we can install the `qdrant-client` library for our vector store and `langchain` for everything else:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -q validmind" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -q qdrant-client langchain langchain-openai sentencepiece" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Setting up" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Initialize the ValidMind Library" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Register sample model\n", - "\n", - "Let's first register a sample model for use with this notebook:\n", - "\n", - "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", - "\n", - "2. In the left sidebar, navigate to **Inventory** and click **+ Register Model**.\n", - "\n", - "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", - "4. Select your own name under the **MODEL OWNER** drop-down.\n", - "\n", - "5. Click **Register Model** to add the model to your inventory." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Apply documentation template\n", - "\n", - "Once you've registered your model, let's select a documentation template. A template predefines sections for your model documentation and provides a general outline to follow, making the documentation process much easier.\n", - "\n", - "1. In the left sidebar that appears for your model, click **Documents** and select **Documentation**.\n", - "\n", - "2. Under **TEMPLATE**, select `Gen AI RAG`.\n", - "\n", - "3. Click **Use Template** to apply the template." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
Can't select this template?\n", - "

\n", - "Your organization administrators may need to add it to your template library:\n", - "\n", - "
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Get your code snippet\n", - "\n", - "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", - "\n", - "1. On the left sidebar that appears for your model, select **Getting Started** and click **Copy snippet to clipboard**.\n", - "2. Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Load your model identifier credentials from an `.env` file\n", - "\n", - "%load_ext dotenv\n", - "%dotenv .env\n", - "\n", - "# Or replace with your code snippet\n", - "\n", - "import validmind as vm\n", - "\n", - "vm.init(\n", - " # api_host=\"...\",\n", - " # api_key=\"...\",\n", - " # api_secret=\"...\",\n", - " # model=\"...\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Read Open AI API Key\n", - "\n", - "We will need to have an OpenAI API key to be able to use their `text-embedding-3-small` model for our embeddings, `gpt-3.5-turbo` model for our generator and `gpt-4o` model for our LLM-as-Judge tests. If you don't have an OpenAI API key, you can get one by signing up at [OpenAI](https://platform.openai.com/signup). Then you can create a `.env` file in the root of your project and the following cell will load it from there. Alternatively, you can just uncomment the line below to directly set the key (not recommended for security reasons)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# load openai api key\n", - "import os\n", - "\n", - "import dotenv\n", - "import nltk\n", - "\n", - "dotenv.load_dotenv()\n", - "nltk.download('stopwords')\n", - "nltk.download('punkt_tab')\n", - "\n", - "# os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"\n", - "\n", - "if not \"OPENAI_API_KEY\" in os.environ:\n", - " raise ValueError(\"OPENAI_API_KEY is not set\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Dataset Loader\n", - "\n", - "Great, now that we have all of our dependencies installed, the ValidMind Library initialized and connected to our model and our OpenAI API key setup, we can go ahead and load our datasets. We will use the synthetic `RFP` dataset included with ValidMind for this notebook. This dataset contains a variety of RFP questions and ground truth answers that we can use both as the source where our Retriever will search for similar question-answer pairs as well as our test set for evaluating the performance of our RAG model. To do this, we just have to load it and call the preprocess function to get a split of the data into train and test sets." 
- ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# Import the sample dataset from the library\n", - "from validmind.datasets.llm.rag import rfp\n", - "\n", - "raw_df = rfp.load_data()\n", - "train_df, test_df = rfp.preprocess(raw_df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_train_ds = vm.init_dataset(\n", - " train_df,\n", - " text_column=\"question\",\n", - " target_column=\"ground_truth\",\n", - ")\n", - "\n", - "vm_test_ds = vm.init_dataset(\n", - " test_df,\n", - " text_column=\"question\",\n", - " target_column=\"ground_truth\",\n", - ")\n", - "\n", - "vm_test_ds.df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Data validation\n", - "\n", - "Now that we have loaded our dataset, we can go ahead and run some data validation tests right away to start assessing and documenting the quality of our data. Since we are using a text dataset, we can use ValidMind's built-in array of text data quality tests to check that things like number of duplicates, missing values, and other common text data issues are not present in our dataset. We can also run some tests to check the sentiment and toxicity of our data." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Duplicates\n", - "\n", - "First, let's check for duplicates in our dataset. We can use the `validmind.data_validation.Duplicates` test and pass our dataset:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from validmind.tests import run_test\n", - "\n", - "run_test(\n", - " test_id=\"validmind.data_validation.Duplicates\",\n", - " inputs={\"dataset\": vm_train_ds},\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Stop Words\n", - "\n", - "Next, let's check for stop words in our dataset. We can use the `validmind.data_validation.StopWords` test and pass our dataset:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " test_id=\"validmind.data_validation.nlp.StopWords\",\n", - " inputs={\n", - " \"dataset\": vm_train_ds,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Punctuations\n", - "\n", - "Next, let's check for punctuations in our dataset. We can use the `validmind.data_validation.Punctuations` test:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " test_id=\"validmind.data_validation.nlp.Punctuations\",\n", - " inputs={\n", - " \"dataset\": vm_train_ds,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Common Words\n", - "\n", - "Next, let's check for common words in our dataset. 
We can use the `validmind.data_validation.CommonWord` test:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " test_id=\"validmind.data_validation.nlp.CommonWords\",\n", - " inputs={\n", - " \"dataset\": vm_train_ds,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Language Detection\n", - "\n", - "For documentation purposes, we can detect and log the languages used in the dataset with the `validmind.data_validation.LanguageDetection` test:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " test_id=\"validmind.data_validation.nlp.LanguageDetection\",\n", - " inputs={\n", - " \"dataset\": vm_train_ds,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Toxicity Score\n", - "\n", - "Now, let's go ahead and run the `validmind.data_validation.nlp.Toxicity` test to compute a toxicity score for our dataset:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.data_validation.nlp.Toxicity\",\n", - " inputs={\n", - " \"dataset\": vm_train_ds,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Polarity and Subjectivity\n", - "\n", - "We can also run the `validmind.data_validation.nlp.PolarityAndSubjectivity` test to compute the polarity and subjectivity of our dataset:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.data_validation.nlp.PolarityAndSubjectivity\",\n", - " inputs={\n", - " \"dataset\": vm_train_ds,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Sentiment\n", - "\n", - "Finally, we can run the `validmind.data_validation.nlp.Sentiment` test to plot the sentiment of our dataset:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.data_validation.nlp.Sentiment\",\n", - " inputs={\n", - " \"dataset\": vm_train_ds,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Embedding Model\n", - "\n", - "Now that we have our dataset loaded and have run some data validation tests to assess and document the quality of our data, we can go ahead and initialize our embedding model. We will use the `text-embedding-3-small` model from OpenAI for this purpose wrapped in the `OpenAIEmbeddings` class from LangChain. This model will be used to \"embed\" our questions both for inserting the question-answer pairs from the \"train\" set into the vector store and for embedding the question from inputs when making predictions with our RAG model." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from langchain_openai import OpenAIEmbeddings\n", - "\n", - "embedding_client = OpenAIEmbeddings(model=\"text-embedding-3-small\")\n", - "\n", - "\n", - "def embed(input):\n", - " \"\"\"Returns a text embedding for the given text\"\"\"\n", - " return embedding_client.embed_query(input[\"question\"])\n", - "\n", - "\n", - "vm_embedder = vm.init_model(input_id=\"embedding_model\", predict_fn=embed)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "What we have done here is to initialize the `OpenAIEmbeddings` class so it uses OpenAI's `text-embedding-3-small` model. We then created an `embed` function that takes in an `input` dictionary and uses the `embed_query` method of the embedding client to compute the embeddings of the `question`. We use an `embed` function since that is how ValidMind supports any custom model. We will use this strategy for the retrieval and generator models as well but you could also use, say, a HuggingFace model directly. See the documentation for more information on which model types are directly supported - [ValidMind Documentation](https://docs.validmind.ai/validmind/validmind.html)... Finally, we use the `init_model` function from the ValidMind Library to create a `VMModel` object that can be used in ValidMind tests. This also logs the model to our model documentation and any test that uses the model will be linked to the logged model and its metadata." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Assign Predictions\n", - "\n", - "To precompute the embeddings for our test set, we can call the `assign_predictions` method of our `vm_test_ds` object we created above. This will compute the embeddings for each question in the test set and store them in the a special prediction column of the test set thats linked to our `vm_embedder` model. This will allow us to use these embeddings later when we run tests against our embedding model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_test_ds.assign_predictions(vm_embedder)\n", - "print(vm_test_ds)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Run tests\n", - "\n", - "Now that everything is setup for the embedding model, we can go ahead and run some tests to assess and document the quality of our embeddings. We will use the `validmind.model_validation.embeddings.*` tests to compute a variety of metrics against our model." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.embeddings.StabilityAnalysisRandomNoise\",\n", - " inputs={\n", - " \"model\": vm_embedder,\n", - " \"dataset\": vm_test_ds,\n", - " },\n", - " params={\"probability\": 0.3},\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.embeddings.StabilityAnalysisSynonyms\",\n", - " inputs={\n", - " \"model\": vm_embedder,\n", - " \"dataset\": vm_test_ds,\n", - " },\n", - " params={\"probability\": 0.3},\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.embeddings.StabilityAnalysisTranslation\",\n", - " inputs={\n", - " \"model\": vm_embedder,\n", - " \"dataset\": vm_test_ds,\n", - " },\n", - " params={\n", - " \"source_lang\": \"en\",\n", - " \"target_lang\": \"fr\",\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.embeddings.CosineSimilarityHeatmap\",\n", - " inputs={\n", - " \"model\": vm_embedder,\n", - " \"dataset\": vm_test_ds,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "run_test(\n", - " \"validmind.model_validation.embeddings.CosineSimilarityDistribution\",\n", - " inputs={\n", - " \"model\": vm_embedder,\n", - " \"dataset\": vm_test_ds,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.embeddings.EuclideanDistanceHeatmap\",\n", - " inputs={\n", - " \"model\": vm_embedder,\n", - " \"dataset\": vm_test_ds,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.embeddings.PCAComponentsPairwisePlots\",\n", - " inputs={\n", - " \"model\": vm_embedder,\n", - " \"dataset\": vm_test_ds,\n", - " },\n", - " params={\"n_components\": 3},\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.embeddings.TSNEComponentsPairwisePlots\",\n", - " inputs={\n", - " \"model\": vm_embedder,\n", - " \"dataset\": vm_test_ds,\n", - " },\n", - " params={\"n_components\": 3, \"perplexity\": 20},\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Setup Vector Store\n", - "\n", - "Great, so now that we have assessed our embedding model and verified that it is performing well, we can go ahead and use it to compute embeddings for our question-answer pairs in the \"train\" set. We will then use these embeddings to insert the question-answer pairs into a vector store. We will use an in-memory `qdrant` vector database for demo purposes but any option would work just as well here. We will use the `QdrantClient` class from LangChain to interact with the vector store. This class will allow us to insert and search for embeddings in the vector store." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Generate embeddings for the Train Set\n", - "\n", - "We can use the same `assign_predictions` method from earlier except this time we will use the `vm_train_ds` object to compute the embeddings for the question-answer pairs in the \"train\" set." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_train_ds.assign_predictions(vm_embedder)\n", - "print(vm_train_ds)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Insert embeddings and questions into Vector DB\n", - "\n", - "Now that we have computed the embeddings for our question-answer pairs in the \"train\" set, we can go ahead and insert them into the vector store:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from langchain_community.vectorstores import Qdrant\n", - "from langchain_openai import OpenAIEmbeddings\n", - "from langchain_community.document_loaders import DataFrameLoader\n", - "\n", - "# load documents from dataframe\n", - "loader = DataFrameLoader(train_df, page_content_column=\"question\")\n", - "docs = loader.load()\n", - "# choose model using embedding client\n", - "embedding_client = OpenAIEmbeddings(model=\"text-embedding-3-small\")\n", - "\n", - "# setup vector datastore\n", - "qdrant = Qdrant.from_documents(\n", - " docs,\n", - " embedding_client,\n", - " location=\":memory:\", # Local mode with in-memory storage only\n", - " collection_name=\"rfp_rag_collection\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Retrieval Model\n", - "\n", - "Now that we have an embedding model and a vector database setup and loaded with our data, we need a Retrieval model that can search for similar question-answer pairs for a given input question. Once created, we can initialize this as a ValidMind model and `assign_predictions` to it just like our embedding model." - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "def retrieve(input):\n", - " contexts = []\n", - "\n", - " for result in qdrant.similarity_search_with_score(input[\"question\"]):\n", - " document, score = result\n", - " context = f\"Q: {document.page_content}\\n\"\n", - " context += f\"A: {document.metadata['ground_truth']}\\n\"\n", - "\n", - " contexts.append(context)\n", - "\n", - " return contexts\n", - "\n", - "\n", - "vm_retriever = vm.init_model(input_id=\"retrieval_model\", predict_fn=retrieve)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_test_ds.assign_predictions(model=vm_retriever)\n", - "print(vm_test_ds)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Generation Model\n", - "\n", - "As the final piece of this simple RAG pipeline, we can create and initialize a generation model that will use the retrieved context to generate an answer to the input question. We will use the `gpt-3.5-turbo` model from OpenAI." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from openai import OpenAI\n", - "\n", - "from validmind.models import Prompt\n", - "\n", - "\n", - "system_prompt = \"\"\"\n", - "You are an expert RFP AI assistant.\n", - "You are tasked with answering new RFP questions based on existing RFP questions and answers.\n", - "You will be provided with the existing RFP questions and answer pairs that are the most relevant to the new RFP question.\n", - "After that you will be provided with a new RFP question.\n", - "You will generate an answer and respond only with the answer.\n", - "Ignore your pre-existing knowledge and answer the question based on the provided context.\n", - "\"\"\".strip()\n", - "\n", - "openai_client = OpenAI()\n", - "\n", - "\n", - "def generate(input):\n", - " response = openai_client.chat.completions.create(\n", - " model=\"gpt-3.5-turbo\",\n", - " messages=[\n", - " {\"role\": \"system\", \"content\": system_prompt},\n", - " {\"role\": \"user\", \"content\": \"\\n\\n\".join(input[\"retrieval_model\"])},\n", - " {\"role\": \"user\", \"content\": input[\"question\"]},\n", - " ],\n", - " )\n", - "\n", - " return response.choices[0].message.content\n", - "\n", - "\n", - "vm_generator = vm.init_model(\n", - " input_id=\"generation_model\",\n", - " predict_fn=generate,\n", - " prompt=Prompt(template=system_prompt),\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's test it out real quick:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "vm_generator.predict(\n", - " pd.DataFrame(\n", - " {\"retrieval_model\": [[\"My name is anil\"]], \"question\": [\"what is my name\"]}\n", - " )\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Prompt Evaluation\n", - "\n", - "Now that we have our generator model initialized, we can run some LLM-as-Judge tests to evaluate the system prompt. This will allow us to get an initial sense of how well the prompt meets a few best practices for prompt engineering. These tests use an LLM to rate the prompt on a scale of 1-10 against the following criteria:\n", - "\n", - "- **Examplar Bias**: When using multi-shot prompting, does the prompt contain an unbiased distribution of examples?\n", - "- **Delimitation**: When using complex prompts containing examples, contextual information, or other elements, is the prompt formatted in such a way that each element is clearly separated?\n", - "- **Clarity**: How clearly the prompt states the task.\n", - "- **Conciseness**: How succinctly the prompt states the task.\n", - "- **Instruction Framing**: Whether the prompt contains negative instructions.\n", - "- **Specificity**: How specific the prompt defines the task." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.prompt_validation.Bias\",\n", - " inputs={\n", - " \"model\": vm_generator,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.prompt_validation.Clarity\",\n", - " inputs={\n", - " \"model\": vm_generator,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.prompt_validation.Conciseness\",\n", - " inputs={\n", - " \"model\": vm_generator,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.prompt_validation.Delimitation\",\n", - " inputs={\n", - " \"model\": vm_generator,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.prompt_validation.NegativeInstruction\",\n", - " inputs={\n", - " \"model\": vm_generator,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.prompt_validation.Specificity\",\n", - " inputs={\n", - " \"model\": vm_generator,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Setup RAG Pipeline Model\n", - "\n", - "Now that we have all of our individual \"component\" models setup and initialized we need some way to put them all together in a single \"pipeline\". We can use the `PipelineModel` class to do this. This ValidMind model type simply wraps any number of other ValidMind models and runs them in sequence. We can use a pipe(`|`) operator - in Python this is normally an `or` operator but we have overloaded it for easy pipeline creation - to chain together our models. We can then initialize this pipeline model and assign predictions to it just like any other model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_rag_model = vm.init_model(vm_retriever | vm_generator, input_id=\"rag_model\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can `assign_predictions` to the pipeline model just like we did with the individual models. This will run the pipeline on the test set and store the results in the test set for later use." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_test_ds.assign_predictions(model=vm_rag_model)\n", - "print(vm_test_ds)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vm_test_ds._df.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Run tests" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## RAGAS evaluation\n", - "\n", - "Let's go ahead and run some of our new RAG tests against our model...\n", - "\n", - "> Note: these tests are still being developed and are not yet in a stable state. 
We are using advanced tests here that use LLM-as-Judge and other strategies to assess things like the relevancy of the retrieved context to the input question and the correctness of the generated answer when compared to the ground truth. There is more to come in this area so stay tuned!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import warnings\n", - "\n", - "warnings.filterwarnings(\"ignore\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Semantic Similarity\n", - "\n", - "The concept of Answer Semantic Similarity pertains to the assessment of the semantic resemblance between the generated answer and the ground truth. This evaluation is based on the ground truth and the answer, with values falling within the range of 0 to 1. A higher score signifies a better alignment between the generated answer and the ground truth.\n", - "\n", - "Measuring the semantic similarity between answers can offer valuable insights into the quality of the generated response. This evaluation utilizes a cross-encoder model to calculate the semantic similarity score." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.SemanticSimilarity\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " params={\n", - " \"response_column\": \"rag_model_prediction\",\n", - " \"reference_column\": \"ground_truth\",\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Context Entity Recall\n", - "\n", - "This test gives the measure of recall of the retrieved context, based on the number of entities present in both ground_truths and contexts relative to the number of entities present in the ground_truths alone. Simply put, it is a measure of what fraction of entities are recalled from ground_truths. This test is useful in fact-based use cases like tourism help desk, historical QA, etc. This test can help evaluate the retrieval mechanism for entities, based on comparison with entities present in ground_truths, because in cases where entities matter, we need the contexts which cover them." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.ContextEntityRecall\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " params={\n", - " \"reference_column\": \"ground_truth\",\n", - " \"retrieved_contexts_column\": \"retrieval_model_prediction\",\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Context Precision\n", - "\n", - "Context Precision is a test that evaluates whether all of the ground-truth relevant items present in the contexts are ranked higher or not. Ideally all the relevant chunks must appear at the top ranks. This test is computed using the question, ground_truth and the contexts, with values ranging between 0 and 1, where higher scores indicate better precision." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.ContextPrecision\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " params={\n", - " \"user_input_column\": \"question\",\n", - " \"retrieved_contexts_column\": \"retrieval_model_prediction\",\n", - " \"reference_column\": \"ground_truth\",\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Context Precision Without Reference\n", - "\n", - "This test evaluates whether retrieved contexts align well with the expected response for a given user input, without requiring a ground-truth reference. It assesses the relevance of each retrieved context chunk by comparing it directly to the response." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.ContextPrecisionWithoutReference\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " params={\n", - " \"user_input_column\": \"question\",\n", - " \"retrieved_contexts_column\": \"retrieval_model_prediction\",\n", - " \"response_column\": \"rag_model_prediction\",\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Faithfulness\n", - "\n", - "This test measures the factual consistency of the generated answer against the given context. It is calculated from the answer and the retrieved context. The score is scaled to the (0,1) range; higher is better.\n", - "\n", - "The generated answer is regarded as faithful if all the claims made in the answer can be inferred from the given context. To calculate this, a set of claims from the generated answer is first identified. Then each of these claims is cross-checked against the given context to determine whether or not it can be inferred from that context." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.Faithfulness\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " params={\n", - " \"user_input_column\": \"question\",\n", - " \"response_column\": \"rag_model_prediction\",\n", - " \"retrieved_contexts_column\": \"retrieval_model_prediction\",\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Response Relevancy\n", - "\n", - "The Response Relevancy test focuses on assessing how pertinent the generated answer is to the given prompt. A lower score is assigned to answers that are incomplete or contain redundant information, and higher scores indicate better relevancy. This test is computed using the question, the context, and the answer.\n", - "\n", - "The Response Relevancy is defined as the mean cosine similarity of the original question to a number of artificial questions, which were generated (reverse engineered) based on the answer.\n", - "\n", - "Please note that even though in practice the score will range between 0 and 1 most of the time, this is not mathematically guaranteed, because cosine similarity ranges from -1 to 1.\n", - "\n", - "> Note: This is a reference-free test. If you’re looking to compare the ground truth answer with the generated answer, refer to Answer Correctness.\n", - "\n", - "An answer is deemed relevant when it directly and appropriately addresses the original question. 
Importantly, our assessment of answer relevance does not consider factuality but instead penalizes cases where the answer lacks completeness or contains redundant details. To calculate this score, the LLM is prompted to generate an appropriate question for the generated answer multiple times, and the mean cosine similarity between these generated questions and the original question is measured. The underlying idea is that if the generated answer accurately addresses the initial question, the LLM should be able to generate questions from the answer that align with the original question." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.ResponseRelevancy\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " params={\n", - " \"user_input_column\": \"question\",\n", - " \"response_column\": \"rag_model_prediction\",\n", - " \"retrieved_contexts_column\": \"retrieval_model_prediction\",\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Context Recall\n", - "\n", - "Context recall measures the extent to which the retrieved context aligns with the annotated answer, treated as the ground truth. It is computed based on the ground truth and the retrieved context, and the values range between 0 and 1, with higher values indicating better performance.\n", - "\n", - "To estimate context recall from the ground truth answer, each sentence in the ground truth answer is analyzed to determine whether it can be attributed to the retrieved context or not. In an ideal scenario, all sentences in the ground truth answer should be attributable to the retrieved context." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.ContextRecall\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " params={\n", - " \"user_input_column\": \"question\",\n", - " \"retrieved_contexts_column\": \"retrieval_model_prediction\",\n", - " \"reference_column\": \"ground_truth\",\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Answer Correctness\n", - "\n", - "The assessment of Answer Correctness involves gauging the accuracy of the generated answer when compared to the ground truth. This evaluation relies on the ground truth and the answer, with scores ranging from 0 to 1. A higher score indicates a closer alignment between the generated answer and the ground truth, signifying better correctness.\n", - "\n", - "Answer correctness encompasses two critical aspects: semantic similarity between the generated answer and the ground truth, as well as factual similarity. These aspects are combined using a weighted scheme to formulate the answer correctness score.\n", - "\n", - "Factual correctness quantifies the factual overlap between the generated answer and the ground truth answer. This is done using the concepts of:\n", - "\n", - "- TP (True Positive): Facts or statements that are present in both the ground truth and the generated answer.\n", - "- FP (False Positive): Facts or statements that are present in the generated answer but not in the ground truth.\n", - "- FN (False Negative): Facts or statements that are present in the ground truth but not in the generated answer." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.AnswerCorrectness\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " params={\n", - " \"user_input_column\": \"question\",\n", - " \"response_column\": \"rag_model_prediction\",\n", - " \"reference_column\": \"ground_truth\",\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Aspect Critic\n", - "\n", - "This is designed to assess submissions based on predefined aspects such as harmlessness and correctness. Additionally, users have the flexibility to define their own aspects for evaluating submissions according to their specific criteria. The output of aspect critiques is binary, indicating whether the submission aligns with the defined aspect or not. This evaluation is performed using the ‘answer’ as input.\n", - "\n", - "Critiques within the LLM evaluators evaluate submissions based on the provided aspect. Ragas Critiques offers a range of predefined aspects like correctness, harmfulness, etc. Users can also define their own aspects for evaluating submissions based on their specific criteria. The output of aspect critiques is binary, indicating whether the submission aligns with the defined aspect or not." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.AspectCritic\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " params={\n", - " \"user_input_column\": \"question\",\n", - " \"response_column\": \"rag_model_prediction\",\n", - " \"retrieved_contexts_column\": \"retrieval_model_prediction\",\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Noise Sensitivity\n", - "\n", - "This test is designed to evaluate the robustness of the RAG pipeline model against noise in the retrieved context. It works by checking how well the \"claims\" in the generated answer match up with the \"claims\" in the ground truth answer. If the generated answer contains \"claims\" from the contexts that the ground truth answer does not contain, those claims are considered incorrect. The score for each answer is the number of incorrect claims divided by the total number of claims. This *can* be interpreted as a measure of how sensitive the LLM is to \"noise\" in the context where \"noise\" is information that is relevant but should not be included in the answer since the ground truth answer does not contain it." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ragas.NoiseSensitivity\",\n", - " inputs={\"dataset\": vm_test_ds},\n", - " params={\n", - " \"user_input_column\": \"question\",\n", - " \"response_column\": \"rag_model_prediction\",\n", - " \"reference_column\": \"ground_truth\",\n", - " \"retrieved_contexts_column\": \"retrieval_model_prediction\",\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Generation quality\n", - "\n", - "In this section, we evaluate the alignment and relevance of generated responses to reference outputs within our retrieval-augmented generation (RAG) application. 
We use metrics that assess various quality dimensions of the generated responses, including semantic similarity, structural alignment, and phrasing overlap. Semantic similarity metrics compare embeddings of generated and reference text to capture deeper contextual alignment, while overlap and alignment measures quantify how well the phrasing and structure of generated responses match the intended outputs." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Token Disparity\n", - "\n", - "This test assesses the difference in token counts between the reference texts (ground truth) and the answers generated by the RAG model. It helps evaluate how well the model's outputs align with the expected length and level of detail in the reference texts. A significant disparity in token counts could signal issues with generation quality, such as excessive verbosity or insufficient detail. Consistently low token counts in generated answers compared to references might suggest that the model’s outputs are incomplete or overly concise, missing important contextual information." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.TokenDisparity\",\n", - " inputs={\n", - " \"dataset\": vm_test_ds,\n", - " \"model\": vm_rag_model,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### ROUGE Score\n", - "\n", - "This test evaluates the quality of answers generated by the RAG model by measuring overlaps in n-grams, word sequences, and word pairs between the model output and the reference (ground truth) text. ROUGE, short for Recall-Oriented Understudy for Gisting Evaluation, assesses both precision and recall, providing a balanced view of how well the generated response captures the reference content. ROUGE precision measures the proportion of n-grams in the generated text that match the reference, highlighting relevance and conciseness, while ROUGE recall assesses the proportion of reference n-grams present in the generated text, indicating completeness and thoroughness. \n", - "\n", - "Low precision scores might reveal that the generated text includes redundant or irrelevant information, while low recall scores suggest omissions of essential details from the reference. Consistently low ROUGE scores could indicate poor overall alignment with the ground truth, suggesting the model may be missing key content or failing to capture the intended meaning." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.RougeScore\",\n", - " inputs={\n", - " \"dataset\": vm_test_ds,\n", - " \"model\": vm_rag_model,\n", - " },\n", - " params={\n", - " \"metric\": \"rouge-1\",\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### BLEU Score\n", - "\n", - "The BLEU Score test evaluates the quality of answers generated by the RAG model by measuring n-gram overlap between the generated text and the reference (ground truth) text, with a specific focus on exact precision in phrasing. 
While ROUGE precision also assesses overlap, BLEU differs in two main ways: first, it applies a geometric average across multiple n-gram levels, capturing precise phrase alignment, and second, it includes a brevity penalty to prevent overly short outputs from inflating scores artificially. This added precision focus is valuable in RAG applications where strict adherence to reference language is essential, as BLEU emphasizes the match to exact phrasing. In contrast, ROUGE precision evaluates general content overlap without penalizing brevity, offering a broader sense of content alignment." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.BleuScore\",\n", - " inputs={\n", - " \"dataset\": vm_test_ds,\n", - " \"model\": vm_rag_model,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### BERT Score\n", - "\n", - "This test evaluates the quality of the RAG generated answers using BERT embeddings to measure precision, recall, and F1 scores based on semantic similarity, rather than exact n-gram matches as in BLEU and ROUGE. This approach captures contextual meaning, making it valuable when wording differs but the intended message closely aligns with the reference. In RAG applications, the BERT score is especially useful for ensuring that generated answers convey the reference text’s meaning, even if phrasing varies. Consistently low scores indicate a lack of semantic alignment, suggesting the model may miss or misrepresent key content. Low precision may reflect irrelevant or redundant details, while low recall can indicate omissions." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.BertScore\",\n", - " inputs={\n", - " \"dataset\": vm_test_ds,\n", - " \"model\": vm_rag_model,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### METEOR Score\n", - "\n", - "This test evaluates the quality of the generated answers by measuring alignment with the ground truth, emphasizing both accuracy and fluency. Unlike BLEU and ROUGE, which focus on n-gram matches, METEOR combines precision, recall, synonym matching, and word order, focusing at how well the generated text conveys meaning and reads naturally. This metric is especially useful for RAG applications where sentence structure and natural flow are crucial for clear communication. Lower scores may suggest alignment issues, indicating that the answers may lack fluency or key content. Discrepancies in word order or high fragmentation penalties can reveal problems with how the model constructs sentences, potentially affecting readability." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.MeteorScore\",\n", - " inputs={\n", - " \"dataset\": vm_test_ds,\n", - " \"model\": vm_rag_model,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Bias and Toxicity\n", - "\n", - "In this section, we use metrics like Toxicity Score and Regard Score to evaluate both the generated responses and the ground truth. 
These tests helps us detect any harmful, offensive, or inappropriate language and evaluate the level of bias and neutrality enabling us to assess and mitigate potential biases in both the model's responses and the original dataset." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Toxicity Score\n", - "\n", - "This test measures the level of harmful or offensive content in the generated answers. The test uses a preloaded toxicity detection tool from Hugging Face, which identifies language that may be inappropriate, aggressive, or derogatory. High toxicity scores indicate potentially toxic content, while consistently elevated scores across multiple outputs may signal underlying issues in the model’s generation process that require attention to prevent the spread of harmful language." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.ToxicityScore\",\n", - " inputs={\n", - " \"dataset\": vm_test_ds,\n", - " \"model\": vm_rag_model,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Regard Score\n", - "\n", - "This test evaluates the sentiment and perceived regard—categorized as positive, negative, neutral, or other—in answers generated by the RAG model. This is important for identifying any biases or sentiment tendencies in responses, ensuring that generated answers are balanced and appropriate for the context. The uses a preloaded regard evaluation tool from Hugging Face to compute scores for each response. High skewness in regard scores, especially if the generated responses consistently diverge from expected sentiments in the reference texts, may reveal biases in the model’s generation, such as overly positive or negative tones where neutrality is expected." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run_test(\n", - " \"validmind.model_validation.RegardScore\",\n", - " inputs={\n", - " \"dataset\": vm_test_ds,\n", - " \"model\": vm_rag_model,\n", - " },\n", - ").log()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Conclusion\n", - "\n", - "In this notebook, we have seen how we can use LangChain and ValidMind together to build, evaluate and document a simple RAG Model as its developed. This is a great example of the interactive development experience that ValidMind is designed to support. We can quickly iterate on our model and document as we go... We have seen how ValidMind supports non-traditional \"models\" using a functional interface and how we can build pipelines of many models to support complex GenAI workflows.\n", - "\n", - "This is still a work in progress and we are actively developing new tests to support more advanced GenAI workflows. We are also keeping an eye on the most popular GenAI models and libraries to explore direct integrations. Stay tuned for more updates and new features in this area!" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Upgrade ValidMind\n", - "\n", - "
After installing ValidMind, you’ll want to periodically make sure you are on the latest version to access any new features and other enhancements.
\n", - "\n", - "Retrieve the information for the currently installed version of ValidMind:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip show validmind" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If the version returned is lower than the version indicated in our [production open-source code](https://github.com/validmind/validmind-library/blob/prod/validmind/__version__.py), restart your notebook and run:\n", - "\n", - "```bash\n", - "%pip install --upgrade validmind\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You may need to restart your kernel after running the upgrade package for changes to be applied." - ] - }, - { - "cell_type": "markdown", - "id": "copyright-397fa35a68a34dc38f5d84d797fb5331", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "***\n", - "\n", - "Copyright © 2023-2026 ValidMind Inc. All rights reserved.
\n", - "Refer to [LICENSE](https://github.com/validmind/validmind-library/blob/main/LICENSE) for details.
\n", - "SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "validmind-py3.10", - "language": "python", - "name": "validmind-py3.10" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 2 + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# RAG Model Documentation Demo\n", + "\n", + "In this notebook, we are going to implement a simple RAG Model for automating the process of answering RFP questions using GenAI. We will see how we can initialize an embedding model, a retrieval model and a generator model with LangChain components and use them within the ValidMind Library to run tests against them. Finally, we will see how we can put them together in a Pipeline and run that to get e2e results and run tests against that." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "::: {.content-hidden when-format=\"html\"}\n", + "## Contents \n", + "- [About ValidMind](#toc1__) \n", + " - [Before you begin](#toc1_1__) \n", + " - [New to ValidMind?](#toc1_2__) \n", + " - [Key concepts](#toc1_3__) \n", + "- [Setting up](#toc2__) \n", + " - [Initialize the ValidMind Library](#toc2_1__) \n", + " - [Register sample model](#toc2_1_1__) \n", + " - [Apply documentation template](#toc2_1_2__) \n", + " - [Get your code snippet](#toc2_1_3__) \n", + "- [Read Open AI API Key](#toc3__) \n", + "- [Dataset Loader](#toc4__) \n", + "- [Data validation](#toc5__) \n", + " - [Duplicates](#toc5_1__) \n", + " - [Stop Words](#toc5_2__) \n", + " - [Punctuations](#toc5_3__) \n", + " - [Common Words](#toc5_4__) \n", + " - [Language Detection](#toc5_5__) \n", + " - [Toxicity Score](#toc5_6__) \n", + " - [Polarity and Subjectivity](#toc5_7__) \n", + " - [Sentiment](#toc5_8__) \n", + " - [Assign Predictions](#toc5_9__) \n", + " - [Run tests](#toc5_10__) \n", + " - [Generate embeddings for the Train Set](#toc5_11__) \n", + " - [Insert embeddings and questions into Vector DB](#toc5_12__) \n", + "- [Prompt Evaluation](#toc6__) \n", + "- [RAGAS evaluation](#toc7__) \n", + " - [Semantic Similarity](#toc7_1__) \n", + " - [Context Entity Recall](#toc7_2__) \n", + " - [Context Precision](#toc7_3__) \n", + " - [Context Precision Without Reference](#toc7_4__) \n", + " - [Faithfulness](#toc7_5__) \n", + " - [Response Relevancy](#toc7_6__) \n", + " - [Context Recall](#toc7_7__) \n", + " - [Answer Correctness](#toc7_8__) \n", + " - [Aspect Critic](#toc7_9__) \n", + " - [Noise Sensitivity](#toc7_10__) \n", + "- [Generation quality](#toc8__) \n", + " - [Token Disparity](#toc8_1__) \n", + " - [ROUGE Score](#toc8_2__) \n", + " - [BLEU Score](#toc8_3__) \n", + " - [BERT Score](#toc8_4__) \n", + " - [METEOR Score](#toc8_5__) \n", + "- [Bias and Toxicity](#toc9__) \n", + " - [Toxicity Score](#toc9_1__) \n", + " - [Regard Score](#toc9_2__) \n", + "- [Upgrade ValidMind](#toc10__) \n", + "\n", + ":::\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## About ValidMind\n", + "\n", + "ValidMind is a suite of tools for managing model risk, including risk associated with AI and statistical models.\n", + "\n", + "You use the ValidMind Library to automate documentation and validation tests, and then use the ValidMind Platform to collaborate on model documentation. 
Together, these products simplify model risk management, facilitate compliance with regulations and institutional standards, and enhance collaboration between yourself and model validators.\n", + "\n", + "\n", + "\n", + "### Before you begin\n", + "\n", + "This notebook assumes you have basic familiarity with Python, including an understanding of how functions work. If you are new to Python, you can still run the notebook but we recommend further familiarizing yourself with the language. \n", + "\n", + "If you encounter errors due to missing modules in your Python environment, install the modules with `pip install`, and then re-run the notebook. For more help, refer to [Installing Python Modules](https://docs.python.org/3/installing/index.html).\n", + "\n", + "\n", + "\n", + "### New to ValidMind?\n", + "\n", + "If you haven't already seen our documentation on the [ValidMind Library](https://docs.validmind.ai/developer/validmind-library.html), we recommend you begin by exploring the available resources in this section. There, you can learn more about documenting models and running tests, as well as find code samples and our Python Library API reference.\n", + "\n", + "
For access to all features available in this notebook, you'll need access to a ValidMind account.\n", + "

\n", + "Register with ValidMind
\n", + "\n", + "\n", + "\n", + "### Key concepts\n", + "\n", + "- **FunctionModels**: ValidMind offers support for creating `VMModel` instances from Python functions. This enables us to support any \"model\" by simply using the provided function as the model's `predict` method.\n", + "- **PipelineModels**: ValidMind models (`VMModel` instances) of any type can be piped together to create a model pipeline. This allows model components to be created and tested/documented independently, and then combined into a single model for end-to-end testing and documentation. We use the `|` operator to pipe models together.\n", + "- **RAG**: RAG stands for Retrieval Augmented Generation and refers to a wide range of GenAI applications where some form of retrieval is used to add context to the prompt so that the LLM that generates content can refer to it when creating its output. In this notebook, we are going to implement a simple RAG setup using LangChain components." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prerequisites\n", + "\n", + "Let's go ahead and install the `validmind` library if its not already installed... Then we can install the `qdrant-client` library for our vector store and `langchain` for everything else:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q validmind" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q qdrant-client langchain langchain-openai sentencepiece" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Setting up" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Initialize the ValidMind Library" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Register sample model\n", + "\n", + "Let's first register a sample model for use with this notebook:\n", + "\n", + "1. In a browser, [log in to ValidMind](https://docs.validmind.ai/guide/configuration/log-in-to-validmind.html).\n", + "\n", + "2. In the left sidebar, navigate to **Inventory** and click **+ Register Model**.\n", + "\n", + "3. Enter the model details and click **Next >** to continue to assignment of model stakeholders. ([Need more help?](https://docs.validmind.ai/guide/model-inventory/register-models-in-inventory.html))\n", + "4. Select your own name under the **MODEL OWNER** drop-down.\n", + "\n", + "5. Click **Register Model** to add the model to your inventory." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Apply documentation template\n", + "\n", + "Once you've registered your model, let's select a documentation template. A template predefines sections for your model documentation and provides a general outline to follow, making the documentation process much easier.\n", + "\n", + "1. In the left sidebar that appears for your model, click **Documents** and select **Documentation**.\n", + "\n", + "2. Under **TEMPLATE**, select `Gen AI RAG`.\n", + "\n", + "3. Click **Use Template** to apply the template." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Can't select this template?\n", + "

\n", + "Your organization administrators may need to add it to your template library:\n", + "\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "#### Get your code snippet\n", + "\n", + "ValidMind generates a unique _code snippet_ for each registered model to connect with your developer environment. You initialize the ValidMind Library with this code snippet, which ensures that your documentation and tests are uploaded to the correct model when you run the notebook.\n", + "\n", + "1. On the left sidebar that appears for your model, select **Getting Started** and click **Copy snippet to clipboard**.\n", + "2. Next, [load your model identifier credentials from an `.env` file](https://docs.validmind.ai/developer/model-documentation/store-credentials-in-env-file.html) or replace the placeholder with your own code snippet:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load your model identifier credentials from an `.env` file\n", + "\n", + "%load_ext dotenv\n", + "%dotenv .env\n", + "\n", + "# Or replace with your code snippet\n", + "\n", + "import validmind as vm\n", + "\n", + "vm.init(\n", + " # api_host=\"...\",\n", + " # api_key=\"...\",\n", + " # api_secret=\"...\",\n", + " # model=\"...\",\n", + " # document=\"documentation\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Read Open AI API Key\n", + "\n", + "We will need to have an OpenAI API key to be able to use their `text-embedding-3-small` model for our embeddings, `gpt-3.5-turbo` model for our generator and `gpt-4o` model for our LLM-as-Judge tests. If you don't have an OpenAI API key, you can get one by signing up at [OpenAI](https://platform.openai.com/signup). Then you can create a `.env` file in the root of your project and the following cell will load it from there. Alternatively, you can just uncomment the line below to directly set the key (not recommended for security reasons)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# load openai api key\n", + "import os\n", + "\n", + "import dotenv\n", + "import nltk\n", + "\n", + "dotenv.load_dotenv()\n", + "nltk.download('stopwords')\n", + "nltk.download('punkt_tab')\n", + "\n", + "# os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"\n", + "\n", + "if not \"OPENAI_API_KEY\" in os.environ:\n", + " raise ValueError(\"OPENAI_API_KEY is not set\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Dataset Loader\n", + "\n", + "Great, now that we have all of our dependencies installed, the ValidMind Library initialized and connected to our model and our OpenAI API key setup, we can go ahead and load our datasets. We will use the synthetic `RFP` dataset included with ValidMind for this notebook. This dataset contains a variety of RFP questions and ground truth answers that we can use both as the source where our Retriever will search for similar question-answer pairs as well as our test set for evaluating the performance of our RAG model. To do this, we just have to load it and call the preprocess function to get a split of the data into train and test sets." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Import the sample dataset from the library\n", + "from validmind.datasets.llm.rag import rfp\n", + "\n", + "raw_df = rfp.load_data()\n", + "train_df, test_df = rfp.preprocess(raw_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_train_ds = vm.init_dataset(\n", + " train_df,\n", + " text_column=\"question\",\n", + " target_column=\"ground_truth\",\n", + ")\n", + "\n", + "vm_test_ds = vm.init_dataset(\n", + " test_df,\n", + " text_column=\"question\",\n", + " target_column=\"ground_truth\",\n", + ")\n", + "\n", + "vm_test_ds.df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Data validation\n", + "\n", + "Now that we have loaded our dataset, we can go ahead and run some data validation tests right away to start assessing and documenting the quality of our data. Since we are using a text dataset, we can use ValidMind's built-in array of text data quality tests to check that things like number of duplicates, missing values, and other common text data issues are not present in our dataset. We can also run some tests to check the sentiment and toxicity of our data." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Duplicates\n", + "\n", + "First, let's check for duplicates in our dataset. We can use the `validmind.data_validation.Duplicates` test and pass our dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from validmind.tests import run_test\n", + "\n", + "run_test(\n", + " test_id=\"validmind.data_validation.Duplicates\",\n", + " inputs={\"dataset\": vm_train_ds},\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Stop Words\n", + "\n", + "Next, let's check for stop words in our dataset. We can use the `validmind.data_validation.StopWords` test and pass our dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " test_id=\"validmind.data_validation.nlp.StopWords\",\n", + " inputs={\n", + " \"dataset\": vm_train_ds,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Punctuations\n", + "\n", + "Next, let's check for punctuations in our dataset. We can use the `validmind.data_validation.Punctuations` test:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " test_id=\"validmind.data_validation.nlp.Punctuations\",\n", + " inputs={\n", + " \"dataset\": vm_train_ds,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Common Words\n", + "\n", + "Next, let's check for common words in our dataset. 
We can use the `validmind.data_validation.CommonWord` test:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " test_id=\"validmind.data_validation.nlp.CommonWords\",\n", + " inputs={\n", + " \"dataset\": vm_train_ds,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Language Detection\n", + "\n", + "For documentation purposes, we can detect and log the languages used in the dataset with the `validmind.data_validation.LanguageDetection` test:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " test_id=\"validmind.data_validation.nlp.LanguageDetection\",\n", + " inputs={\n", + " \"dataset\": vm_train_ds,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Toxicity Score\n", + "\n", + "Now, let's go ahead and run the `validmind.data_validation.nlp.Toxicity` test to compute a toxicity score for our dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.data_validation.nlp.Toxicity\",\n", + " inputs={\n", + " \"dataset\": vm_train_ds,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Polarity and Subjectivity\n", + "\n", + "We can also run the `validmind.data_validation.nlp.PolarityAndSubjectivity` test to compute the polarity and subjectivity of our dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.data_validation.nlp.PolarityAndSubjectivity\",\n", + " inputs={\n", + " \"dataset\": vm_train_ds,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Sentiment\n", + "\n", + "Finally, we can run the `validmind.data_validation.nlp.Sentiment` test to plot the sentiment of our dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.data_validation.nlp.Sentiment\",\n", + " inputs={\n", + " \"dataset\": vm_train_ds,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Embedding Model\n", + "\n", + "Now that we have our dataset loaded and have run some data validation tests to assess and document the quality of our data, we can go ahead and initialize our embedding model. We will use the `text-embedding-3-small` model from OpenAI for this purpose wrapped in the `OpenAIEmbeddings` class from LangChain. This model will be used to \"embed\" our questions both for inserting the question-answer pairs from the \"train\" set into the vector store and for embedding the question from inputs when making predictions with our RAG model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_openai import OpenAIEmbeddings\n", + "\n", + "embedding_client = OpenAIEmbeddings(model=\"text-embedding-3-small\")\n", + "\n", + "\n", + "def embed(input):\n", + " \"\"\"Returns a text embedding for the given text\"\"\"\n", + " return embedding_client.embed_query(input[\"question\"])\n", + "\n", + "\n", + "vm_embedder = vm.init_model(input_id=\"embedding_model\", predict_fn=embed)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "What we have done here is to initialize the `OpenAIEmbeddings` class so it uses OpenAI's `text-embedding-3-small` model. We then created an `embed` function that takes in an `input` dictionary and uses the `embed_query` method of the embedding client to compute the embeddings of the `question`. We use an `embed` function since that is how ValidMind supports any custom model. We will use this strategy for the retrieval and generator models as well but you could also use, say, a HuggingFace model directly. See the documentation for more information on which model types are directly supported - [ValidMind Documentation](https://docs.validmind.ai/validmind/validmind.html)... Finally, we use the `init_model` function from the ValidMind Library to create a `VMModel` object that can be used in ValidMind tests. This also logs the model to our model documentation and any test that uses the model will be linked to the logged model and its metadata." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Assign Predictions\n", + "\n", + "To precompute the embeddings for our test set, we can call the `assign_predictions` method of our `vm_test_ds` object we created above. This will compute the embeddings for each question in the test set and store them in the a special prediction column of the test set thats linked to our `vm_embedder` model. This will allow us to use these embeddings later when we run tests against our embedding model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_ds.assign_predictions(vm_embedder)\n", + "print(vm_test_ds)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Run tests\n", + "\n", + "Now that everything is setup for the embedding model, we can go ahead and run some tests to assess and document the quality of our embeddings. We will use the `validmind.model_validation.embeddings.*` tests to compute a variety of metrics against our model." 
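Several of the tests below are built on the cosine similarity between question embeddings. The following is a minimal sketch, not one of the notebook's cells, showing what that quantity looks like for two questions; the example questions are invented for illustration and the helper simply reuses the `embed` function defined above.

```python
import numpy as np


def cosine_similarity(a, b):
    """Cosine similarity between two embedding vectors (1.0 = same direction)."""
    a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))


# Illustrative questions only; `embed` is the function defined for the embedding model above.
e1 = embed({"question": "What is your data retention policy?"})
e2 = embed({"question": "How long do you retain customer data?"})
print(cosine_similarity(e1, e2))  # semantically similar questions should score close to 1.0
```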
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.embeddings.StabilityAnalysisRandomNoise\",\n", + " inputs={\n", + " \"model\": vm_embedder,\n", + " \"dataset\": vm_test_ds,\n", + " },\n", + " params={\"probability\": 0.3},\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.embeddings.StabilityAnalysisSynonyms\",\n", + " inputs={\n", + " \"model\": vm_embedder,\n", + " \"dataset\": vm_test_ds,\n", + " },\n", + " params={\"probability\": 0.3},\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.embeddings.StabilityAnalysisTranslation\",\n", + " inputs={\n", + " \"model\": vm_embedder,\n", + " \"dataset\": vm_test_ds,\n", + " },\n", + " params={\n", + " \"source_lang\": \"en\",\n", + " \"target_lang\": \"fr\",\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.embeddings.CosineSimilarityHeatmap\",\n", + " inputs={\n", + " \"model\": vm_embedder,\n", + " \"dataset\": vm_test_ds,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "run_test(\n", + " \"validmind.model_validation.embeddings.CosineSimilarityDistribution\",\n", + " inputs={\n", + " \"model\": vm_embedder,\n", + " \"dataset\": vm_test_ds,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.embeddings.EuclideanDistanceHeatmap\",\n", + " inputs={\n", + " \"model\": vm_embedder,\n", + " \"dataset\": vm_test_ds,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.embeddings.PCAComponentsPairwisePlots\",\n", + " inputs={\n", + " \"model\": vm_embedder,\n", + " \"dataset\": vm_test_ds,\n", + " },\n", + " params={\"n_components\": 3},\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.embeddings.TSNEComponentsPairwisePlots\",\n", + " inputs={\n", + " \"model\": vm_embedder,\n", + " \"dataset\": vm_test_ds,\n", + " },\n", + " params={\"n_components\": 3, \"perplexity\": 20},\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Setup Vector Store\n", + "\n", + "Great, so now that we have assessed our embedding model and verified that it is performing well, we can go ahead and use it to compute embeddings for our question-answer pairs in the \"train\" set. We will then use these embeddings to insert the question-answer pairs into a vector store. We will use an in-memory `qdrant` vector database for demo purposes but any option would work just as well here. We will use the `QdrantClient` class from LangChain to interact with the vector store. This class will allow us to insert and search for embeddings in the vector store." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Generate embeddings for the Train Set\n", + "\n", + "We can use the same `assign_predictions` method from earlier except this time we will use the `vm_train_ds` object to compute the embeddings for the question-answer pairs in the \"train\" set." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_train_ds.assign_predictions(vm_embedder)\n", + "print(vm_train_ds)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Insert embeddings and questions into Vector DB\n", + "\n", + "Now that we have computed the embeddings for our question-answer pairs in the \"train\" set, we can go ahead and insert them into the vector store:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_community.vectorstores import Qdrant\n", + "from langchain_openai import OpenAIEmbeddings\n", + "from langchain_community.document_loaders import DataFrameLoader\n", + "\n", + "# load documents from dataframe\n", + "loader = DataFrameLoader(train_df, page_content_column=\"question\")\n", + "docs = loader.load()\n", + "# choose model using embedding client\n", + "embedding_client = OpenAIEmbeddings(model=\"text-embedding-3-small\")\n", + "\n", + "# setup vector datastore\n", + "qdrant = Qdrant.from_documents(\n", + " docs,\n", + " embedding_client,\n", + " location=\":memory:\", # Local mode with in-memory storage only\n", + " collection_name=\"rfp_rag_collection\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Retrieval Model\n", + "\n", + "Now that we have an embedding model and a vector database setup and loaded with our data, we need a Retrieval model that can search for similar question-answer pairs for a given input question. Once created, we can initialize this as a ValidMind model and `assign_predictions` to it just like our embedding model." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "def retrieve(input):\n", + " contexts = []\n", + "\n", + " for result in qdrant.similarity_search_with_score(input[\"question\"]):\n", + " document, score = result\n", + " context = f\"Q: {document.page_content}\\n\"\n", + " context += f\"A: {document.metadata['ground_truth']}\\n\"\n", + "\n", + " contexts.append(context)\n", + "\n", + " return contexts\n", + "\n", + "\n", + "vm_retriever = vm.init_model(input_id=\"retrieval_model\", predict_fn=retrieve)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_ds.assign_predictions(model=vm_retriever)\n", + "print(vm_test_ds)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Generation Model\n", + "\n", + "As the final piece of this simple RAG pipeline, we can create and initialize a generation model that will use the retrieved context to generate an answer to the input question. We will use the `gpt-3.5-turbo` model from OpenAI." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from openai import OpenAI\n", + "\n", + "from validmind.models import Prompt\n", + "\n", + "\n", + "system_prompt = \"\"\"\n", + "You are an expert RFP AI assistant.\n", + "You are tasked with answering new RFP questions based on existing RFP questions and answers.\n", + "You will be provided with the existing RFP questions and answer pairs that are the most relevant to the new RFP question.\n", + "After that you will be provided with a new RFP question.\n", + "You will generate an answer and respond only with the answer.\n", + "Ignore your pre-existing knowledge and answer the question based on the provided context.\n", + "\"\"\".strip()\n", + "\n", + "openai_client = OpenAI()\n", + "\n", + "\n", + "def generate(input):\n", + " response = openai_client.chat.completions.create(\n", + " model=\"gpt-3.5-turbo\",\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": system_prompt},\n", + " {\"role\": \"user\", \"content\": \"\\n\\n\".join(input[\"retrieval_model\"])},\n", + " {\"role\": \"user\", \"content\": input[\"question\"]},\n", + " ],\n", + " )\n", + "\n", + " return response.choices[0].message.content\n", + "\n", + "\n", + "vm_generator = vm.init_model(\n", + " input_id=\"generation_model\",\n", + " predict_fn=generate,\n", + " prompt=Prompt(template=system_prompt),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's test it out real quick:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "vm_generator.predict(\n", + " pd.DataFrame(\n", + " {\"retrieval_model\": [[\"My name is anil\"]], \"question\": [\"what is my name\"]}\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Prompt Evaluation\n", + "\n", + "Now that we have our generator model initialized, we can run some LLM-as-Judge tests to evaluate the system prompt. This will allow us to get an initial sense of how well the prompt meets a few best practices for prompt engineering. These tests use an LLM to rate the prompt on a scale of 1-10 against the following criteria:\n", + "\n", + "- **Examplar Bias**: When using multi-shot prompting, does the prompt contain an unbiased distribution of examples?\n", + "- **Delimitation**: When using complex prompts containing examples, contextual information, or other elements, is the prompt formatted in such a way that each element is clearly separated?\n", + "- **Clarity**: How clearly the prompt states the task.\n", + "- **Conciseness**: How succinctly the prompt states the task.\n", + "- **Instruction Framing**: Whether the prompt contains negative instructions.\n", + "- **Specificity**: How specific the prompt defines the task." 
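All six prompt checks take the same inputs, so as an optional convenience you could run them in a single loop rather than one cell per test; the individual cells below are equivalent.

```python
# Optional: run all six prompt-validation tests in one loop (same results as the cells below).
for test_id in [
    "validmind.prompt_validation.Bias",
    "validmind.prompt_validation.Clarity",
    "validmind.prompt_validation.Conciseness",
    "validmind.prompt_validation.Delimitation",
    "validmind.prompt_validation.NegativeInstruction",
    "validmind.prompt_validation.Specificity",
]:
    run_test(test_id, inputs={"model": vm_generator}).log()
```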
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.prompt_validation.Bias\",\n", + " inputs={\n", + " \"model\": vm_generator,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.prompt_validation.Clarity\",\n", + " inputs={\n", + " \"model\": vm_generator,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.prompt_validation.Conciseness\",\n", + " inputs={\n", + " \"model\": vm_generator,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.prompt_validation.Delimitation\",\n", + " inputs={\n", + " \"model\": vm_generator,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.prompt_validation.NegativeInstruction\",\n", + " inputs={\n", + " \"model\": vm_generator,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.prompt_validation.Specificity\",\n", + " inputs={\n", + " \"model\": vm_generator,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Setup RAG Pipeline Model\n", + "\n", + "Now that we have all of our individual \"component\" models setup and initialized we need some way to put them all together in a single \"pipeline\". We can use the `PipelineModel` class to do this. This ValidMind model type simply wraps any number of other ValidMind models and runs them in sequence. We can use a pipe(`|`) operator - in Python this is normally an `or` operator but we have overloaded it for easy pipeline creation - to chain together our models. We can then initialize this pipeline model and assign predictions to it just like any other model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_rag_model = vm.init_model(vm_retriever | vm_generator, input_id=\"rag_model\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can `assign_predictions` to the pipeline model just like we did with the individual models. This will run the pipeline on the test set and store the results in the test set for later use." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_ds.assign_predictions(model=vm_rag_model)\n", + "print(vm_test_ds)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vm_test_ds._df.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Run tests" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## RAGAS evaluation\n", + "\n", + "Let's go ahead and run some of our new RAG tests against our model...\n", + "\n", + "> Note: these tests are still being developed and are not yet in a stable state. 
We are using advanced tests here that use LLM-as-Judge and other strategies to assess things like the relevancy of the retrieved context to the input question and the correctness of the generated answer when compared to the ground truth. There is more to come in this area so stay tuned!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Semantic Similarity\n", + "\n", + "The concept of Answer Semantic Similarity pertains to the assessment of the semantic resemblance between the generated answer and the ground truth. This evaluation is based on the ground truth and the answer, with values falling within the range of 0 to 1. A higher score signifies a better alignment between the generated answer and the ground truth.\n", + "\n", + "Measuring the semantic similarity between answers can offer valuable insights into the quality of the generated response. This evaluation utilizes a cross-encoder model to calculate the semantic similarity score." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.SemanticSimilarity\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " params={\n", + " \"response_column\": \"rag_model_prediction\",\n", + " \"reference_column\": \"ground_truth\",\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Context Entity Recall\n", + "\n", + "This test gives the measure of recall of the retrieved context, based on the number of entities present in both ground_truths and contexts relative to the number of entities present in the ground_truths alone. Simply put, it is a measure of what fraction of entities are recalled from ground_truths. This test is useful in fact-based use cases like tourism help desk, historical QA, etc. This test can help evaluate the retrieval mechanism for entities, based on comparison with entities present in ground_truths, because in cases where entities matter, we need the contexts which cover them." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.ContextEntityRecall\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " params={\n", + " \"reference_column\": \"ground_truth\",\n", + " \"retrieved_contexts_column\": \"retrieval_model_prediction\",\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Context Precision\n", + "\n", + "Context Precision is a test that evaluates whether all of the ground-truth relevant items present in the contexts are ranked higher or not. Ideally all the relevant chunks must appear at the top ranks. This test is computed using the question, ground_truth and the contexts, with values ranging between 0 and 1, where higher scores indicate better precision." 
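To make the ranking intuition concrete, here is a small illustrative sketch of the kind of rank-aware precision this metric is based on. It is not the ragas implementation, which uses an LLM to judge relevance; the binary relevance labels are assumed for the example.

```python
def rank_aware_precision(relevance):
    """Average precision@k over the positions of relevant chunks (1 = relevant, 0 = not)."""
    total_relevant = sum(relevance)
    if total_relevant == 0:
        return 0.0
    hits, score = 0, 0.0
    for k, rel in enumerate(relevance, start=1):
        if rel:
            hits += 1
            score += hits / k  # precision@k, counted only where a relevant chunk appears
    return score / total_relevant


print(rank_aware_precision([1, 1, 0, 1]))  # relevant chunks at ranks 1, 2, 4 -> ~0.92
print(rank_aware_precision([0, 0, 1, 1]))  # relevant chunks pushed down the ranking -> ~0.42
```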
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + "    \"validmind.model_validation.ragas.ContextPrecision\",\n", + "    inputs={\"dataset\": vm_test_ds},\n", + "    params={\n", + "        \"user_input_column\": \"question\",\n", + "        \"retrieved_contexts_column\": \"retrieval_model_prediction\",\n", + "        \"reference_column\": \"ground_truth\",\n", + "    },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Context Precision Without Reference\n", + "\n", + "This test evaluates whether retrieved contexts align well with the expected response for a given user input, without requiring a ground-truth reference. This test assesses the relevance of each retrieved context chunk by comparing it directly to the response." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + "    \"validmind.model_validation.ragas.ContextPrecisionWithoutReference\",\n", + "    inputs={\"dataset\": vm_test_ds},\n", + "    params={\n", + "        \"user_input_column\": \"question\",\n", + "        \"retrieved_contexts_column\": \"retrieval_model_prediction\",\n", + "        \"response_column\": \"rag_model_prediction\",\n", + "    },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Faithfulness\n", + "\n", + "This test measures the factual consistency of the generated answer against the given context. It is calculated from the answer and the retrieved context, and the score is scaled to the (0, 1) range; higher is better.\n", + "\n", + "The generated answer is regarded as faithful if all the claims made in the answer can be inferred from the given context. To calculate this, a set of claims from the generated answer is first identified. Each of these claims is then cross-checked against the given context to determine whether it can be inferred from that context." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + "    \"validmind.model_validation.ragas.Faithfulness\",\n", + "    inputs={\"dataset\": vm_test_ds},\n", + "    params={\n", + "        \"user_input_column\": \"question\",\n", + "        \"response_column\": \"rag_model_prediction\",\n", + "        \"retrieved_contexts_column\": \"retrieval_model_prediction\",\n", + "    },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Response Relevancy\n", + "\n", + "The Response Relevancy test focuses on assessing how pertinent the generated answer is to the given prompt. A lower score is assigned to answers that are incomplete or contain redundant information, and higher scores indicate better relevancy. This test is computed using the question, the context, and the answer.\n", + "\n", + "Response Relevancy is defined as the mean cosine similarity of the original question to a number of artificial questions, which are generated (reverse engineered) based on the answer.\n", + "\n", + "Please note that even though in practice the score will range between 0 and 1 most of the time, this is not mathematically guaranteed, because cosine similarity ranges from -1 to 1.\n", + "\n", + "> Note: This is a reference-free test. If you’re looking to compare the ground truth answer with the generated answer, refer to Answer Correctness.\n", + "\n", + "An answer is deemed relevant when it directly and appropriately addresses the original question. 
Importantly, our assessment of answer relevance does not consider factuality but instead penalizes cases where the answer lacks completeness or contains redundant details. To calculate this score, the LLM is prompted to generate an appropriate question for the generated answer multiple times, and the mean cosine similarity between these generated questions and the original question is measured. The underlying idea is that if the generated answer accurately addresses the initial question, the LLM should be able to generate questions from the answer that align with the original question." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.ResponseRelevancy\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " params={\n", + " \"user_input_column\": \"question\",\n", + " \"response_column\": \"rag_model_prediction\",\n", + " \"retrieved_contexts_column\": \"retrieval_model_prediction\",\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Context Recall\n", + "\n", + "Context recall measures the extent to which the retrieved context aligns with the annotated answer, treated as the ground truth. It is computed based on the ground truth and the retrieved context, and the values range between 0 and 1, with higher values indicating better performance.\n", + "\n", + "To estimate context recall from the ground truth answer, each sentence in the ground truth answer is analyzed to determine whether it can be attributed to the retrieved context or not. In an ideal scenario, all sentences in the ground truth answer should be attributable to the retrieved context." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.ContextRecall\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " params={\n", + " \"user_input_column\": \"question\",\n", + " \"retrieved_contexts_column\": \"retrieval_model_prediction\",\n", + " \"reference_column\": \"ground_truth\",\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Answer Correctness\n", + "\n", + "The assessment of Answer Correctness involves gauging the accuracy of the generated answer when compared to the ground truth. This evaluation relies on the ground truth and the answer, with scores ranging from 0 to 1. A higher score indicates a closer alignment between the generated answer and the ground truth, signifying better correctness.\n", + "\n", + "Answer correctness encompasses two critical aspects: semantic similarity between the generated answer and the ground truth, as well as factual similarity. These aspects are combined using a weighted scheme to formulate the answer correctness score.\n", + "\n", + "Factual correctness quantifies the factual overlap between the generated answer and the ground truth answer. This is done using the concepts of:\n", + "\n", + "- TP (True Positive): Facts or statements that are present in both the ground truth and the generated answer.\n", + "- FP (False Positive): Facts or statements that are present in the generated answer but not in the ground truth.\n", + "- FN (False Negative): Facts or statements that are present in the ground truth but not in the generated answer." 
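As a rough illustration of how these pieces combine, here is a sketch of a claim-level F1 blended with semantic similarity. The 0.75/0.25 weighting mirrors the ragas default but should be treated as an assumption, and the claim counts are invented for the example.

```python
def claim_f1(tp: int, fp: int, fn: int) -> float:
    """F1 over factual claims shared by the generated answer and the ground truth."""
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    return 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0


def answer_correctness(tp, fp, fn, semantic_similarity, w_factual=0.75, w_semantic=0.25):
    # Weighted blend of factual overlap and semantic similarity (weights are assumed defaults).
    return w_factual * claim_f1(tp, fp, fn) + w_semantic * semantic_similarity


# Example: 3 claims supported, 1 extra claim, 1 missing claim, semantic similarity 0.90
print(answer_correctness(tp=3, fp=1, fn=1, semantic_similarity=0.90))  # -> 0.7875
```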
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.AnswerCorrectness\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " params={\n", + " \"user_input_column\": \"question\",\n", + " \"response_column\": \"rag_model_prediction\",\n", + " \"reference_column\": \"ground_truth\",\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Aspect Critic\n", + "\n", + "This is designed to assess submissions based on predefined aspects such as harmlessness and correctness. Additionally, users have the flexibility to define their own aspects for evaluating submissions according to their specific criteria. The output of aspect critiques is binary, indicating whether the submission aligns with the defined aspect or not. This evaluation is performed using the ‘answer’ as input.\n", + "\n", + "Critiques within the LLM evaluators evaluate submissions based on the provided aspect. Ragas Critiques offers a range of predefined aspects like correctness, harmfulness, etc. Users can also define their own aspects for evaluating submissions based on their specific criteria. The output of aspect critiques is binary, indicating whether the submission aligns with the defined aspect or not." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.AspectCritic\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " params={\n", + " \"user_input_column\": \"question\",\n", + " \"response_column\": \"rag_model_prediction\",\n", + " \"retrieved_contexts_column\": \"retrieval_model_prediction\",\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Noise Sensitivity\n", + "\n", + "This test is designed to evaluate the robustness of the RAG pipeline model against noise in the retrieved context. It works by checking how well the \"claims\" in the generated answer match up with the \"claims\" in the ground truth answer. If the generated answer contains \"claims\" from the contexts that the ground truth answer does not contain, those claims are considered incorrect. The score for each answer is the number of incorrect claims divided by the total number of claims. This *can* be interpreted as a measure of how sensitive the LLM is to \"noise\" in the context where \"noise\" is information that is relevant but should not be included in the answer since the ground truth answer does not contain it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.ragas.NoiseSensitivity\",\n", + " inputs={\"dataset\": vm_test_ds},\n", + " params={\n", + " \"user_input_column\": \"question\",\n", + " \"response_column\": \"rag_model_prediction\",\n", + " \"reference_column\": \"ground_truth\",\n", + " \"retrieved_contexts_column\": \"retrieval_model_prediction\",\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Generation quality\n", + "\n", + "In this section, we evaluate the alignment and relevance of generated responses to reference outputs within our retrieval-augmented generation (RAG) application. 
We use metrics that assess various quality dimensions of the generated responses, including semantic similarity, structural alignment, and phrasing overlap. Semantic similarity metrics compare embeddings of generated and reference text to capture deeper contextual alignment, while overlap and alignment measures quantify how well the phrasing and structure of generated responses match the intended outputs." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Token Disparity\n", + "\n", + "This test assesses the difference in token counts between the reference texts (ground truth) and the answers generated by the RAG model. It helps evaluate how well the model's outputs align with the expected length and level of detail in the reference texts. A significant disparity in token counts could signal issues with generation quality, such as excessive verbosity or insufficient detail. Consistently low token counts in generated answers compared to references might suggest that the model’s outputs are incomplete or overly concise, missing important contextual information." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.TokenDisparity\",\n", + " inputs={\n", + " \"dataset\": vm_test_ds,\n", + " \"model\": vm_rag_model,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### ROUGE Score\n", + "\n", + "This test evaluates the quality of answers generated by the RAG model by measuring overlaps in n-grams, word sequences, and word pairs between the model output and the reference (ground truth) text. ROUGE, short for Recall-Oriented Understudy for Gisting Evaluation, assesses both precision and recall, providing a balanced view of how well the generated response captures the reference content. ROUGE precision measures the proportion of n-grams in the generated text that match the reference, highlighting relevance and conciseness, while ROUGE recall assesses the proportion of reference n-grams present in the generated text, indicating completeness and thoroughness. \n", + "\n", + "Low precision scores might reveal that the generated text includes redundant or irrelevant information, while low recall scores suggest omissions of essential details from the reference. Consistently low ROUGE scores could indicate poor overall alignment with the ground truth, suggesting the model may be missing key content or failing to capture the intended meaning." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.RougeScore\",\n", + " inputs={\n", + " \"dataset\": vm_test_ds,\n", + " \"model\": vm_rag_model,\n", + " },\n", + " params={\n", + " \"metric\": \"rouge-1\",\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### BLEU Score\n", + "\n", + "The BLEU Score test evaluates the quality of answers generated by the RAG model by measuring n-gram overlap between the generated text and the reference (ground truth) text, with a specific focus on exact precision in phrasing. 
While ROUGE precision also assesses overlap, BLEU differs in two main ways: first, it applies a geometric average across multiple n-gram levels, capturing precise phrase alignment, and second, it includes a brevity penalty to prevent overly short outputs from inflating scores artificially. This added precision focus is valuable in RAG applications where strict adherence to reference language is essential, as BLEU emphasizes the match to exact phrasing. In contrast, ROUGE precision evaluates general content overlap without penalizing brevity, offering a broader sense of content alignment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.BleuScore\",\n", + " inputs={\n", + " \"dataset\": vm_test_ds,\n", + " \"model\": vm_rag_model,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### BERT Score\n", + "\n", + "This test evaluates the quality of the RAG generated answers using BERT embeddings to measure precision, recall, and F1 scores based on semantic similarity, rather than exact n-gram matches as in BLEU and ROUGE. This approach captures contextual meaning, making it valuable when wording differs but the intended message closely aligns with the reference. In RAG applications, the BERT score is especially useful for ensuring that generated answers convey the reference text’s meaning, even if phrasing varies. Consistently low scores indicate a lack of semantic alignment, suggesting the model may miss or misrepresent key content. Low precision may reflect irrelevant or redundant details, while low recall can indicate omissions." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.BertScore\",\n", + " inputs={\n", + " \"dataset\": vm_test_ds,\n", + " \"model\": vm_rag_model,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### METEOR Score\n", + "\n", + "This test evaluates the quality of the generated answers by measuring alignment with the ground truth, emphasizing both accuracy and fluency. Unlike BLEU and ROUGE, which focus on n-gram matches, METEOR combines precision, recall, synonym matching, and word order, focusing at how well the generated text conveys meaning and reads naturally. This metric is especially useful for RAG applications where sentence structure and natural flow are crucial for clear communication. Lower scores may suggest alignment issues, indicating that the answers may lack fluency or key content. Discrepancies in word order or high fragmentation penalties can reveal problems with how the model constructs sentences, potentially affecting readability." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + " \"validmind.model_validation.MeteorScore\",\n", + " inputs={\n", + " \"dataset\": vm_test_ds,\n", + " \"model\": vm_rag_model,\n", + " },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Bias and Toxicity\n", + "\n", + "In this section, we use metrics like Toxicity Score and Regard Score to evaluate both the generated responses and the ground truth. 
These tests help us detect any harmful, offensive, or inappropriate language and evaluate the level of bias and neutrality, enabling us to assess and mitigate potential biases in both the model's responses and the original dataset." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Toxicity Score\n", + "\n", + "This test measures the level of harmful or offensive content in the generated answers. The test uses a preloaded toxicity detection tool from Hugging Face, which identifies language that may be inappropriate, aggressive, or derogatory. High toxicity scores indicate potentially toxic content, while consistently elevated scores across multiple outputs may signal underlying issues in the model’s generation process that require attention to prevent the spread of harmful language." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + "    \"validmind.model_validation.ToxicityScore\",\n", + "    inputs={\n", + "        \"dataset\": vm_test_ds,\n", + "        \"model\": vm_rag_model,\n", + "    },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Regard Score\n", + "\n", + "This test evaluates the sentiment and perceived regard—categorized as positive, negative, neutral, or other—in answers generated by the RAG model. This is important for identifying any biases or sentiment tendencies in responses, ensuring that generated answers are balanced and appropriate for the context. The test uses a preloaded regard evaluation tool from Hugging Face to compute scores for each response. High skewness in regard scores, especially if the generated responses consistently diverge from expected sentiments in the reference texts, may reveal biases in the model’s generation, such as overly positive or negative tones where neutrality is expected." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_test(\n", + "    \"validmind.model_validation.RegardScore\",\n", + "    inputs={\n", + "        \"dataset\": vm_test_ds,\n", + "        \"model\": vm_rag_model,\n", + "    },\n", + ").log()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Conclusion\n", + "\n", + "In this notebook, we have seen how we can use LangChain and ValidMind together to build, evaluate, and document a simple RAG Model as it's developed. This is a great example of the interactive development experience that ValidMind is designed to support. We can quickly iterate on our model and document as we go... We have seen how ValidMind supports non-traditional \"models\" using a functional interface and how we can build pipelines of many models to support complex GenAI workflows.\n", + "\n", + "This is still a work in progress and we are actively developing new tests to support more advanced GenAI workflows. We are also keeping an eye on the most popular GenAI models and libraries to explore direct integrations. Stay tuned for more updates and new features in this area!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Upgrade ValidMind\n", + "\n", + "
After installing ValidMind, you’ll want to periodically make sure you are on the latest version to access any new features and other enhancements.
\n", + "\n", + "Retrieve the information for the currently installed version of ValidMind:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip show validmind" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If the version returned is lower than the version indicated in our [production open-source code](https://github.com/validmind/validmind-library/blob/prod/validmind/__version__.py), restart your notebook and run:\n", + "\n", + "```bash\n", + "%pip install --upgrade validmind\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You may need to restart your kernel after running the upgrade package for changes to be applied." + ] + }, + { + "cell_type": "markdown", + "id": "copyright-113dfc22cae44dfbb1ffefe8372a664b", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "***\n", + "\n", + "Copyright © 2023-2026 ValidMind Inc. All rights reserved.
\n", + "Refer to [LICENSE](https://github.com/validmind/validmind-library/blob/main/LICENSE) for details.
\n", + "SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "validmind-py3.10", + "language": "python", + "name": "validmind-py3.10" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 } diff --git a/notebooks/use_cases/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb b/notebooks/use_cases/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb index ebdbaeae0..8ebb5b183 100644 --- a/notebooks/use_cases/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb +++ b/notebooks/use_cases/ongoing_monitoring/application_scorecard_ongoing_monitoring.ipynb @@ -225,7 +225,8 @@ " api_key = \"...\",\n", " api_secret = \"...\",\n", " model = \"...\",\n", - " monitoring = True\n", + " monitoring = True,\n", + " document=\"monitoring\",\n", ")" ] }, @@ -1318,7 +1319,7 @@ }, { "cell_type": "markdown", - "id": "copyright-4c5ddb7cde514c958ea0048f5e472de5", + "id": "copyright-9b204f07d6ef4508a249d0c7d5e7d44b", "metadata": {}, "source": [ "\n", diff --git a/notebooks/use_cases/ongoing_monitoring/quickstart_customer_churn_ongoing_monitoring.ipynb b/notebooks/use_cases/ongoing_monitoring/quickstart_customer_churn_ongoing_monitoring.ipynb index a5cabb9a9..8ca30216e 100644 --- a/notebooks/use_cases/ongoing_monitoring/quickstart_customer_churn_ongoing_monitoring.ipynb +++ b/notebooks/use_cases/ongoing_monitoring/quickstart_customer_churn_ongoing_monitoring.ipynb @@ -223,7 +223,8 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", - " monitoring = True\n", + " monitoring = True,\n", + " document=\"monitoring\",\n", ")" ] }, @@ -846,7 +847,7 @@ }, { "cell_type": "markdown", - "id": "copyright-c2ce5d3e3aa04c4ab7f6401ac810c67f", + "id": "copyright-a5f8f86b12d74c72a867a01b3aeb6da2", "metadata": {}, "source": [ "\n", diff --git a/notebooks/use_cases/regression/quickstart_regression_full_suite.ipynb b/notebooks/use_cases/regression/quickstart_regression_full_suite.ipynb index 91765950e..078fc73f0 100644 --- a/notebooks/use_cases/regression/quickstart_regression_full_suite.ipynb +++ b/notebooks/use_cases/regression/quickstart_regression_full_suite.ipynb @@ -193,6 +193,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"documentation\",\n", ")" ] }, @@ -546,7 +547,7 @@ }, { "cell_type": "markdown", - "id": "copyright-ad12b1c3e98d435ea8cc57eadf4f5a76", + "id": "copyright-e2b543078a614a038f27fa63a6c74297", "metadata": {}, "source": [ "\n", diff --git a/notebooks/use_cases/time_series/quickstart_time_series_full_suite.ipynb b/notebooks/use_cases/time_series/quickstart_time_series_full_suite.ipynb index 94b86b7a7..d130403e3 100644 --- a/notebooks/use_cases/time_series/quickstart_time_series_full_suite.ipynb +++ b/notebooks/use_cases/time_series/quickstart_time_series_full_suite.ipynb @@ -227,6 +227,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"documentation\",\n", ")" ] }, @@ -709,7 +710,7 @@ }, { "cell_type": "markdown", - "id": "copyright-3e1111bc137c44b9866472934320b128", + "id": "copyright-b9c8d56383ed44cfa943d32f12fc037c", "metadata": {}, "source": [ "\n", diff --git a/notebooks/use_cases/time_series/quickstart_time_series_high_code.ipynb 
b/notebooks/use_cases/time_series/quickstart_time_series_high_code.ipynb index 979c91048..a09cf81d7 100644 --- a/notebooks/use_cases/time_series/quickstart_time_series_high_code.ipynb +++ b/notebooks/use_cases/time_series/quickstart_time_series_high_code.ipynb @@ -228,6 +228,7 @@ " # api_key=\"...\",\n", " # api_secret=\"...\",\n", " # model=\"...\",\n", + " # document=\"documentation\",\n", ")" ] }, @@ -967,7 +968,7 @@ }, { "cell_type": "markdown", - "id": "copyright-0e30120b73cd445b92ff0637c3467c01", + "id": "copyright-4f0137500d9c49d3b6641ef3779aa140", "metadata": {}, "source": [ "\n", diff --git a/poetry.lock b/poetry.lock index dbc58f63c..b661b86e7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.3.2 and should not be changed by hand. [[package]] name = "aiodns" @@ -714,6 +714,10 @@ files = [ {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a37b8f0391212d29b3a91a799c8e4a2855e0576911cdfb2515487e30e322253d"}, {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:e84799f09591700a4154154cab9787452925578841a94321d5ee8fb9a9a328f0"}, {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f66b5337fa213f1da0d9000bc8dc0cb5b896b726eefd9c6046f699b169c41b9e"}, + {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:5dab0844f2cf82be357a0eb11a9087f70c5430b2c241493fc122bb6f2bb0917c"}, + {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e4fe605b917c70283db7dfe5ada75e04561479075761a0b3866c081d035b01c1"}, + {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:1e9a65b5736232e7a7f91ff3d02277f11d339bf34099a56cdab6a8b3410a02b2"}, + {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:58d4b711689366d4a03ac7957ab8c28890415e267f9b6589969e74b6e42225ec"}, {file = "Brotli-1.1.0-cp310-cp310-win32.whl", hash = "sha256:be36e3d172dc816333f33520154d708a2657ea63762ec16b62ece02ab5e4daf2"}, {file = "Brotli-1.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:0c6244521dda65ea562d5a69b9a26120769b7a9fb3db2fe9545935ed6735b128"}, {file = "Brotli-1.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:a3daabb76a78f829cafc365531c972016e4aa8d5b4bf60660ad8ecee19df7ccc"}, @@ -726,8 +730,14 @@ files = [ {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:19c116e796420b0cee3da1ccec3b764ed2952ccfcc298b55a10e5610ad7885f9"}, {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:510b5b1bfbe20e1a7b3baf5fed9e9451873559a976c1a78eebaa3b86c57b4265"}, {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:a1fd8a29719ccce974d523580987b7f8229aeace506952fa9ce1d53a033873c8"}, + {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c247dd99d39e0338a604f8c2b3bc7061d5c2e9e2ac7ba9cc1be5a69cb6cd832f"}, + {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:1b2c248cd517c222d89e74669a4adfa5577e06ab68771a529060cf5a156e9757"}, + {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:2a24c50840d89ded6c9a8fdc7b6ed3692ed4e86f1c4a4a938e1e92def92933e0"}, + {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f31859074d57b4639318523d6ffdca586ace54271a73ad23ad021acd807eb14b"}, {file = "Brotli-1.1.0-cp311-cp311-win32.whl", hash = 
"sha256:39da8adedf6942d76dc3e46653e52df937a3c4d6d18fdc94a7c29d263b1f5b50"}, {file = "Brotli-1.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:aac0411d20e345dc0920bdec5548e438e999ff68d77564d5e9463a7ca9d3e7b1"}, + {file = "Brotli-1.1.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:32d95b80260d79926f5fab3c41701dbb818fde1c9da590e77e571eefd14abe28"}, + {file = "Brotli-1.1.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b760c65308ff1e462f65d69c12e4ae085cff3b332d894637f6273a12a482d09f"}, {file = "Brotli-1.1.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:316cc9b17edf613ac76b1f1f305d2a748f1b976b033b049a6ecdfd5612c70409"}, {file = "Brotli-1.1.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:caf9ee9a5775f3111642d33b86237b05808dafcd6268faa492250e9b78046eb2"}, {file = "Brotli-1.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:70051525001750221daa10907c77830bc889cb6d865cc0b813d9db7fefc21451"}, @@ -738,8 +748,24 @@ files = [ {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:4093c631e96fdd49e0377a9c167bfd75b6d0bad2ace734c6eb20b348bc3ea180"}, {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:7e4c4629ddad63006efa0ef968c8e4751c5868ff0b1c5c40f76524e894c50248"}, {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:861bf317735688269936f755fa136a99d1ed526883859f86e41a5d43c61d8966"}, + {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:87a3044c3a35055527ac75e419dfa9f4f3667a1e887ee80360589eb8c90aabb9"}, + {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:c5529b34c1c9d937168297f2c1fde7ebe9ebdd5e121297ff9c043bdb2ae3d6fb"}, + {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:ca63e1890ede90b2e4454f9a65135a4d387a4585ff8282bb72964fab893f2111"}, + {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e79e6520141d792237c70bcd7a3b122d00f2613769ae0cb61c52e89fd3443839"}, {file = "Brotli-1.1.0-cp312-cp312-win32.whl", hash = "sha256:5f4d5ea15c9382135076d2fb28dde923352fe02951e66935a9efaac8f10e81b0"}, {file = "Brotli-1.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:906bc3a79de8c4ae5b86d3d75a8b77e44404b0f4261714306e3ad248d8ab0951"}, + {file = "Brotli-1.1.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:8bf32b98b75c13ec7cf774164172683d6e7891088f6316e54425fde1efc276d5"}, + {file = "Brotli-1.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:7bc37c4d6b87fb1017ea28c9508b36bbcb0c3d18b4260fcdf08b200c74a6aee8"}, + {file = "Brotli-1.1.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3c0ef38c7a7014ffac184db9e04debe495d317cc9c6fb10071f7fefd93100a4f"}, + {file = "Brotli-1.1.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:91d7cc2a76b5567591d12c01f019dd7afce6ba8cba6571187e21e2fc418ae648"}, + {file = "Brotli-1.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a93dde851926f4f2678e704fadeb39e16c35d8baebd5252c9fd94ce8ce68c4a0"}, + {file = "Brotli-1.1.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f0db75f47be8b8abc8d9e31bc7aad0547ca26f24a54e6fd10231d623f183d089"}, + {file = "Brotli-1.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6967ced6730aed543b8673008b5a391c3b1076d834ca438bbd70635c73775368"}, + {file = "Brotli-1.1.0-cp313-cp313-musllinux_1_2_i686.whl", hash = 
"sha256:7eedaa5d036d9336c95915035fb57422054014ebdeb6f3b42eac809928e40d0c"}, + {file = "Brotli-1.1.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:d487f5432bf35b60ed625d7e1b448e2dc855422e87469e3f450aa5552b0eb284"}, + {file = "Brotli-1.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:832436e59afb93e1836081a20f324cb185836c617659b07b129141a8426973c7"}, + {file = "Brotli-1.1.0-cp313-cp313-win32.whl", hash = "sha256:43395e90523f9c23a3d5bdf004733246fba087f2948f87ab28015f12359ca6a0"}, + {file = "Brotli-1.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:9011560a466d2eb3f5a6e4929cf4a09be405c64154e12df0dd72713f6500e32b"}, {file = "Brotli-1.1.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:a090ca607cbb6a34b0391776f0cb48062081f5f60ddcce5d11838e67a01928d1"}, {file = "Brotli-1.1.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2de9d02f5bda03d27ede52e8cfe7b865b066fa49258cbab568720aa5be80a47d"}, {file = "Brotli-1.1.0-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2333e30a5e00fe0fe55903c8832e08ee9c3b1382aacf4db26664a16528d51b4b"}, @@ -749,6 +775,10 @@ files = [ {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:fd5f17ff8f14003595ab414e45fce13d073e0762394f957182e69035c9f3d7c2"}, {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:069a121ac97412d1fe506da790b3e69f52254b9df4eb665cd42460c837193354"}, {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:e93dfc1a1165e385cc8239fab7c036fb2cd8093728cbd85097b284d7b99249a2"}, + {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_2_aarch64.whl", hash = "sha256:aea440a510e14e818e67bfc4027880e2fb500c2ccb20ab21c7a7c8b5b4703d75"}, + {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_2_i686.whl", hash = "sha256:6974f52a02321b36847cd19d1b8e381bf39939c21efd6ee2fc13a28b0d99348c"}, + {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_2_ppc64le.whl", hash = "sha256:a7e53012d2853a07a4a79c00643832161a910674a893d296c9f1259859a289d2"}, + {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_2_x86_64.whl", hash = "sha256:d7702622a8b40c49bffb46e1e3ba2e81268d5c04a34f460978c6b5517a34dd52"}, {file = "Brotli-1.1.0-cp36-cp36m-win32.whl", hash = "sha256:a599669fd7c47233438a56936988a2478685e74854088ef5293802123b5b2460"}, {file = "Brotli-1.1.0-cp36-cp36m-win_amd64.whl", hash = "sha256:d143fd47fad1db3d7c27a1b1d66162e855b5d50a89666af46e1679c496e8e579"}, {file = "Brotli-1.1.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:11d00ed0a83fa22d29bc6b64ef636c4552ebafcef57154b4ddd132f5638fbd1c"}, @@ -760,6 +790,10 @@ files = [ {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:919e32f147ae93a09fe064d77d5ebf4e35502a8df75c29fb05788528e330fe74"}, {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:23032ae55523cc7bccb4f6a0bf368cd25ad9bcdcc1990b64a647e7bbcce9cb5b"}, {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:224e57f6eac61cc449f498cc5f0e1725ba2071a3d4f48d5d9dffba42db196438"}, + {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:cb1dac1770878ade83f2ccdf7d25e494f05c9165f5246b46a621cc849341dc01"}, + {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:3ee8a80d67a4334482d9712b8e83ca6b1d9bc7e351931252ebef5d8f7335a547"}, + {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_2_ppc64le.whl", hash = "sha256:5e55da2c8724191e5b557f8e18943b1b4839b8efc3ef60d65985bcf6f587dd38"}, + {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = 
"sha256:d342778ef319e1026af243ed0a07c97acf3bad33b9f29e7ae6a1f68fd083e90c"}, {file = "Brotli-1.1.0-cp37-cp37m-win32.whl", hash = "sha256:587ca6d3cef6e4e868102672d3bd9dc9698c309ba56d41c2b9c85bbb903cdb95"}, {file = "Brotli-1.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:2954c1c23f81c2eaf0b0717d9380bd348578a94161a65b3a2afc62c86467dd68"}, {file = "Brotli-1.1.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:efa8b278894b14d6da122a72fefcebc28445f2d3f880ac59d46c90f4c13be9a3"}, @@ -772,6 +806,10 @@ files = [ {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:1ab4fbee0b2d9098c74f3057b2bc055a8bd92ccf02f65944a241b4349229185a"}, {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:141bd4d93984070e097521ed07e2575b46f817d08f9fa42b16b9b5f27b5ac088"}, {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:fce1473f3ccc4187f75b4690cfc922628aed4d3dd013d047f95a9b3919a86596"}, + {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:d2b35ca2c7f81d173d2fadc2f4f31e88cc5f7a39ae5b6db5513cf3383b0e0ec7"}, + {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:af6fa6817889314555aede9a919612b23739395ce767fe7fcbea9a80bf140fe5"}, + {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:2feb1d960f760a575dbc5ab3b1c00504b24caaf6986e2dc2b01c09c87866a943"}, + {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:4410f84b33374409552ac9b6903507cdb31cd30d2501fc5ca13d18f73548444a"}, {file = "Brotli-1.1.0-cp38-cp38-win32.whl", hash = "sha256:db85ecf4e609a48f4b29055f1e144231b90edc90af7481aa731ba2d059226b1b"}, {file = "Brotli-1.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:3d7954194c36e304e1523f55d7042c59dc53ec20dd4e9ea9d151f1b62b4415c0"}, {file = "Brotli-1.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5fb2ce4b8045c78ebbc7b8f3c15062e435d47e7393cc57c25115cfd49883747a"}, @@ -784,6 +822,10 @@ files = [ {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:949f3b7c29912693cee0afcf09acd6ebc04c57af949d9bf77d6101ebb61e388c"}, {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:89f4988c7203739d48c6f806f1e87a1d96e0806d44f0fba61dba81392c9e474d"}, {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:de6551e370ef19f8de1807d0a9aa2cdfdce2e85ce88b122fe9f6b2b076837e59"}, + {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:0737ddb3068957cf1b054899b0883830bb1fec522ec76b1098f9b6e0f02d9419"}, + {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:4f3607b129417e111e30637af1b56f24f7a49e64763253bbc275c75fa887d4b2"}, + {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:6c6e0c425f22c1c719c42670d561ad682f7bfeeef918edea971a79ac5252437f"}, + {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:494994f807ba0b92092a163a0a283961369a65f6cbe01e8891132b7a320e61eb"}, {file = "Brotli-1.1.0-cp39-cp39-win32.whl", hash = "sha256:f0d8a7a6b5983c2496e364b969f0e526647a06b075d034f3297dc66f3b360c64"}, {file = "Brotli-1.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:cdad5b9014d83ca68c25d2e9444e28e967ef16e80f6b436918c700c117a85467"}, {file = "Brotli-1.1.0.tar.gz", hash = "sha256:81de08ac11bcb85841e440c13611c00b67d3bf82698314928d0b676362546724"}, @@ -1043,11 +1085,11 @@ description = "Composable command line interface toolkit" optional = false python-versions = ">=3.7" groups = ["main", "dev"] -markers = "python_version < \"3.11\"" files = [ {file = 
"click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2"}, {file = "click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a"}, ] +markers = {main = "python_version < \"3.11\" and (extra == \"all\" or extra == \"nlp\" or extra == \"llm\" or extra == \"pii-detection\")", dev = "python_version < \"3.11\""} [package.dependencies] colorama = {version = "*", markers = "platform_system == \"Windows\""} @@ -1059,11 +1101,11 @@ description = "Composable command line interface toolkit" optional = false python-versions = ">=3.10" groups = ["main", "dev"] -markers = "python_version >= \"3.11\"" files = [ {file = "click-8.2.1-py3-none-any.whl", hash = "sha256:61a3265b914e850b85317d0b3109c7f8cd35a670f963866005d6ef1d5175a12b"}, {file = "click-8.2.1.tar.gz", hash = "sha256:27c491cc05d968d271d5a1db13e3b5a184636d9d930f148c50b038f0d0646202"}, ] +markers = {main = "python_version >= \"3.11\" and (extra == \"all\" or extra == \"nlp\" or extra == \"llm\" or extra == \"pii-detection\")", dev = "python_version >= \"3.11\""} [package.dependencies] colorama = {version = "*", markers = "platform_system == \"Windows\""} @@ -1819,7 +1861,7 @@ files = [ [package.dependencies] datasets = ">=2.0.0" dill = "*" -fsspec = {version = ">=2021.05.0", extras = ["http"]} +fsspec = {version = ">=2021.5.0", extras = ["http"]} huggingface-hub = ">=0.7.0" multiprocess = "*" numpy = ">=1.17" @@ -2219,6 +2261,8 @@ files = [ {file = "greenlet-3.2.4-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c2ca18a03a8cfb5b25bc1cbe20f3d9a4c80d8c3b13ba3df49ac3961af0b1018d"}, {file = "greenlet-3.2.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9fe0a28a7b952a21e2c062cd5756d34354117796c6d9215a87f55e38d15402c5"}, {file = "greenlet-3.2.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8854167e06950ca75b898b104b63cc646573aa5fef1353d4508ecdd1ee76254f"}, + {file = "greenlet-3.2.4-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f47617f698838ba98f4ff4189aef02e7343952df3a615f847bb575c3feb177a7"}, + {file = "greenlet-3.2.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:af41be48a4f60429d5cad9d22175217805098a9ef7c40bfef44f7669fb9d74d8"}, {file = "greenlet-3.2.4-cp310-cp310-win_amd64.whl", hash = "sha256:73f49b5368b5359d04e18d15828eecc1806033db5233397748f4ca813ff1056c"}, {file = "greenlet-3.2.4-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:96378df1de302bc38e99c3a9aa311967b7dc80ced1dcc6f171e99842987882a2"}, {file = "greenlet-3.2.4-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1ee8fae0519a337f2329cb78bd7a8e128ec0f881073d43f023c7b8d4831d5246"}, @@ -2228,6 +2272,8 @@ files = [ {file = "greenlet-3.2.4-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2523e5246274f54fdadbce8494458a2ebdcdbc7b802318466ac5606d3cded1f8"}, {file = "greenlet-3.2.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:1987de92fec508535687fb807a5cea1560f6196285a4cde35c100b8cd632cc52"}, {file = "greenlet-3.2.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:55e9c5affaa6775e2c6b67659f3a71684de4c549b3dd9afca3bc773533d284fa"}, + {file = "greenlet-3.2.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c9c6de1940a7d828635fbd254d69db79e54619f165ee7ce32fda763a9cb6a58c"}, + {file = "greenlet-3.2.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:03c5136e7be905045160b1b9fdca93dd6727b180feeafda6818e6496434ed8c5"}, {file = 
"greenlet-3.2.4-cp311-cp311-win_amd64.whl", hash = "sha256:9c40adce87eaa9ddb593ccb0fa6a07caf34015a29bf8d344811665b573138db9"}, {file = "greenlet-3.2.4-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:3b67ca49f54cede0186854a008109d6ee71f66bd57bb36abd6d0a0267b540cdd"}, {file = "greenlet-3.2.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ddf9164e7a5b08e9d22511526865780a576f19ddd00d62f8a665949327fde8bb"}, @@ -2237,6 +2283,8 @@ files = [ {file = "greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0"}, {file = "greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0"}, {file = "greenlet-3.2.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:20fb936b4652b6e307b8f347665e2c615540d4b42b3b4c8a321d8286da7e520f"}, + {file = "greenlet-3.2.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ee7a6ec486883397d70eec05059353b8e83eca9168b9f3f9a361971e77e0bcd0"}, + {file = "greenlet-3.2.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:326d234cbf337c9c3def0676412eb7040a35a768efc92504b947b3e9cfc7543d"}, {file = "greenlet-3.2.4-cp312-cp312-win_amd64.whl", hash = "sha256:a7d4e128405eea3814a12cc2605e0e6aedb4035bf32697f72deca74de4105e02"}, {file = "greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31"}, {file = "greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945"}, @@ -2246,6 +2294,8 @@ files = [ {file = "greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671"}, {file = "greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b"}, {file = "greenlet-3.2.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d25c5091190f2dc0eaa3f950252122edbbadbb682aa7b1ef2f8af0f8c0afefae"}, + {file = "greenlet-3.2.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6e343822feb58ac4d0a1211bd9399de2b3a04963ddeec21530fc426cc121f19b"}, + {file = "greenlet-3.2.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ca7f6f1f2649b89ce02f6f229d7c19f680a6238af656f61e0115b24857917929"}, {file = "greenlet-3.2.4-cp313-cp313-win_amd64.whl", hash = "sha256:554b03b6e73aaabec3745364d6239e9e012d64c68ccd0b8430c64ccc14939a8b"}, {file = "greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0"}, {file = "greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f"}, @@ -2253,6 +2303,8 @@ files = [ {file = "greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1"}, {file = "greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735"}, {file = "greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337"}, + {file = "greenlet-3.2.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = 
"sha256:2917bdf657f5859fbf3386b12d68ede4cf1f04c90c3a6bc1f013dd68a22e2269"}, + {file = "greenlet-3.2.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:015d48959d4add5d6c9f6c5210ee3803a830dce46356e3bc326d6776bde54681"}, {file = "greenlet-3.2.4-cp314-cp314-win_amd64.whl", hash = "sha256:e37ab26028f12dbb0ff65f29a8d3d44a765c61e729647bf2ddfbbed621726f01"}, {file = "greenlet-3.2.4-cp39-cp39-macosx_11_0_universal2.whl", hash = "sha256:b6a7c19cf0d2742d0809a4c05975db036fdff50cd294a93632d6a310bf9ac02c"}, {file = "greenlet-3.2.4-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:27890167f55d2387576d1f41d9487ef171849ea0359ce1510ca6e06c8bece11d"}, @@ -2262,6 +2314,8 @@ files = [ {file = "greenlet-3.2.4-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c9913f1a30e4526f432991f89ae263459b1c64d1608c0d22a5c79c287b3c70df"}, {file = "greenlet-3.2.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b90654e092f928f110e0007f572007c9727b5265f7632c2fa7415b4689351594"}, {file = "greenlet-3.2.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:81701fd84f26330f0d5f4944d4e92e61afe6319dcd9775e39396e39d7c3e5f98"}, + {file = "greenlet-3.2.4-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:28a3c6b7cd72a96f61b0e4b2a36f681025b60ae4779cc73c1535eb5f29560b10"}, + {file = "greenlet-3.2.4-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:52206cd642670b0b320a1fd1cbfd95bca0e043179c1d8a045f2c6109dfe973be"}, {file = "greenlet-3.2.4-cp39-cp39-win32.whl", hash = "sha256:65458b409c1ed459ea899e939f0e1cdb14f58dbc803f2f93c5eab5694d32671b"}, {file = "greenlet-3.2.4-cp39-cp39-win_amd64.whl", hash = "sha256:d2e685ade4dafd447ede19c31277a224a239a0a1a4eca4e6390efedf20260cfb"}, {file = "greenlet-3.2.4.tar.gz", hash = "sha256:0dca0d95ff849f9a364385f36ab49f50065d76964944638be9691e1832e9f86d"}, @@ -3054,7 +3108,7 @@ fqdn = {version = "*", optional = true, markers = "extra == \"format-nongpl\""} idna = {version = "*", optional = true, markers = "extra == \"format-nongpl\""} isoduration = {version = "*", optional = true, markers = "extra == \"format-nongpl\""} jsonpointer = {version = ">1.13", optional = true, markers = "extra == \"format-nongpl\""} -jsonschema-specifications = ">=2023.03.6" +jsonschema-specifications = ">=2023.3.6" referencing = ">=0.28.4" rfc3339-validator = {version = "*", optional = true, markers = "extra == \"format-nongpl\""} rfc3986-validator = {version = ">0.1.0", optional = true, markers = "extra == \"format-nongpl\""} @@ -8904,6 +8958,12 @@ files = [ {file = "statsmodels-0.14.5-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5a085d47c8ef5387279a991633883d0e700de2b0acc812d7032d165888627bef"}, {file = "statsmodels-0.14.5-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9f866b2ebb2904b47c342d00def83c526ef2eb1df6a9a3c94ba5fe63d0005aec"}, {file = "statsmodels-0.14.5-cp313-cp313-win_amd64.whl", hash = "sha256:2a06bca03b7a492f88c8106103ab75f1a5ced25de90103a89f3a287518017939"}, + {file = "statsmodels-0.14.5-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:07c4dad25bbb15864a31b4917a820f6d104bdc24e5ddadcda59027390c3bed9e"}, + {file = "statsmodels-0.14.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:babb067c852e966c2c933b79dbb5d0240919d861941a2ef6c0e13321c255528d"}, + {file = "statsmodels-0.14.5-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:110194b137286173cc676d7bad0119a197778de6478fc6cbdc3b33571165ac1e"}, + {file = 
"statsmodels-0.14.5-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9c8a9c384a60c80731b278e7fd18764364c8817f4995b13a175d636f967823d1"}, + {file = "statsmodels-0.14.5-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:557df3a870a57248df744fdfcc444ecbc5bdbf1c042b8a8b5d8e3e797830dc2a"}, + {file = "statsmodels-0.14.5-cp314-cp314-win_amd64.whl", hash = "sha256:95af7a9c4689d514f4341478b891f867766f3da297f514b8c4adf08f4fa61d03"}, {file = "statsmodels-0.14.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b23b8f646dd78ef5e8d775d879208f8dc0a73418b41c16acac37361ff9ab7738"}, {file = "statsmodels-0.14.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4e5e26b21d2920905764fb0860957d08b5ba2fae4466ef41b1f7c53ecf9fc7fa"}, {file = "statsmodels-0.14.5-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4a060c7e0841c549c8ce2825fd6687e6757e305d9c11c9a73f6c5a0ce849bb69"}, @@ -10349,4 +10409,4 @@ xgboost = ["xgboost"] [metadata] lock-version = "2.1" python-versions = ">=3.9,<3.13" -content-hash = "ed3612700d61dbd64bf4850986c5cc7fd527d82ffd448cd00a2d02a00356cc3a" +content-hash = "003c831ae251049b5e75e935ddb8979b9e53afbe2b23d09cc9335911f4016bdd" diff --git a/pyproject.toml b/pyproject.toml index b221c85a2..ee24a9e67 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "validmind" -version = "2.11.3" +version = "2.12.0" description = "ValidMind Library" readme = "README.pypi.md" requires-python = ">=3.9,<3.13" @@ -44,7 +44,7 @@ all = [ "sentencepiece (>=0.2.0,<0.3.0)", "langchain-openai (>=0.1.8)", "scipy", - "statsmodels", + "statsmodels (>=0.14.2,<0.15.0)", "langdetect", "nltk (>=3.8.1,<4.0.0)", "textblob (>=0.18.0.post0,<0.19.0)", @@ -78,7 +78,7 @@ nlp = [ "pyarrow (<16)", ] pytorch = ["torch (>=2.0.0)"] -stats = ["scipy", "statsmodels", "arch"] +stats = ["scipy", "statsmodels (>=0.14.2,<0.15.0)", "arch"] xgboost = ["xgboost (>=1.5.2,<3)"] explainability = ["shap (>=0.46.0)"] credit_risk = ["scorecardpy==0.1.9.6"] diff --git a/tests/test_api_client.py b/tests/test_api_client.py index 94b24ccde..8232baead 100644 --- a/tests/test_api_client.py +++ b/tests/test_api_client.py @@ -19,7 +19,6 @@ MissingModelIdError, APIRequestError, ) -from validmind.utils import md_to_html from validmind.vm_models.figure import Figure @@ -91,6 +90,49 @@ def test_init_successful(self, mock_requests_get): }, ) + @patch("validmind.api_client.logger.error") + @patch("requests.get") + def test_init_warns_when_document_is_missing( + self, mock_requests_get, mock_logger_error + ): + mock_data = { + "project": {"name": "test_project", "cuid": os.environ["VM_API_MODEL"]} + } + mock_response = Mock(status_code=200, json=Mock(return_value=mock_data)) + mock_requests_get.return_value = mock_response + + api_client.init() + + mock_logger_error.assert_called_once_with( + "Future releases will require `document` as one of the options you must provide to `vm.init()`. 
" + "To learn more, refer to https://docs.validmind.ai/developer/validmind-library.html" + ) + + @patch("validmind.api_client.logger.error") + @patch("requests.get") + def test_init_no_warning_when_document_is_passed( + self, mock_requests_get, mock_logger_error + ): + mock_data = { + "project": {"name": "test_project", "cuid": os.environ["VM_API_MODEL"]} + } + mock_response = Mock(status_code=200, json=Mock(return_value=mock_data)) + mock_requests_get.return_value = mock_response + + api_client.init(document="documentation") + + mock_logger_error.assert_not_called() + mock_requests_get.assert_called_once_with( + url=f"{os.environ['VM_API_HOST']}/ping", + headers={ + "X-API-KEY": os.environ["VM_API_KEY"], + "X-API-SECRET": os.environ["VM_API_SECRET"], + "X-MODEL-CUID": os.environ["VM_API_MODEL"], + "X-MONITORING": "False", + "X-DOCUMENT-TYPE": "documentation", + }, + ) + def test_get_api_host(self): host = api_client.get_api_host() self.assertEqual(host, "your_api_host") diff --git a/validmind/__version__.py b/validmind/__version__.py index ddcf716b7..95a6d3a79 100644 --- a/validmind/__version__.py +++ b/validmind/__version__.py @@ -1 +1 @@ -__version__ = "2.11.3" +__version__ = "2.12.0" diff --git a/validmind/api_client.py b/validmind/api_client.py index 87b00d5f0..444a1765f 100644 --- a/validmind/api_client.py +++ b/validmind/api_client.py @@ -31,6 +31,7 @@ _api_secret = os.getenv("VM_API_SECRET") _api_host = os.getenv("VM_API_HOST") _model_cuid = os.getenv("VM_API_MODEL") +_document = None _monitoring = False __api_session: Optional[aiohttp.ClientSession] = None @@ -67,12 +68,15 @@ def get_api_model() -> Optional[str]: def _get_api_headers() -> Dict[str, str]: - return { + headers = { "X-API-KEY": _api_key, "X-API-SECRET": _api_secret, "X-MODEL-CUID": _model_cuid, "X-MONITORING": str(_monitoring), } + if _document: + headers["X-DOCUMENT-TYPE"] = _document + return headers def _get_session() -> aiohttp.ClientSession: @@ -194,6 +198,7 @@ def init( model: Optional[str] = None, monitoring: bool = False, generate_descriptions: Optional[bool] = None, + document: Optional[str] = None, ): """ Initializes the API client instances and calls the /ping endpoint to ensure @@ -209,11 +214,12 @@ def init( api_secret (str, optional): The API secret. Defaults to None. api_host (str, optional): The API host. Defaults to None. monitoring (bool): The ongoing monitoring flag. Defaults to False. - generate_descriptions (bool): Whether to use GenAI to generate test result descriptions. Defaults to True. + generate_descriptions (bool, optional): Whether to use GenAI to generate test result descriptions. Defaults to True. + document (str, optional): The name of the document. Omitting this argument is deprecated. Raises: ValueError: If the API key and secret are not provided """ - global _api_key, _api_secret, _api_host, _model_cuid, _monitoring + global _api_key, _api_secret, _api_host, _model_cuid, _monitoring, _document if api_key == "...": # special case to detect when running a notebook placeholder (...) @@ -238,6 +244,13 @@ def init( if generate_descriptions is not None: os.environ["VALIDMIND_LLM_DESCRIPTIONS_ENABLED"] = str(generate_descriptions) + if document is None: + logger.error( + "Future releases will require `document` as one of the options you must provide to `vm.init()`. " + "To learn more, refer to https://docs.validmind.ai/developer/validmind-library.html" + ) + + _document = document reload()