diff --git a/README.md b/README.md index 373f3385..117eeb15 100644 --- a/README.md +++ b/README.md @@ -17,17 +17,17 @@ Simplified Data Quality checking at Scale for PySpark Workloads on streaming and # Documentation -The full documentation is available at: [https://databrickslabs.github.io/dqx/](https://databrickslabs.github.io/dqx/) +The complete documentation is available at: [https://databrickslabs.github.io/dqx/](https://databrickslabs.github.io/dqx/) # Contribution -See contribution guidance [here](https://databrickslabs.github.io/dqx/docs/dev/contributing/) on how to contribute to the project (build, test, and submit a PR). +Please see the contribution guidance [here](https://databrickslabs.github.io/dqx/docs/dev/contributing/) on how to contribute to the project (build, test, and submit a PR). # Project Support Please note that this project is provided for your exploration only and is not formally supported by Databricks with Service Level Agreements (SLAs). They are -provided AS-IS, and we do not make any guarantees of any kind. Please do not +provided AS-IS, and we do not make any guarantees. Please do not submit a support ticket relating to any issues arising from the use of this project. Any issues discovered through the use of this project should be filed as GitHub diff --git a/demos/dqx_demo_library.py b/demos/dqx_demo_library.py index d98ad973..5f5006cc 100644 --- a/demos/dqx_demo_library.py +++ b/demos/dqx_demo_library.py @@ -41,7 +41,8 @@ print(yaml.safe_dump(summary_stats)) print(profiles) -# generate DQX quality rules/checks +# generate DQX quality rules/checks candidates +# they should be manually reviewed before being applied to the data generator = DQGenerator(ws) checks = generator.generate_dq_rules(profiles) # with default level "error" print(yaml.safe_dump(checks)) @@ -152,7 +153,7 @@ - criticality: error check: - function: value_is_in_list + function: is_in_list arguments: col_name: col1 allowed: @@ -185,7 +186,7 @@ # COMMAND ---------- -from databricks.labs.dqx.col_functions import is_not_null, is_not_null_and_not_empty, value_is_in_list +from databricks.labs.dqx.col_functions import is_not_null, is_not_null_and_not_empty, is_in_list from databricks.labs.dqx.engine import DQEngine, DQRule, DQRuleColSet from databricks.sdk import WorkspaceClient @@ -201,7 +202,7 @@ check=is_not_null_and_not_empty("col4")), DQRule( # name for the check auto-generated if not provided criticality="error", - check=value_is_in_list("col1", ["1", "2"])) + check=is_in_list("col1", ["1", "2"])) ] + DQRuleColSet( # define rule for multiple columns at once columns=["col1", "col2"], criticality="error", @@ -254,7 +255,7 @@ - dropoff_latitude criticality: warn - check: - function: not_less_than + function: is_not_less_than arguments: col_name: trip_distance limit: 1 @@ -267,7 +268,7 @@ name: pickup_datetime_greater_than_dropoff_datetime criticality: error - check: - function: not_in_future + function: is_not_in_future arguments: col_name: pickup_datetime name: pickup_datetime_not_in_future @@ -357,7 +358,8 @@ def ends_with_foo(col_name: str) -> Column: dq_engine = DQEngine(WorkspaceClient()) custom_check_functions = {"ends_with_foo": ends_with_foo} -#custom_check_functions=globals() # include all functions for simplicity +# or include all functions with globals() for simplicity +#custom_check_functions=globals() valid_and_quarantined_df = dq_engine.apply_checks_by_metadata(input_df, checks, custom_check_functions) display(valid_and_quarantined_df) diff --git a/demos/dqx_demo_tool.py 
b/demos/dqx_demo_tool.py index f0f271cb..e1cbac64 100644 --- a/demos/dqx_demo_tool.py +++ b/demos/dqx_demo_tool.py @@ -65,7 +65,7 @@ # MAGIC # MAGIC You can also start the profiler by navigating to the Databricks Workflows UI. # MAGIC -# MAGIC Note that using the profiler is optional. It is usually one-time operation and not a scheduled activity. +# MAGIC Note that using the profiler is optional. It is usually a one-time operation and not a scheduled activity. The generated check candidates should be manually reviewed before being applied to the data. # COMMAND ---------- @@ -135,7 +135,7 @@ - dropoff_latitude criticality: error - check: - function: not_less_than + function: is_not_less_than arguments: col_name: trip_distance limit: 1 @@ -148,7 +148,7 @@ name: pickup_datetime_greater_than_dropoff_datetime criticality: error - check: - function: not_in_future + function: is_not_in_future arguments: col_name: pickup_datetime name: pickup_datetime_not_in_future @@ -206,7 +206,7 @@ # MAGIC %md # MAGIC ### Save quarantined data to Unity Catalog table # MAGIC -# MAGIC Note: In this demo, we only save the quarantined data and omit the output. This is because the dashboards use only quarantined data as their input. Therefore, saving the output data is unnecessary in this demo. If you apply checks to flag invalid records without quarantining them (e.g. using the apply check methods without the split), ensure that the `quarantine_table` field in your run config is set to the same value as the `output_table` field. +# MAGIC Note: In this demo, we only save the quarantined data and omit the output. This is because the dashboard uses only quarantined data as its input. Therefore, saving the output data is unnecessary in this demo. If you apply checks to flag invalid records without quarantining them (e.g. using the apply check methods without the split), ensure that the `quarantine_table` field in your run config is set to the same value as the `output_table` field. # MAGIC # COMMAND ---------- @@ -222,7 +222,7 @@ # COMMAND ---------- # MAGIC %md -# MAGIC ### View data quality in DQX Dashboards +# MAGIC ### View data quality in DQX Dashboard # COMMAND ---------- diff --git a/demos/dqx_dlt_demo.py b/demos/dqx_dlt_demo.py index 87c02aae..50015d57 100644 --- a/demos/dqx_dlt_demo.py +++ b/demos/dqx_dlt_demo.py @@ -64,21 +64,21 @@ def bronze(): criticality: "error" - check: - function: "not_in_future" + function: "is_not_in_future" arguments: col_name: "pickup_datetime" name: "pickup_datetime_isnt_in_range" criticality: "warn" - check: - function: "not_in_future" + function: "is_not_in_future" arguments: col_name: "pickup_datetime" name: "pickup_datetime_not_in_future" criticality: "warn" - check: - function: "not_in_future" + function: "is_not_in_future" arguments: col_name: "dropoff_datetime" name: "dropoff_datetime_not_in_future" diff --git a/docs/dqx/docs/demos.mdx b/docs/dqx/docs/demos.mdx index 696f898d..5c266d31 100644 --- a/docs/dqx/docs/demos.mdx +++ b/docs/dqx/docs/demos.mdx @@ -2,12 +2,17 @@ sidebar_position: 4 --- +import Admonition from '@theme/Admonition'; + # Demos -Install the [installation](/docs/installation) framework, and import the following notebooks in the Databricks workspace to try it out: +Import the following notebooks in the Databricks workspace to try DQX out: * [DQX Demo Notebook (library)](https://github.com/databrickslabs/dqx/blob/main/demos/dqx_demo_library.py) - demonstrates how to use DQX as a library.
* [DQX Demo Notebook (tool)](https://github.com/databrickslabs/dqx/blob/main/demos/dqx_demo_tool.py) - demonstrates how to use DQX as a tool when installed in the workspace. * [DQX DLT Demo Notebook](https://github.com/databrickslabs/dqx/blob/main/demos/dqx_dlt_demo.py) - demonstrates how to use DQX with Delta Live Tables (DLT). -Note that DQX don't have to be run from a Notebook. You can run it from any Python script as long as it runs on Databricks. -For example, you can add DQX as a library to your job or cluster. \ No newline at end of file + +You don't have to run DQX from a Notebook. DQX can be run from any Python script as long as it runs on Databricks. +For example, you can run it from a Databricks job by adding DQX as a dependent library. +DQX also comes with a set of command-line tools for running DQX jobs (see the [User Guide](/docs/guide)). + \ No newline at end of file diff --git a/docs/dqx/docs/dev/contributing.mdx b/docs/dqx/docs/dev/contributing.mdx index 46bb2022..ccc2adb1 100644 --- a/docs/dqx/docs/dev/contributing.mdx +++ b/docs/dqx/docs/dev/contributing.mdx @@ -1,3 +1,5 @@ +import Admonition from '@theme/Admonition'; + # Contributing ## First Principles @@ -7,7 +9,7 @@ development. There are several reasons why this approach is encouraged: - Standard libraries are typically well-vetted, thoroughly tested, and maintained by the official maintainers of the programming language or platform. This ensures a higher level of stability and reliability. -- External dependencies, especially lesser-known or unmaintained ones, can introduce bugs, security vulnerabilities, or compatibility issues that can be challenging to resolve. Adding external dependencies increases the complexity of your codebase. +- External dependencies, especially lesser-known or unmaintained ones, can introduce bugs, security vulnerabilities, or compatibility issues that can be challenging to resolve. Adding external dependencies increases the complexity of your codebase. - Each dependency may have its own set of dependencies, potentially leading to a complex web of dependencies that can be difficult to manage. This complexity can lead to maintenance challenges, increased risk, and longer build times. - External dependencies can pose security risks. If a library or package has known security vulnerabilities and is widely used, it becomes an attractive target for attackers. Minimizing external dependencies reduces the potential attack surface and makes it easier to keep your code secure. - Relying on standard libraries enhances code portability. It ensures your code can run on different platforms and environments without being tightly coupled to specific external dependencies. This is particularly important in settings like Databricks, where you may need to run your code on different clusters or setups. @@ -21,26 +23,26 @@ or specialized functionality unavailable in standard libraries. ## First contribution -If you're interested in contributing, please create a PR, reach out to us or open an issue to discuss your ideas. +If you're interested in contributing, please create a PR, contact us, or open an issue to discuss your ideas. Here are the example steps to submit your first contribution: -1. Fork the repo. You can also create a branch if you are added as writer to the repo. -2. The locally: `git clone` +1. Fork the [DQX](https://github.com/databrickslabs/dqx) repo. You can also create a branch if you are added as a writer to the repo. +2. Clone the repo locally: `git clone` 3. 
`git checkout main` (or `gcm` if you're using [ohmyzsh](https://ohmyz.sh/)). 4. `git pull` (or `gl` if you're using [ohmyzsh](https://ohmyz.sh/)). 5. `git checkout -b FEATURENAME` (or `gcb FEATURENAME` if you're using [ohmyzsh](https://ohmyz.sh/)). 6. .. do the work 7. `make fmt` 8. `make lint` -9. .. fix if any issues reported +9. .. fix if any issues are reported 10. `make test` and `make integration`, and optionally `make coverage` (generate coverage report) 11. .. fix if any issues reported 12. `git commit -S -a -m "message"` Make sure to enter a meaningful commit message title. You need to sign commits with your GPG key (hence -S option). - To setup GPG key in your Github account follow [these instructions](https://docs.github.com/en/github/authenticating-to-github/managing-commit-signature-verification). + To set up GPG key in your Github account, follow [these instructions](https://docs.github.com/en/github/authenticating-to-github/managing-commit-signature-verification). You can configure Git to sign all commits with your GPG key by default: `git config --global commit.gpgsign true` If you have not signed your commits initially, you can re-apply all of them and sign as follows: @@ -51,7 +53,7 @@ Here are the example steps to submit your first contribution: ``` 13. `git push origin FEATURENAME` - To access the repository, you must use the HTTPS remote with a personal access token or SSH with an SSH key and passphrase that has been authorized for `databrickslabs` organization. + To access the repository, you must use the HTTPS remote with a personal access token or SSH with an SSH key and passphrase that has been authorized for the `databrickslabs` organization. 14. Go to GitHub UI and create PR. Alternatively, `gh pr create` (if you have [GitHub CLI](https://cli.github.com/) installed). Use a meaningful pull request title because it'll appear in the release notes. Use `Resolves #NUMBER` in pull request description to [automatically link it](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/using-keywords-in-issues-and-pull-requests#linking-a-pull-request-to-an-issue) @@ -59,7 +61,7 @@ Here are the example steps to submit your first contribution: ## Local Setup -This section provides a step-by-step guide to set up and start working on the project. These steps will help you set up your project environment and dependencies for efficient development. +This section provides a step-by-step guide for setting up and starting work on the project. These steps will help you set up your project environment and dependencies for efficient development. {/* Go through the [prerequisites](./README.md#prerequisites) and clone the [dqx github repo](https://github.com/databrickslabs/dqx). */} @@ -81,7 +83,7 @@ make fmt ``` Before every commit, run automated bug detector and unit tests to ensure that automated -pull request checks do pass, before your code is reviewed by others: +pull request checks do pass before your code is reviewed by others: ```shell make lint make test @@ -91,7 +93,7 @@ make test Integration tests and code coverage are run automatically when you create a Pull Request in Github. You can also trigger the tests from a local machine by configuring authentication to a Databricks workspace. -You can use any Unity Catalog enabled Databricks workspace. +You can use any Unity Catalog-enabled Databricks workspace. 
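For instance, before running `make integration` you can confirm that your local authentication works with a short sanity check using the Databricks SDK (a minimal sketch, assuming `databricks-sdk` is installed and credentials are available via environment variables such as `DATABRICKS_HOST` or a configuration profile):

```python
# Optional sanity check before running integration tests (illustrative only).
# Assumes workspace credentials are configured in the environment or ~/.databrickscfg.
from databricks.sdk import WorkspaceClient

ws = WorkspaceClient()  # picks up host and credentials from the environment or a profile
print(ws.current_user.me().user_name)  # fails fast if authentication is not set up
```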
#### Using terminal @@ -117,13 +119,13 @@ Run integration tests with the following command: make integration ``` -Calculate test coverage and display report in html: +Calculate test coverage and display report in HTML: ```shell make coverage ``` #### Using IDE -If you want to run integration tests from your IDE, you must setup `.env` or `~/.databricks/debug-env.json` file +If you want to run integration tests from your IDE, you must set `.env` or `~/.databricks/debug-env.json` file (see [instructions](https://github.com/databrickslabs/pytester?tab=readme-ov-file#debug_env_name-fixture)). The name of the debug environment that you must define is `ws` (see `debug_env_name` fixture in the `conftest.py`). @@ -140,7 +142,7 @@ Create the `~/.databricks/debug-env.json` with the following content, replacing } } ``` -You must provide an existing cluster which will be auto-started for you as part of the tests. +You must provide an existing cluster that will auto-start for you as part of the tests. We recommend using [OAuth access token](https://docs.databricks.com/en/dev-tools/auth/oauth-m2m.html) generated for a service principal to authenticate with Databricks as presented above. Alternatively, you can authenticate using [PAT token](https://docs.databricks.com/en/dev-tools/auth/pat.html) by providing the `DATABRICKS_TOKEN` field. However, we do not recommend this method, as it is less secure than OAuth. @@ -160,11 +162,11 @@ To run integration tests on serverless compute, add the `DATABRICKS_SERVERLESS_C } } ``` -When `DATABRICKS_SERVERLESS_COMPUTE_ID` is set the `DATABRICKS_CLUSTER_ID` is ignored, and tests run on serverless compute. +When `DATABRICKS_SERVERLESS_COMPUTE_ID` is set, the `DATABRICKS_CLUSTER_ID` is ignored, and tests run on serverless compute. ## Manual testing of the framework -We require that all changes be covered by unit tests and integration tests. A pull request (PR) will be blocked if the code coverage is negatively impacted by the proposed change. +We require that all changes must be covered by unit tests and integration tests. A pull request (PR) will be blocked if the proposed change negatively impacts the code coverage. However, manual testing may still be useful before creating or merging a PR. To test DQX from your feature branch, you can install it directly as follows: @@ -177,7 +179,7 @@ Replace `feature_branch_name` with the name of your branch. ## Manual testing of the CLI commands from the current codebase Once you clone the repo locally and install Databricks CLI you can run labs CLI commands from the root of the repository. -Similar to other databricks cli commands we can specify Databricks profile to use with `--profile`. +Similar to other Databricks CLI commands, we can specify the Databricks profile to use with `--profile`. Build the project: ```commandline @@ -216,7 +218,9 @@ In most cases, installing DQX directly from the current codebase is sufficient t When DQX is installed from a released version, it creates a fresh and isolated Python virtual environment locally and installs all the required packages, ensuring a clean setup. If you need to perform end-to-end testing of the CLI before an official release, follow the process outlined below. -Note: This is only available for GitHub accounts that have write access to the repository. If you contribute from a fork this method is not available. + +This method is only available for GitHub accounts with write access to the repository. It is not available if you contribute from a fork. 
+ ```commandline # create new tag @@ -229,8 +233,10 @@ git push origin v0.1.12-alpha databricks labs install dqx@v0.1.12-alpha ``` + The release pipeline only triggers when a valid semantic version is provided (e.g. v0.1.12). Pre-release versions (e.g. v0.1.12-alpha) do not trigger the release pipeline, allowing you to test changes safely before making an official release. + ## Troubleshooting @@ -240,7 +246,7 @@ If you encounter any package dependency errors after `git pull`, run `make clean See https://mypy.readthedocs.io/en/stable/cheat_sheet_py3.html for more details -**..., expression has type "None", variable has type "str"** +**..., expression has type "None", variable has a type "str"** * Add `assert ... is not None` if it's a body of a method. Example: diff --git a/docs/dqx/docs/dev/docs_authoring.mdx b/docs/dqx/docs/dev/docs_authoring.mdx index c4089728..fa7fc52f 100644 --- a/docs/dqx/docs/dev/docs_authoring.mdx +++ b/docs/dqx/docs/dev/docs_authoring.mdx @@ -4,12 +4,11 @@ import Admonition from '@theme/Admonition'; This document provides guidelines for writing documentation for the DQX project. - ## Tech Stack The DQX documentation is built using [Docusaurus](https://docusaurus.io/), a modern static site generator. -Docusaurus is a project of Facebook Open Source and is used by many open-source projects to build their documentation websites. +Docusaurus is a Facebook open source project used by many open source projects to build their documentation websites. We also use [MDX](https://mdxjs.com/) to write markdown files that include JSX components. This allows us to write markdown files with embedded React components. @@ -56,7 +55,7 @@ make docs-serve-dev ## Checking search functionality -We're using local search, and it won't be available in the development server. +We are using local search, which won't be available in the development server. To check the search functionality, run the following command: @@ -129,7 +128,7 @@ The rule of thumb is: Do not put any technical details in the main documentation.
- All technical details should be kept in /docs/dev/ section. + All technical details should be kept in the /docs/dev/ section.
diff --git a/docs/dqx/docs/dev/index.mdx b/docs/dqx/docs/dev/index.mdx index 505c34d9..b7681b35 100644 --- a/docs/dqx/docs/dev/index.mdx +++ b/docs/dqx/docs/dev/index.mdx @@ -4,4 +4,4 @@ sidebar_position: 7 # Contributing to DQX -This section is for contributors to the DQX project. It contains information on how to contribute, including how to submit issues, pull requests, and how to contribute to the documentation. \ No newline at end of file +This section is for contributors to the DQX project. It contains information on how to contribute, including submitting issues, pulling requests, and contributing to the documentation. \ No newline at end of file diff --git a/docs/dqx/docs/guide.mdx b/docs/dqx/docs/guide.mdx index 46e0ac1d..3cef0280 100644 --- a/docs/dqx/docs/guide.mdx +++ b/docs/dqx/docs/guide.mdx @@ -2,144 +2,126 @@ sidebar_position: 3 --- +import Admonition from '@theme/Admonition'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + # User guide ## Data Profiling and Quality Rules Generation Data profiling can be run to profile the input data and generate quality rule candidates with summary statistics. -The generated rules/checks are input for the quality checking (see [Adding quality checks to the application](#adding-quality-checks-to-the-application)). -In addition, the DLT generator can be used to generated native Delta Live Tables (DLT) expectations. - -Data profiling is typically performed as a one-time action for a table to discover the initial set of quality rule candidates. -This is not intended to be a continuously repeated or scheduled process, thereby minimizing concerns regarding compute intensity and associated costs. - -### In Python - -Profiling and generating DQX rules/checks: - -```python -from databricks.labs.dqx.profiler.profiler import DQProfiler -from databricks.labs.dqx.profiler.generator import DQGenerator -from databricks.labs.dqx.profiler.dlt_generator import DQDltGenerator -from databricks.labs.dqx.engine import DQEngine -from databricks.sdk import WorkspaceClient - -input_df = spark.read.table("catalog1.schema1.table1") - -# profile input data -ws = WorkspaceClient() -profiler = DQProfiler(ws) -summary_stats, profiles = profiler.profile(input_df) - -# generate DQX quality rules/checks -generator = DQGenerator(ws) -checks = generator.generate_dq_rules(profiles) # with default level "error" - -dq_engine = DQEngine(ws) - -# save checks in arbitrary workspace location -dq_engine.save_checks_in_workspace_file(checks, workspace_path="/Shared/App1/checks.yml") -# save checks in the installation folder specified in the default run config (only works if DQX is installed in the workspace) -dq_engine.save_checks_in_installation(checks, run_config_name="default") - -# generate DLT expectations -dlt_generator = DQDltGenerator(ws) - -dlt_expectations = dlt_generator.generate_dlt_rules(profiles, language="SQL") -print(dlt_expectations) - -dlt_expectations = dlt_generator.generate_dlt_rules(profiles, language="Python") -print(dlt_expectations) - -dlt_expectations = dlt_generator.generate_dlt_rules(profiles, language="Python_Dict") -print(dlt_expectations) -``` - -### Using CLI - -You can optionally install DQX in the workspace, see the [Installation Guide](/docs/installation#dqx-installation-in-a-databricks-workspace). -As part of the installation, a config, dashboards and profiler workflow is installed. The workflow can be run manually in the workspace UI or using the CLI as below. 
- -DQX operates at the moment exclusively at the pySpark dataframe level and does not interact directly with databases or storage systems. -DQX does not persist data after performing quality checks, meaning users must handle data storage themselves. -Since DQX does not manage the input location, output table, or quarantine table, it is the user's responsibility to store or persist the processed data as needed. - -Open the config to check available run configs and adjust the settings if needed: -```commandline -databricks labs dqx open-remote-config -``` - -See example config below: -```yaml -log_level: INFO -version: 1 -run_configs: -- name: default # <- unique name of the run config (default used during installation) - input_location: s3://iot-ingest/raw # <- Input location for profiling (UC table or cloud path) - input_format: delta # <- format, required if cloud path provided - output_table: main.iot.silver # <- output UC table used in quality dashboard - quarantine_table: main.iot.quarantine # <- quarantine UC table used in quality dashboard - checks_file: iot_checks.yml # <- location of the quality rules (checks) - profile_summary_stats_file: iot_profile_summary_stats.yml # <- location of profiling summary stats - warehouse_id: your-warehouse-id # <- warehouse id for refreshing dashboards -- name: another_run_config # <- unique name of the run config - ... -``` - -Run profiler workflow: -```commandline -databricks labs dqx profile --run-config "default" -``` - -You will find the generated quality rule candidates and summary statistics in the installation folder as defined in the run config. -If run config is not provided, the "default" run config will be used. The run config is used to select specific run configuration from the 'config.yml'. - -The following DQX configuration from 'config.yml' are used by the profiler workflow: -- 'input_location': input data as a path or a table. -- 'input_format': input data format. Required if input data is a path. -- 'checks_file': relative location of the generated quality rule candidates (default: `checks.yml`) inside installation folder. -- 'profile_summary_stats_file': relative location of the summary statistics (default: `profile_summary.yml`) inside installation folder. - -Logs are be printed in the console and saved in the installation folder. -You can display the logs from the latest profiler workflow run by executing: -```commandline -databricks labs dqx logs --workflow profiler -``` - -## Validating quality rules (checks) - -If you manually adjust the generated rules or create your own checks, you can validate them before applying. - -### In Python - -```python -from databricks.labs.dqx.engine import DQEngine - -status = DQEngine.validate_checks(checks) -print(status) -``` - -Validating quality rules can be added to the CI/CD process to ensure checks are ready to use in the application. - -Note that checks are validated automatically when applied as part of the -`apply_checks_by_metadata_and_split` and `apply_checks_by_metadata` methods -(see [Quality rules defined as config](#quality-rules-defined-in-files)). - -### Using CLI - -Validate checks stored in the installation folder: -```commandline -databricks labs dqx validate-checks --run-config "default" -``` - -The following DQX configuration from 'config.yml' will be used by default: -- 'checks_file': relative location of the quality rules (default: `checks.yml`) inside installation folder. 
+The generated data quality rules (checks) candidates can be used as input for the quality checking (see [Adding quality checks to the application](#adding-quality-checks-to-the-application)). +In addition, the DLT generator can generate native Delta Live Tables (DLT) expectations. + + +Data profiling is typically performed as a one-time action for the input dataset to discover the initial set of quality rule candidates. +The check candidates should be manually reviewed before being applied to the data. +This is not intended to be a continuously repeated or scheduled process, thereby also minimizing concerns regarding compute intensity and associated costs. + + + + + Profiling and generating DQX rules/checks: + + ```python + from databricks.labs.dqx.profiler.profiler import DQProfiler + from databricks.labs.dqx.profiler.generator import DQGenerator + from databricks.labs.dqx.profiler.dlt_generator import DQDltGenerator + from databricks.labs.dqx.engine import DQEngine + from databricks.sdk import WorkspaceClient + + input_df = spark.read.table("catalog1.schema1.table1") + + # profile input data + ws = WorkspaceClient() + profiler = DQProfiler(ws) + summary_stats, profiles = profiler.profile(input_df) + + # generate DQX quality rules/checks + generator = DQGenerator(ws) + checks = generator.generate_dq_rules(profiles) # with default level "error" + + dq_engine = DQEngine(ws) + + # save checks in arbitrary workspace location + dq_engine.save_checks_in_workspace_file(checks, workspace_path="/Shared/App1/checks.yml") + # save checks in the installation folder specified in the default run config (only works if DQX is installed in the workspace) + dq_engine.save_checks_in_installation(checks, run_config_name="default") + + # generate DLT expectations + dlt_generator = DQDltGenerator(ws) + + dlt_expectations = dlt_generator.generate_dlt_rules(profiles, language="SQL") + print(dlt_expectations) + + dlt_expectations = dlt_generator.generate_dlt_rules(profiles, language="Python") + print(dlt_expectations) + + dlt_expectations = dlt_generator.generate_dlt_rules(profiles, language="Python_Dict") + print(dlt_expectations) + ``` + + + You can optionally install DQX in the workspace (see the [Installation Guide](/docs/installation#dqx-installation-as-a-tool-in-a-databricks-workspace)). + A config, dashboard, and profiler workflow are installed as part of the installation. The workflow can be run manually in the workspace UI or using the CLI as below. + + DQX operates exclusively at the PySpark dataframe level and does not interact directly with databases or storage systems. + DQX does not persist data after performing quality checks, meaning users must handle data storage themselves. + Since DQX does not manage the input location, output table, or quarantine table, it is the user's responsibility to store or persist the processed data as needed. 
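For example, here is a minimal sketch of persisting the checked data yourself; the engine, input, and checks follow the examples in this guide, the target table names are placeholders, and the write calls are plain PySpark rather than DQX APIs:

```python
# Illustrative only: DQX returns dataframes and leaves persistence to you.
from databricks.labs.dqx.engine import DQEngine
from databricks.sdk import WorkspaceClient

dq_engine = DQEngine(WorkspaceClient())
checks = dq_engine.load_checks_from_installation(assume_user=True, run_config_name="default")
input_df = spark.read.table("catalog1.schema1.table1")

valid_df, quarantined_df = dq_engine.apply_checks_by_metadata_and_split(input_df, checks)

# Placeholder table names matching the example run config above.
valid_df.write.mode("append").saveAsTable("main.iot.silver")            # output_table
quarantined_df.write.mode("append").saveAsTable("main.iot.quarantine")  # quarantine_table
```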
+ + Open the config to check available run configs and adjust the settings if needed: + ```commandline + databricks labs dqx open-remote-config + ``` + + See the example config below: + ```yaml + log_level: INFO + version: 1 + run_configs: + - name: default # <- unique name of the run config (default used during installation) + input_location: s3://iot-ingest/raw # <- Input location for profiling (UC table or cloud path) + input_format: delta # <- format, required if cloud path provided + output_table: main.iot.silver # <- output UC table used in quality dashboard + quarantine_table: main.iot.quarantine # <- quarantine UC table used in quality dashboard + checks_file: iot_checks.yml # <- relative location of the quality rules (checks) defined as json or yaml + profile_summary_stats_file: iot_profile_summary_stats.yml # <- relative location of profiling summary stats + warehouse_id: your-warehouse-id # <- warehouse id for refreshing dashboard + - name: another_run_config # <- unique name of the run config + ... + ``` + + Run profiler workflow: + ```commandline + databricks labs dqx profile --run-config "default" + ``` + + The generated quality rule candidates and summary statistics will be in the installation folder, as defined in the run config. + The "default" run config will be used if the run config is not provided. The run config is used to select specific run configuration from the 'config.yml'. + + The following DQX configuration from 'config.yml' is used by the profiler workflow: + - 'input_location': input data as a path or a table. + - 'input_format': input data format. Required if input data is a path. + - 'checks_file': relative location of the generated quality rule candidates as `yaml` or `json` file inside the installation folder (default: `checks.yml`). + - 'profile_summary_stats_file': relative location of the summary statistics (default: `profile_summary.yml`) inside the installation folder. + + Logs are printed in the console and saved in the installation folder. + You can display the logs from the latest profiler workflow run by executing: + ```commandline + databricks labs dqx logs --workflow profiler + ``` + + ## Adding quality checks to the application -### Quality rules defined in files +DQX offers a set of predefined quality rules (checks) to leverage. See details and list of all check functions [here](/docs/reference/quality_rules). +Additionally, you can define custom checks to meet specific requirements. Learn more [here](/docs/reference/quality_rules). -Quality rules can be stored in `yaml` or `json` file. Below an example `yaml` file defining checks ('checks.yml'): +### Quality rules configured in a file + +Quality rules can be stored in a `yaml` or `json` file. Below is an example `yaml` file ('checks.yml') defining several checks: ```yaml - criticality: error check: @@ -156,7 +138,7 @@ Quality rules can be stored in `yaml` or `json` file. Below an example `yaml` fi col_name: col3 - criticality: warn check: - function: value_is_in_list + function: is_in_list arguments: col_name: col4 allowed: @@ -170,76 +152,83 @@ Fields: ### Loading and execution methods -Checks can be loaded from a file in the installation folder, workspace, or local file system. If the checks file contains invalid json or yaml syntax, the engine will raise an error. +Checks can be loaded from a file in the installation folder, workspace, or local file system. The engine will raise an error if the checks file contains invalid JSON or YAML definition. 
Checks loaded from a file can be applied using one of the following methods: * `apply_checks_by_metadata_and_split`: splits the input data into valid and invalid (quarantined) dataframes. * `apply_checks_by_metadata`: report issues as additional columns. -Syntax of the loaded checks are validated automatically as part of these methods. +Syntax of the loaded checks is validated automatically as part of these methods. +In addition, you can also perform a standalone syntax validation of the checks as described [here](#validating-syntax-of-quality-checks-defined-in-yamljson). #### Method 1: Loading checks from a workspace file in the installation folder -If DQX is installed in the workspace, you can load checks based on the run configuration: + + + ```python + from databricks.labs.dqx.engine import DQEngine + from databricks.sdk import WorkspaceClient -```python -from databricks.labs.dqx.engine import DQEngine -from databricks.sdk import WorkspaceClient + dq_engine = DQEngine(WorkspaceClient()) + # load check file specified in the run configuration (if DQX installed in the workspace) + checks = dq_engine.load_checks_from_installation(assume_user=True, run_config_name="default") -dq_engine = DQEngine(WorkspaceClient()) -# load check file specified in the run configuration -checks = dq_engine.load_checks_from_installation(assume_user=True, run_config_name="default") + input_df = spark.read.table("catalog1.schema1.table1") -input_df = spark.read.table("catalog1.schema1.table1") + # Option 1: apply quality rules on the dataframe and provide valid and invalid (quarantined) dataframes + valid_df, quarantined_df = dq_engine.apply_checks_by_metadata_and_split(input_df, checks) -# Option 1: apply quality rules on the dataframe and provide valid and invalid (quarantined) dataframes -valid_df, quarantined_df = dq_engine.apply_checks_by_metadata_and_split(input_df, checks) + # Option 2: apply quality rules on the dataframe and report issues as additional columns (`_warning` and `_error`) + valid_and_quarantined_df = dq_engine.apply_checks_by_metadata(input_df, checks) + ``` + + -# Option 2: apply quality rules on the dataframe and report issues as additional columns (`_warning` and `_error`) -valid_and_quarantined_df = dq_engine.apply_checks_by_metadata(input_df, checks) -``` +#### Method 2: Loading checks from an arbitrary workspace file -#### Method 2: Loading checks from a workspace file + + + ```python + from databricks.labs.dqx.engine import DQEngine + from databricks.sdk import WorkspaceClient -The checks can also be loaded from any file in the Databricks workspace: + dq_engine = DQEngine(WorkspaceClient()) + checks = dq_engine.load_checks_from_workspace_file(workspace_path="/Shared/App1/checks.yml") -```python -from databricks.labs.dqx.engine import DQEngine -from databricks.sdk import WorkspaceClient + input_df = spark.read.table("catalog1.schema1.table1") -dq_engine = DQEngine(WorkspaceClient()) -checks = dq_engine.load_checks_from_workspace_file(workspace_path="/Shared/App1/checks.yml") + # Option 1: apply quality rules on the dataframe and provide valid and invalid (quarantined) dataframes + valid_df, quarantined_df = dq_engine.apply_checks_by_metadata_and_split(input_df, checks) -input_df = spark.read.table("catalog1.schema1.table1") - -# Option 1: apply quality rules on the dataframe and provide valid and invalid (quarantined) dataframes -valid_df, quarantined_df = dq_engine.apply_checks_by_metadata_and_split(input_df, checks) - -# Option 2: apply quality rules on the dataframe and report 
issues as additional columns (`_warning` and `_error`) -valid_and_quarantined_df = dq_engine.apply_checks_by_metadata(input_df, checks) -``` + # Option 2: apply quality rules on the dataframe and report issues as additional columns (`_warning` and `_error`) + valid_and_quarantined_df = dq_engine.apply_checks_by_metadata(input_df, checks) + ``` + + #### Method 3: Loading checks from a local file -Checks can also be loaded from a file in the local file system: + + + ```python + from databricks.labs.dqx.engine import DQEngine + from databricks.sdk import WorkspaceClient -```python -from databricks.labs.dqx.engine import DQEngine -from databricks.sdk import WorkspaceClient + checks = DQEngine.load_checks_from_local_file("checks.yml") + dq_engine = DQEngine(WorkspaceClient()) -checks = DQEngine.load_checks_from_local_file("checks.yml") -dq_engine = DQEngine(WorkspaceClient()) + input_df = spark.read.table("catalog1.schema1.table1") -input_df = spark.read.table("catalog1.schema1.table1") + # Option 1: apply quality rules on the dataframe and provide valid and invalid (quarantined) dataframes + valid_df, quarantined_df = dq_engine.apply_checks_by_metadata_and_split(input_df, checks) -# Option 1: apply quality rules on the dataframe and provide valid and invalid (quarantined) dataframes -valid_df, quarantined_df = dq_engine.apply_checks_by_metadata_and_split(input_df, checks) + # Option 2: apply quality rules on the dataframe and report issues as additional columns (`_warning` and `_error`) + valid_and_quarantined_df = dq_engine.apply_checks_by_metadata(input_df, checks) + ``` + + -# Option 2: apply quality rules on the dataframe and report issues as additional columns (`_warning` and `_error`) -valid_and_quarantined_df = dq_engine.apply_checks_by_metadata(input_df, checks) -``` - -### Quality rules defined as code +### Quality rules defined in code #### Method 1: Using DQX classes @@ -248,180 +237,245 @@ Checks defined using DQX classes can applied using one of the following methods: * `apply_checks`: if you want to report issues as additional columns. Example: -```python -from databricks.labs.dqx.col_functions import is_not_null, is_not_null_and_not_empty, value_is_in_list -from databricks.labs.dqx.engine import DQEngine -from databricks.labs.dqx.rule import DQRuleColSet, DQRule -from databricks.sdk import WorkspaceClient - - -dq_engine = DQEngine(WorkspaceClient()) - -checks = [ - DQRule( # define rule for a single column - name="col3_is_null_or_empty", - criticality="error", - check=is_not_null_and_not_empty("col3")), - DQRule( # define rule with a filter - name="col_4_is_null_or_empty", - criticality="error", - filter="col1 < 3", - check=is_not_null_and_not_empty("col4")), - DQRule( # name for the check auto-generated if not provided - criticality="warn", - check=value_is_in_list("col4", ["1", "2"])) - ] + DQRuleColSet( # define rule for multiple columns at once - columns=["col1", "col2"], - criticality="error", - check_func=is_not_null).get_rules() - -input_df = spark.read.table("catalog1.schema1.table1") - -# Option 1: apply quality rules on the dataframe and provide valid and invalid (quarantined) dataframes -valid_df, quarantined_df = dq_engine.apply_checks_and_split(input_df, checks) - -# Option 2: apply quality rules on the dataframe and report issues as additional columns (`_warning` and `_error`) -valid_and_quarantined_df = dq_engine.apply_checks(input_df, checks) -``` - -See details of the check functions [here](/docs/reference/quality_rules). 
- -#### Method 2: Using metadata (yaml/json) + + + ```python + from databricks.labs.dqx.col_functions import is_not_null, is_not_null_and_not_empty, is_in_list + from databricks.labs.dqx.engine import DQEngine + from databricks.labs.dqx.rule import DQRuleColSet, DQRule + from databricks.sdk import WorkspaceClient + + + dq_engine = DQEngine(WorkspaceClient()) + + checks = [ + DQRule( # define rule for a single column + name="col3_is_null_or_empty", + criticality="error", + check=is_not_null_and_not_empty("col3")), + DQRule( # define rule with a filter + name="col_4_is_null_or_empty", + criticality="error", + filter="col1 < 3", + check=is_not_null_and_not_empty("col4")), + DQRule( # name for the check auto-generated if not provided + criticality="warn", + check=is_in_list("col4", ["1", "2"])) + ] + DQRuleColSet( # define rule for multiple columns at once + columns=["col1", "col2"], + criticality="error", + check_func=is_not_null).get_rules() + + input_df = spark.read.table("catalog1.schema1.table1") + + # Option 1: apply quality rules on the dataframe and provide valid and invalid (quarantined) dataframes + valid_df, quarantined_df = dq_engine.apply_checks_and_split(input_df, checks) + + # Option 2: apply quality rules on the dataframe and report issues as additional columns (`_warning` and `_error`) + valid_and_quarantined_df = dq_engine.apply_checks(input_df, checks) + ``` + + + +#### Method 2: Using metadata config (yaml/json) Checks defined as metadata in `yaml` or `json` can applied using one of the following methods: * `apply_checks_by_metadata_and_split`: if you want to split the checked data into valid and invalid (quarantined) dataframes. * `apply_checks_by_metadata`: if you want to report issues as additional columns. Example: -```python -import yaml -from databricks.labs.dqx.engine import DQEngine -from databricks.sdk import WorkspaceClient - -dq_engine = DQEngine(WorkspaceClient()) - -checks = yaml.safe_load(""" -- criticality: error - check: - function: is_not_null - arguments: - col_names: - - col1 - - col2 - -- criticality: error - check: - function: is_not_null_and_not_empty - arguments: - col_name: col3 - -- criticality: error - filter: col1 < 3 - check: - function: is_not_null_and_not_empty - arguments: - col_name: col4 - -- criticality: warn - check: - function: value_is_in_list - arguments: - col_name: col4 - allowed: - - 1 - - 2 -""") - -input_df = spark.read.table("catalog1.schema1.table1") - -# Option 1: apply quality rules on the dataframe and provide valid and invalid (quarantined) dataframes -valid_df, quarantined_df = dq_engine.apply_checks_by_metadata_and_split(input_df, checks) - -# Option 2: apply quality rules on the dataframe and report issues as additional columns (`_warning` and `_error`) -valid_and_quarantined_df = dq_engine.apply_checks_by_metadata(input_df, checks) -``` - -See details of the check functions [here](/docs/reference/quality_rules). 
+ + + ```python + import yaml + from databricks.labs.dqx.engine import DQEngine + from databricks.sdk import WorkspaceClient + + dq_engine = DQEngine(WorkspaceClient()) + + checks = yaml.safe_load(""" + - criticality: error + check: + function: is_not_null + arguments: + col_names: + - col1 + - col2 + + - criticality: error + check: + function: is_not_null_and_not_empty + arguments: + col_name: col3 + + - criticality: error + filter: col1 < 3 + check: + function: is_not_null_and_not_empty + arguments: + col_name: col4 + + - criticality: warn + check: + function: is_in_list + arguments: + col_name: col4 + allowed: + - 1 + - 2 + """) + + input_df = spark.read.table("catalog1.schema1.table1") + + # Option 1: apply quality rules on the dataframe and provide valid and invalid (quarantined) dataframes + valid_df, quarantined_df = dq_engine.apply_checks_by_metadata_and_split(input_df, checks) + + # Option 2: apply quality rules on the dataframe and report issues as additional columns (`_warning` and `_error`) + valid_and_quarantined_df = dq_engine.apply_checks_by_metadata(input_df, checks) + ``` + + ### Integration with DLT (Delta Live Tables) DLT provides [expectations](https://docs.databricks.com/en/delta-live-tables/expectations.html) to enforce data quality constraints. However, expectations don't offer detailed insights into why certain checks fail. -The example below demonstrates how to integrate DQX with DLT to provide comprehensive quality information. -The DQX integration with DLT does not use DLT Expectations but DQX own methods. +The example below demonstrates integrating DQX with DLT to provide comprehensive quality information. +The DQX integration with DLT does not use DLT Expectations but DQX's own methods. #### Option 1: Apply quality rules and quarantine bad records -```python -import dlt -from databricks.labs.dqx.engine import DQEngine -from databricks.sdk import WorkspaceClient - -dq_engine = DQEngine(WorkspaceClient()) - -checks = ... # quality rules / checks - -@dlt.view -def bronze_dq_check(): - df = dlt.read_stream("bronze") - return dq_engine.apply_checks_by_metadata(df, checks) - -@dlt.table -def silver(): - df = dlt.read_stream("bronze_dq_check") - # get rows without errors or warnings, and drop auxiliary columns - return dq_engine.get_valid(df) - -@dlt.table -def quarantine(): - df = dlt.read_stream("bronze_dq_check") - # get only rows with errors or warnings - return dq_engine.get_invalid(df) -``` + + + ```python + import dlt + from databricks.labs.dqx.engine import DQEngine + from databricks.sdk import WorkspaceClient + + dq_engine = DQEngine(WorkspaceClient()) + + checks = ... # quality rules / checks + + @dlt.view + def bronze_dq_check(): + df = dlt.read_stream("bronze") + return dq_engine.apply_checks_by_metadata(df, checks) + + @dlt.table + def silver(): + df = dlt.read_stream("bronze_dq_check") + # get rows without errors or warnings, and drop auxiliary columns + return dq_engine.get_valid(df) + + @dlt.table + def quarantine(): + df = dlt.read_stream("bronze_dq_check") + # get only rows with errors or warnings + return dq_engine.get_invalid(df) + ``` + + #### Option 2: Apply quality rules and report issues as additional columns -```python -import dlt -from databricks.labs.dqx.engine import DQEngine -from databricks.sdk import WorkspaceClient - -checks = ... 
# quality rules / checks -dq_engine = DQEngine(WorkspaceClient()) - -@dlt.view -def bronze_dq_check(): - df = dlt.read_stream("bronze") - return dq_engine.apply_checks_by_metadata(df, checks) - -@dlt.table -def silver(): - df = dlt.read_stream("bronze_dq_check") - return df -``` - -## Data Quality Dashboards - -Data quality dashboards are automatically installed in `dashboards` folder in the workspace installation directory when you install DQX in a Databricks workspace. For more details on the installation process, see the [Installation Guide](/docs/installation). - -The dashboards let you monitor and track data quality issues easily. You can customize them to align with your specific requirements. - -By default, dashboards are not scheduled to refresh automatically, thereby minimizing concerns regarding associated cluster costs. When you open a dashboard, you need to refresh it manually to view the latest data. However, you can configure the dashboard to [refresh periodically](https://docs.databricks.com/en/dashboards/index.html#schedules-and-subscriptions) as needed. - -To navigate to the dashboards directory in the workspace UI, use the following command: -```commandline -databricks labs dqx open-dashboards -``` - -After executing the command: -* Locate and click on a dashboard file in the workspace UI. -* Open the dashboard, and click `Refresh` to load the latest data. - -Note: the dashboards are only using the quarantined data as input as defined during the installation process. -If you change the quarantine table in the run config after the deployment (`quarantine_table` field), you need to update the dashboard queries accordingly. - -## Quality Rules and Creation of Custom Checks - -DQX offers a set of predefined quality rules (checks) that you can leverage. Additionally, you have the flexibility to define custom checks to meet specific requirements. Learn more [here](/docs/reference/quality_rules). + + + ```python + import dlt + from databricks.labs.dqx.engine import DQEngine + from databricks.sdk import WorkspaceClient + + checks = ... # quality rules / checks + dq_engine = DQEngine(WorkspaceClient()) + + @dlt.view + def bronze_dq_check(): + df = dlt.read_stream("bronze") + return dq_engine.apply_checks_by_metadata(df, checks) + + @dlt.table + def silver(): + df = dlt.read_stream("bronze_dq_check") + return df + ``` + + + +## Validating syntax of quality checks defined in yaml/json + +You can validate the syntax of checks defined as metadata in `yaml` or `json` format before applying them. This validation ensures that the checks are correctly defined and can be interpreted by the DQX engine. +The validation cannot be used for checks defined using [DQX classes](#method-1-using-dqx-classes). When checks are defined with DQX classes, syntax validation is unnecessary because the application will fail to interpret them if the DQX objects are constructed incorrectly. 
+ + + ```python + import yaml + from databricks.labs.dqx.engine import DQEngine + + checks = yaml.safe_load(""" + - criticality: error + check: + function: is_not_null + arguments: + col_names: + - col1 + - col2 + """) + + status = DQEngine.validate_checks(checks) + print(status) + ``` + + + Validate checks stored in the installation folder: + ```commandline + databricks labs dqx validate-checks --run-config "default" + ``` + + The following DQX configuration from 'config.yml' will be used by default: + - 'checks_file': relative location of the quality rules defined as `yaml` or `json` inside the installation folder (default: `checks.yml`). + + + + +Validating quality rules is typically done as part of the CI/CD process to ensure checks are ready to use in the application. + + +## Data Quality Dashboard + +The data quality dashboard is automatically installed in the `dashboards` folder of the workspace installation directory when you install DQX in the Databricks workspace. For more details on the installation process, see the [Installation Guide](/docs/installation). + +The dashboard lets you monitor and track data quality issues easily. You can customize it to align with your specific requirements. + +The dashboard is not scheduled to refresh automatically by default, minimizing concerns regarding associated cluster costs. When you open a dashboard, refresh it manually to view the latest data. However, as needed, you can configure the dashboard to [refresh periodically](https://docs.databricks.com/en/dashboards/index.html#schedules-and-subscriptions). + + + + You can locate the dashboard using the Databricks workspace UI directly or use the following command: + ```commandline + databricks labs dqx open-dashboards + ``` + + After executing the command: + * Locate and click on a dashboard file in the workspace UI. + * Open the dashboard and click `Refresh` to load the latest data. + + + You can locate the dashboard using the Databricks workspace UI directly or use the following code: + ```python + from databricks.labs.dqx.contexts.workspace import WorkspaceContext + + ctx = WorkspaceContext(WorkspaceClient()) + dashboards_folder_link = f"{ctx.installation.workspace_link('')}dashboards/" + print(f"Open a dashboard from the following folder and refresh it:") + print(dashboards_folder_link) + ``` + + + + +DQX dashboard(s) only use the quarantined table for queries as defined in `config.yml` during installation. +If you change the quarantine table in the run config after the deployment (`quarantine_table` field), you must update the dashboard queries accordingly. + ## Additional Configuration @@ -431,16 +485,20 @@ By default, DQX appends `_error` and `_warning` reporting columns to the output You can customize the names of these reporting columns by specifying additional configurations in the engine.
-```python -from databricks.sdk import WorkspaceClient -from databricks.labs.dqx.engine import ( - DQEngine, - ExtraParams, -) - -# customize reporting column names -extra_parameters = ExtraParams(column_names={"errors": "dq_errors", "warnings": "dq_warnings"}) - -ws = WorkspaceClient() -dq_engine = DQEngine(ws, extra_params=extra_parameters) -``` + + + ```python + from databricks.sdk import WorkspaceClient + from databricks.labs.dqx.engine import ( + DQEngine, + ExtraParams, + ) + + # customize reporting column names + extra_parameters = ExtraParams(column_names={"errors": "dq_errors", "warnings": "dq_warnings"}) + + ws = WorkspaceClient() + dq_engine = DQEngine(ws, extra_params=extra_parameters) + ``` + + diff --git a/docs/dqx/docs/installation.mdx b/docs/dqx/docs/installation.mdx index 1f632fa4..f84b53da 100644 --- a/docs/dqx/docs/installation.mdx +++ b/docs/dqx/docs/installation.mdx @@ -1,6 +1,9 @@ --- sidebar_position: 2 --- + +import Admonition from '@theme/Admonition'; + # Installation The framework can be installed on a Databricks workspace or used as a standalone library. @@ -25,12 +28,12 @@ Install a specific version of the project via `pip` (e.g. version 0.1.12): pip install databricks-labs-dqx==0.1.12 ``` -## DQX installation in a Databricks Workspace +## DQX installation as a Tool in a Databricks Workspace -If you choose to install DQX via PyPI and use it purely as a library, you don’t need to pre-install DQX in the workspace. -However, installing DQX in the workspace offers additional benefits such as profiling job/workflow, pre-configured dashboards, convenient configuration management. +If you install DQX via PyPI and use it purely as a library, you don’t need to pre-install DQX in the workspace. +However, installing DQX in the workspace offers additional benefits, such as profiling jobs/workflows, a pre-configured dashboard, and convenient configuration management. -### Authentication +### Authenticate Databricks CLI Once you install Databricks CLI, authenticate your current machine to your Databricks Workspace: @@ -41,13 +44,21 @@ databricks auth login --host To enable debug logs, simply add `--debug` flag to any command. More about authentication options [here](https://docs.databricks.com/en/dev-tools/cli/authentication.html). -### Install DQX +### Install DQX using Databricks CLI Install DQX in your Databricks workspace via Databricks CLI: ```commandline databricks labs install dqx ``` + +* Make sure to have Databricks CLI v0.241 or later installed locally to avoid encountering the error: ModuleNotFoundError: No module named 'pyspark'. +* You must have Python 3.10 or later to install DQX using the Databricks Labs CLI. +The Databricks Labs CLI relies on the user's Python installation to create a virtual environment and install the required DQX packages. The packages (e.g. pyspark) don't have to be installed locally before running the CLI. +* Running the Databricks CLI from within a Databricks workspace is not supported. The CLI is designed for use from a local machine or a separate compute environment, not directly inside Databricks. +* The CLI supports the private PyPI package index. If you encounter SSL-related errors, you may need to install OpenSSL on your system or reinstall Python. + + Install a specific version of DQX in your Databricks workspace via Databricks CLI (e.g. 
version 0.1.12): ```commandline databricks labs install dqx@v0.1.12 ``` @@ -60,18 +71,18 @@ The cli command will install the following components in the workspace installat - A Python [wheel file](https://peps.python.org/pep-0427/) with the library packaged. - DQX configuration file (`config.yml`). - Profiling workflow for generating quality rule candidates (not scheduled by default eliminating cost concerns) -- Quality dashboards for monitoring to display information about the data quality issues (not scheduled by default eliminating cost concerns) +- Quality dashboard for monitoring and displaying information about data quality issues (not scheduled by default, eliminating cost concerns) -By default, DQX is installed in the user home directory (under `/Users//.dqx`). You can also install DQX globally -by setting 'DQX_FORCE_INSTALL' environment variable. The following options are available: +DQX is installed by default in the user home directory (under `/Users//.dqx`). You can also install DQX globally +by setting the 'DQX_FORCE_INSTALL' environment variable. The following options are available: * `DQX_FORCE_INSTALL=global databricks labs install dqx`: will force the installation to be for root only (`/Applications/dqx`) * `DQX_FORCE_INSTALL=user databricks labs install dqx`: will force the installation to be for user only (`/Users//.dqx`) **Configration file** -DQX configuration file can contain multiple run configurations for different pipelines or projects defining specific set of input, output and quarantine locations etc. -During the installation the "default" run configuration is created. When DQX is upgraded, the config is preserved. +The DQX configuration file can contain multiple run configurations for different pipelines or projects defining specific input, output and quarantine locations, etc. +The "default" run configuration is created during the installation. When DQX is upgraded, the configuration is preserved. Open the configuration file: ```commandline @@ -88,42 +99,44 @@ run_configs: input_location: s3://iot-ingest/raw # <- Input location for profiling (UC table or cloud path) input_format: delta # <- format, required if cloud path provided output_table: main.iot.silver # <- output UC table - quarantine_table: main.iot.quarantine # <- quarantine UC table used as input for quality dashboards - checks_file: iot_checks.yml # <- location of the quality rules (checks) - profile_summary_stats_file: iot_profile_summary_stats.yml # <- location of profiling summary stats - warehouse_id: your-warehouse-id # <- warehouse id for refreshing dashboards + quarantine_table: main.iot.quarantine # <- quarantine UC table used as input for quality dashboard + checks_file: iot_checks.yml # <- relative location of the quality rules (checks) defined as json or yaml + profile_summary_stats_file: iot_profile_summary_stats.yml # <- relative location of profiling summary stats + warehouse_id: your-warehouse-id # <- warehouse id for refreshing dashboard - name: another_run_config # <- unique name of the run config ... ``` -To specify a particular run configuration when executing DQX Labs CLI commands, use the --run-config parameter. If no configuration is provided, the "default" run config is used. +Use the `--run-config` parameter to specify a particular run configuration when executing DQX Labs CLI commands. If no configuration is provided, the "default" run configuration is used. **Workflows** -Profiling workflow is intended as a one-time operation.
It is not scheduled by default ensuring that no costs are incurred. +Profiling workflow is intended as a one-time operation. It is not scheduled by default, ensuring no costs are incurred. List all installed workflows in the workspace and their latest run state: ```commandline databricks labs dqx workflows ``` -**Dashboards** +**Dashboard** -DQX data quality dashboards are deployed to the installation directory. -Dashboards are not scheduled to refresh by default ensuring that no costs are incurred. +DQX data quality dashboard is deployed to the installation directory. +The dashboard is not scheduled to refresh by default, ensuring no costs are incurred. -Open dashboards: +Open dashboard: ```commandline databricks labs dqx open-dashboards ``` -Note: the dashboards are only using the quarantined data as input as defined during the installation process. -If you change the quarantine table in the run config after the deployment (`quarantine_table` field), you need to update the dashboard queries accordingly. + +DQX dashboard(s) only use the quarantined table for queries as defined in `config.yml` during installation. +If you change the quarantine table in the run config after the deployment (`quarantine_table` field), you must update the dashboard queries accordingly. + ### Install DQX on Databricks cluster -You need to install the DQX package on a Databricks cluster to be able to use it. -You can install it either from PYPI or use a wheel file generated as part of the installation in the workspace. +You need to install the DQX package on a Databricks cluster to use it. +You can install it either from PYPI or use a wheel file generated during the installation in the workspace. There are multiple ways to install libraries in a Databricks cluster (see [here](https://docs.databricks.com/en/libraries/index.html)). For example, you can install DQX directly from a notebook cell as follows: @@ -167,4 +180,4 @@ databricks labs uninstall dqx ``` Databricks CLI will confirm a few options: -- Whether you want to remove all dqx artefacts from the workspace as well. Defaults to 'no'. +- Whether you want to remove all DQX artefacts from the workspace or not. Defaults to 'no'. diff --git a/docs/dqx/docs/motivation.mdx b/docs/dqx/docs/motivation.mdx index 74645557..a246e675 100644 --- a/docs/dqx/docs/motivation.mdx +++ b/docs/dqx/docs/motivation.mdx @@ -5,32 +5,40 @@ import useBaseUrl from '@docusaurus/useBaseUrl'; # Motivation -Current data quality frameworks often fall short in providing detailed explanations for specific row or column +Current data quality frameworks often fall short of providing detailed explanations for specific row or column data quality issues and are primarily designed for complete datasets, making integration into streaming workloads difficult. -They also lack the ability to quarantine invalid data and have compatibility issues with Databricks Runtime. +They cannot quarantine invalid data and have compatibility issues with Databricks Runtime. -This project introduces a simple Python validation framework for assessing data quality of PySpark DataFrames. +This project introduces a simple but powerful Python validation framework for assessing the data quality of PySpark DataFrames. It enables real-time quality validation during data processing rather than relying solely on post-factum monitoring. The validation output includes detailed information on why specific rows and columns have issues, allowing for quicker identification and resolution of data quality problems. 
The framework offers the ability to quarantine invalid data and investigate quality issues before they escalate. -
- DQX -
+## How DQX works -Invalid data can be quarantined to make sure bad data is never written to the output. +### Option 1: Apply checks and quarantine "bad" data. + +Apply checks on the DataFrame and quarantine invalid records to ensure "bad" data is never written to the output.
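As a minimal sketch of Option 1 (assuming an existing `input_df` DataFrame and a `checks` list built with DQX rules, neither of which is shown here):

```python
from databricks.sdk import WorkspaceClient
from databricks.labs.dqx.engine import DQEngine

# `input_df` and `checks` are assumed to exist (see the quality rules reference for defining checks).
dq_engine = DQEngine(WorkspaceClient())

# Valid rows are returned separately; invalid rows land in the quarantine DataFrame
# together with details about the failed checks.
valid_df, quarantined_df = dq_engine.apply_checks_and_split(input_df, checks)
```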
[image: Quarantine]
+### Option 2: Apply checks and flag "bad" data. + +Apply checks on the DataFrame and flag invalid records as additional columns. + +
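A corresponding sketch of Option 2, under the same assumptions as the Option 1 sketch above (existing `input_df` and `checks`):

```python
from databricks.sdk import WorkspaceClient
from databricks.labs.dqx.engine import DQEngine

# Same assumptions as in the Option 1 sketch: `input_df` and `checks` already exist.
dq_engine = DQEngine(WorkspaceClient())

# All rows are kept; data quality issues are reported in additional columns appended to the DataFrame.
flagged_df = dq_engine.apply_checks(input_df, checks)
```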
+[image: DQX]
-In the Lakehouse architecture, the validation of new data should happen at the time of data entry into the Curated Layer -to make sure bad data is not propagated to the subsequent layers. With DQX you can easily quarantine invalid data and re-ingest it -after curation to ensure that data quality constraints are met. +### DQX usage in the Lakehouse Architecture +In the Lakehouse architecture, new data validation should happen during data entry into the curated layer to ensure bad data is not propagated to the subsequent layers. +With DQX, you can quickly quarantine invalid data and re-ingest it after curation to ensure data quality constraints are met. +Data quality can be monitored in real time between layers, and the quarantine process can be automated.
Lakehouse @@ -38,8 +46,8 @@ after curation to ensure that data quality constraints are met. ## When to use DQX -* Use DQX if you need pro-active monitoring (before data is written to a target table). +* Use DQX if you need pro-active monitoring (before data is written to a target table), especially in the streaming pipelines. * For monitoring data quality of already persisted data in Delta tables (post-factum monitoring), try [Databricks Lakehouse Monitoring](https://docs.databricks.com/en/lakehouse-monitoring/index.html). -* DQX can be integrated with DLT for data quality checking but your first choice for DLT pipelines should be [DLT Expectations](https://docs.databricks.com/en/delta-live-tables/expectations.html#what-are-delta-live-tables-expectations). DQX can be used to profile data and generate DLT expectation candidates. +* DQX can be integrated with DLT to check data quality. Still, your first choice for DLT pipelines should be [DLT Expectations](https://docs.databricks.com/en/delta-live-tables/expectations.html#what-are-delta-live-tables-expectations). DQX can be used to profile data and generate DLT expectation candidates. * DQX can be integrated with other data transformation frameworks that support PySpark, such as [dbt](https://docs.getdbt.com/docs/build/python-models). However, this integration is limited to dbt Python models and does not extend to dbt SQL models. diff --git a/docs/dqx/docs/reference/engine.mdx b/docs/dqx/docs/reference/engine.mdx index 95124689..829de3f1 100644 --- a/docs/dqx/docs/reference/engine.mdx +++ b/docs/dqx/docs/reference/engine.mdx @@ -1,34 +1,45 @@ +import Admonition from '@theme/Admonition'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + # DQX Engine -To perform data quality checking with DQX, you need to create `DQEngine` object. +To perform data quality checking with DQX, you must create a `DQEngine` object. The engine requires a Databricks [workspace client](https://docs.databricks.com/aws/en/dev-tools/sdk-python) for authentication and interaction with the Databricks workspace. -When running the code on a Databricks workspace, the workspace client is automatically authenticated, whether DQX is used in a notebook, script, or as part of a job/workflow. +When running the code on a Databricks workspace, the workspace client is automatically authenticated, whether DQX is used in a notebook, script, or job/workflow. You only need the following code to create the workspace client if you run DQX on Databricks workspace: -```python -from databricks.sdk import WorkspaceClient -from databricks.labs.dqx.engine import DQEngine -ws = WorkspaceClient() -dq_engine = DQEngine(ws) -``` + + + ```python + from databricks.sdk import WorkspaceClient + from databricks.labs.dqx.engine import DQEngine + + ws = WorkspaceClient() + dq_engine = DQEngine(ws) + ``` + + For external environments, such as CI servers or local machines, you can authenticate to Databricks using any method supported by the Databricks SDK. For detailed instructions, refer to the [default authentication flow](https://databricks-sdk-py.readthedocs.io/en/latest/authentication.html#default-authentication-flow). 
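For instance, a sketch of explicit service-principal authentication from a CI server — the environment variable names and keyword arguments below follow the standard Databricks SDK configuration fields and are assumptions for illustration, not DQX-specific settings:

```python
import os

from databricks.sdk import WorkspaceClient
from databricks.labs.dqx.engine import DQEngine

# Explicit OAuth (service principal) authentication outside a Databricks workspace.
ws = WorkspaceClient(
    host=os.environ["DATABRICKS_HOST"],
    client_id=os.environ["DATABRICKS_CLIENT_ID"],
    client_secret=os.environ["DATABRICKS_CLIENT_SECRET"],
)
dq_engine = DQEngine(ws)
```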
-If you're using Databricks [configuration profiles](https://docs.databricks.com/dev-tools/auth.html#configuration-profiles) or Databricks-specific [environment variables](https://docs.databricks.com/dev-tools/auth.html#environment-variables) for authentication, you can easily create the workspace client without needing to provide additional arguments: +If you're using Databricks [configuration profiles](https://docs.databricks.com/dev-tools/auth.html#configuration-profiles) or Databricks-specific [environment variables](https://docs.databricks.com/dev-tools/auth.html#environment-variables) for authentication, you can create the workspace client without needing to provide additional arguments: ```python ws = WorkspaceClient() ``` -Information on testing applications that use `DQEngine` can be found [here](/docs/reference/testing). +Information on testing applications that use `DQEngine` including local execution without a Databricks workspace can be found [here](/docs/reference/testing). ## DQX engine methods -The following table outlines the available methods of the DQEngine and their functionalities: +The following table outlines the available methods of the `DQEngine` and their functionalities: +
+**Available DQX engine methods** | Check | Description | Arguments | | ---------------------------------- | --------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| apply_checks | Applies quality checks to the DataFrame and returns a DataFrame with reporting columns. | df: DataFrame to check; checks: List of checks to the DataFrame. Each check is an instance of DQRule class. | -| apply_checks_and_split | Applies quality checks to the DataFrame and returns valid and invalid (quarantine) DataFrames with reporting columns. | df: DataFrame to check; checks: List of checks to apply to the DataFrame. Each check is an instance of DQRule class. | +| apply_checks | Applies quality checks to the DataFrame and returns a DataFrame with reporting columns. | df: DataFrame to check; checks: List of checks to the DataFrame. Each check is an instance of the DQRule class. | +| apply_checks_and_split | Applies quality checks to the DataFrame and returns valid and invalid (quarantine) DataFrames with reporting columns. | df: DataFrame to check; checks: List of checks to apply to the DataFrame. Each check is an instance of the DQRule class. | | apply_checks_by_metadata | Applies quality checks defined as a dictionary to the DataFrame and returns a DataFrame with reporting columns. | df: DataFrame to check. checks: List of dictionaries describing checks; custom_check_functions: Optional dictionary with custom check functions (e.g., globals() of the calling module). | | apply_checks_by_metadata_and_split | Applies quality checks defined as a dictionary and returns valid and invalid (quarantine) DataFrames. | df: DataFrame to check; checks: List of dictionaries describing checks; custom_check_functions: Optional dictionary with custom check functions (e.g., globals() of the calling module). | | validate_checks | Validates the provided quality checks to ensure they conform to the expected structure and types. | checks: List of checks to validate; custom_check_functions: Optional dictionary of custom check functions that can be used. | @@ -41,3 +52,4 @@ The following table outlines the available methods of the DQEngine and their fun | save_checks_in_workspace_file | Saves checks to a file (YAML) in the Databricks workspace. | checks: List of checks to save; workspace_path: Destination path for the checks file in the workspace. | | save_checks_in_installation | Saves checks to the installation folder as a YAML file. | checks: List of checks to save; run_config_name: Name of the run config to use; assume_user: If True, assume user installation. | | load_run_config | Loads run configuration from the installation folder. | run_config_name: Name of the run config to use; assume_user: If True, assume user installation. | +
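As a brief illustration of the metadata-driven methods from the table (a sketch assuming an existing `input_df`; the single `is_not_null` check on `col1` is only an example):

```python
import yaml

from databricks.sdk import WorkspaceClient
from databricks.labs.dqx.engine import DQEngine

dq_engine = DQEngine(WorkspaceClient())

# Checks expressed as metadata (see the quality rules reference for the full syntax).
checks = yaml.safe_load("""
- criticality: error
  check:
    function: is_not_null
    arguments:
      col_name: col1
""")

# Validate the structure of the checks before applying them.
status = dq_engine.validate_checks(checks)
print(status)

# Apply the checks and report issues as additional columns.
valid_and_quarantined_df = dq_engine.apply_checks_by_metadata(input_df, checks)
```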
diff --git a/docs/dqx/docs/reference/quality_rules.mdx b/docs/dqx/docs/reference/quality_rules.mdx index 25ca722d..d0420473 100644 --- a/docs/dqx/docs/reference/quality_rules.mdx +++ b/docs/dqx/docs/reference/quality_rules.mdx @@ -1,52 +1,491 @@ +import Admonition from '@theme/Admonition'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + # Quality rules -This page provides a reference for the quality rule functions (checks) available in DQX. - -## Quality rule functions (checks) - -The following quality rules / functions are currently available: - -| Check | Description | Arguments | -| -------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| is_not_null | Check if input column is not null | col_name: column name to check | -| is_not_empty | Check if input column is not empty | col_name: column name to check | -| is_not_null_and_not_empty | Check if input column is not null or empty | col_name: column name to check; trim_strings: boolean flag to trim spaces from strings | -| value_is_in_list | Check if the provided value is present in the input column. | col_name: column name to check; allowed: list of allowed values | -| value_is_not_null_and_is_in_list | Check if provided value is present if the input column is not null | col_name: column name to check; allowed: list of allowed values | -| is_not_null_and_not_empty_array | Check if input array column is not null or empty | col_name: column name to check | -| is_in_range | Check if input column is in the provided range (inclusive of both boundaries) | col_name: column name to check; min_limit: min limit value; max_limit: max limit value; min_limit_col_expr: min limit column name or expr; max_limit_col_expr: max limit column name or expr | -| is_not_in_range | Check if input column is not within defined range (inclusive of both boundaries) | col_name: column name to check; min_limit: min limit value; max_limit: max limit value; min_limit_col_expr: min limit column name or expr; max_limit_col_expr: max limit column name or expr | -| not_less_than | Check if input column is not less than the provided limit | col_name: column name to check; limit: limit value | -| not_greater_than | Check if input column is not greater than the provided limit | col_name: column name to check; limit: limit value | -| is_valid_date | Check if input column is a valid date | col_name: column name to check; date_format: date format (e.g. 'yyyy-mm-dd') | -| is_valid_timestamp | Check if input column is a valid timestamp | col_name: column name to check; timestamp_format: timestamp format (e.g. 
'yyyy-mm-dd HH:mm:ss') | -| not_in_future | Check if input column defined as date is not in the future (future defined as current_timestamp + offset) | col_name: column name to check; offset: offset to use; curr_timestamp: current timestamp, if not provided current_timestamp() function is used | -| not_in_near_future | Check if input column defined as date is not in the near future (near future defined as grater than current timestamp but less than current timestamp + offset) | col_name: column name to check; offset: offset to use; curr_timestamp: current timestamp, if not provided current_timestamp() function is used | -| is_older_than_n_days | Check if input column is older than n number of days | col_name: column name to check; days: number of days; curr_date: current date, if not provided current_date() function is used | -| is_older_than_col2_for_n_days | Check if one column is not older than another column by n number of days | col_name1: first column name to check; col_name2: second column name to check; days: number of days | -| regex_match | Check if input column matches a given regex | col_name: column name to check; regex: regex to check; negate: if the condition should be negated (true) or not | -| sql_expression | Check if input column is matches the provided sql expression, eg. a = 'str1', a > b | expression: sql expression to check; msg: optional message to output; name: optional name of the resulting column; negate: if the condition should be negated | - -You can check implementation details of the rules [here](https://github.com/databrickslabs/dqx/blob/main/src/databricks/labs/dqx/col_functions.py). - -## Apply filters on checks +This page provides a reference for the quality checks (rule functions) available in DQX. + +## Row-level quality checks + +The following row-level checks are currently available in DQX. +These checks are applied to each row of a PySpark DataFrame and generate issue reports as additional columns. +You can also define your own custom checks (see [Creating custom checks](#creating-custom-checks)). + +
+**Available row-level checks** +| Check | Description | Arguments | +| -------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| is_not_null | Checks whether the values in the input column are not null. | col_name: column name to check | +| is_not_empty | Checks whether the values in the input column are not empty (but may be null). | col_name: column name to check | +| is_not_null_and_not_empty | Checks whether the values in the input column are not null and not empty. | col_name: column name to check; trim_strings: optional boolean flag to trim spaces from strings | +| is_in_list | Checks whether the values in the input column are present in the list of allowed values (null values are allowed). | col_name: column name to check; allowed: list of allowed values | +| is_not_null_and_is_in_list | Checks whether the values in the input column are not null and present in the list of allowed values. | col_name: column name to check; allowed: list of allowed values | +| is_not_null_and_not_empty_array | Checks whether the values in the array input column are not null and not empty. | col_name: column name to check | +| is_in_range | Checks whether the values in the input column are in the provided range (inclusive of both boundaries). | col_name: column name to check; min_limit: min limit as number, date, timestamp, column name or expression; max_limit: max limit as number, date, timestamp, column name or expression | +| is_not_in_range | Checks whether the values in the input column are outside the provided range (inclusive of both boundaries). | col_name: column name to check; min_limit: min limit as number, date, timestamp, column name or expression; max_limit: max limit as number, date, timestamp, column name or expression | +| is_not_less_than | Checks whether the values in the input column are not less than the provided limit. | col_name: column name to check; limit: limit as number, date, timestamp, column name or expression | +| is_not_greater_than | Checks whether the values in the input column are not greater than the provided limit. | col_name: column name to check; limit: limit as number, date, timestamp, column name or expression | +| is_valid_date | Checks whether the values in the input column have valid date formats. | col_name: column name to check; date_format: optional date format (e.g. 'yyyy-mm-dd') | +| is_valid_timestamp | Checks whether the values in the input column have valid timestamp formats. | col_name: column name to check; timestamp_format: optional timestamp format (e.g. 'yyyy-mm-dd HH:mm:ss') | +| is_not_in_future | Checks whether the values in the input column contain a timestamp that is not in the future, where 'future' is defined as current_timestamp + offset (in seconds). | col_name: column name to check; offset: offset to use; curr_timestamp: current timestamp, if not provided current_timestamp() function is used | +| is_not_in_near_future | Checks whether the values in the input column contain a timestamp that is not in the near future, where 'near future' is defined as greater than the current timestamp but less than the current_timestamp + offset (in seconds). 
| col_name: column name to check; offset: offset to use; curr_timestamp: current timestamp, if not provided current_timestamp() function is used | +| is_older_than_n_days | Checks whether the values in one input column are at least N days older than the values in another column. | col_name: column name to check; days: number of days; curr_date: current date, if not provided current_date() function is used | +| is_older_than_col2_for_n_days | Checks whether the values in one input column are at least N days older than the values in another column. | col_name1: first column name to check; col_name2: second column name to check; days: number of days | +| is_unique | Checks whether the values in the input column are unique and reports an issue for each row that contains a duplicate value. Null values are not considered duplicates, following the ANSI SQL standard. | col_name: column name to check: window_spec: optional window specification as a string or column object, you must handle NULLs correctly using coalesce() to prevent rows exclusion | +| regex_match | Checks whether the values in the input column match a given regex. | col_name: column name to check; regex: regex to check; negate: if the condition should be negated (true) or not | +| sql_expression | Checks whether the values meet the condition provided as an SQL expression, e.g. a = 'str1', a > b | expression: sql expression to check; msg: optional message to output; name: optional name of the resulting column; negate: if the condition should be negated | +
+ +You can explore the implementation details of the rules [here](https://github.com/databrickslabs/dqx/blob/main/src/databricks/labs/dqx/col_functions.py). +If you have a custom check that could be broadly useful, feel free to submit a PR to [DQX](https://github.com/databrickslabs/dqx) (see the [contribution guide](/docs/dev/contributing) for details). + +### Usage examples of row-level checks + +Below are fully specified examples of how to use each check in YAML format and with DQX classes. Both are equivalent and can be used interchangeably. + +The `criticality` field can be either "error" (data goes only into the 'bad' or 'quarantine' DataFrame) or "warn" (data goes into both DataFrames). +For brevity, the `name` field in the examples is omitted, meaning it will be auto-generated in the results. + +
+**Checks define in YAML** +```yaml +# is_not_null check +- criticality: error + check: + function: is_not_null + arguments: + col_name: col1 + +# is_not_empty check +- criticality: error + check: + function: is_not_empty + arguments: + col_name: col1 + +# is_not_null_and_not_empty check +- criticality: error + check: + function: is_not_null_and_not_empty + arguments: + col_name: col1 + trim_strings: true + +# is_in_list check +- criticality: error + check: + function: is_in_list + arguments: + col_name: col2 + allowed: + - 1 + - 2 + - 3 + +# is_not_null_and_is_in_list check +- criticality: error + check: + function: is_not_null_and_is_in_list + arguments: + col_name: col2 + allowed: + - 1 + - 2 + - 3 + +# is_not_null_and_not_empty_array check +- criticality: error + check: + function: is_not_null_and_not_empty_array + arguments: + col_name: col4 + +# is_in_range check +- criticality: error + check: + function: is_in_range + arguments: + col_name: col2 + min_limit: 1 + max_limit: 10 +- criticality: error + check: + function: is_in_range + arguments: + col_name: col5 + min_limit: 2025-01-01 + max_limit: 2025-02-24 +- criticality: error + check: + function: is_in_range + arguments: + col_name: col6 + min_limit: 2025-01-01 00:00:00 + max_limit: 2025-02-24 01:00:00 +- criticality: error + check: + function: is_in_range + arguments: + col_name: col3 + min_limit: col2 + max_limit: col2 * 2 + +# is_not_in_range check +- criticality: error + check: + function: is_not_in_range + arguments: + col_name: col2 + min_limit: 11 + max_limit: 20 +- criticality: error + check: + function: is_not_in_range + arguments: + col_name: col5 + min_limit: 2025-02-25 + max_limit: 2025-02-26 +- criticality: error + check: + function: is_not_in_range + arguments: + col_name: col6 + min_limit: 2025-02-25 00:00:00 + max_limit: 2025-02-26 01:00:00 +- criticality: error + check: + function: is_not_in_range + arguments: + col_name: col3 + min_limit: col2 + 10 + max_limit: col2 * 10 + +# is_not_less_than check +- criticality: error + check: + function: is_not_less_than + arguments: + col_name: col2 + limit: 0 +- criticality: error + check: + function: is_not_less_than + arguments: + col_name: col5 + limit: 2025-01-01 +- criticality: error + check: + function: is_not_less_than + arguments: + col_name: col6 + limit: 2025-01-01 01:00:00 +- criticality: error + check: + function: is_not_less_than + arguments: + col_name: col3 + limit: col2 - 10 + +# is_not_greater_than check +- criticality: error + check: + function: is_not_greater_than + arguments: + col_name: col2 + limit: 10 +- criticality: error + check: + function: is_not_greater_than + arguments: + col_name: col5 + limit: 2025-03-01 +- criticality: error + check: + function: is_not_greater_than + arguments: + col_name: col6 + limit: 2025-03-24 01:00:00 +- criticality: error + check: + function: is_not_greater_than + arguments: + col_name: col3 + limit: col2 + 10 + +# is_valid_date check +- criticality: error + check: + function: is_valid_date + arguments: + col_name: col5 +- criticality: error + name: col5_is_not_valid_date2 + check: + function: is_valid_date + arguments: + col_name: col5 + date_format: yyyy-MM-dd + +# is_valid_timestamp check +- criticality: error + check: + function: is_valid_timestamp + arguments: + col_name: col6 + timestamp_format: yyyy-MM-dd HH:mm:ss +- criticality: error + name: col6_is_not_valid_timestamp2 + check: + function: is_valid_timestamp + arguments: + col_name: col6 + +# is_not_in_future check +- criticality: error + check: + function: 
is_not_in_future + arguments: + col_name: col6 + offset: 86400 + +# is_not_in_near_future check +- criticality: error + check: + function: is_not_in_near_future + arguments: + col_name: col6 + offset: 36400 + +# is_older_than_n_days check +- criticality: error + check: + function: is_older_than_n_days + arguments: + col_name: col5 + days: 10000 + +# is_older_than_col2_for_n_days check +- criticality: error + check: + function: is_older_than_col2_for_n_days + arguments: + col_name1: col5 + col_name2: col6 + days: 2 + +# is_unique check +- criticality: error + check: + function: is_unique + arguments: + col_name: col1 + +# is_unique check with custom window +# default value for NULL in the time column of the window spec must be provided using coalesce() to prevent rows exclusion! +- criticality: error + name: col1_is_not_unique2 + check: + function: is_unique + arguments: + col_name: col1 + window_spec: window(coalesce(col6, '1970-01-01'), '10 minutes') + +# regex_match check +- criticality: error + check: + function: regex_match + arguments: + col_name: col2 + regex: '[0-9]+' + negate: false + +# sql_expression check +- criticality: error + check: + function: sql_expression + arguments: + expression: col3 > col2 and col3 < 10 + msg: col3 is greater than col2 and col3 less than 10 + name: custom_output_name + negate: false +``` +
+ +
+**Checks defined using DQX classes** +```python +from databricks.labs.dqx.col_functions import * +from datetime import datetime + +checks = [ + DQRule( + criticality="error", + check=is_not_null("col1") + ), + DQRule( + criticality="error", + check=is_not_empty("col1") + ), + DQRule( + criticality="error", + check=is_not_null_and_not_empty("col1", trim_strings=True) + ), + DQRule( + criticality="error", + check=is_in_list("col2", [1, 2, 3]) + ), + DQRule( + criticality="error", + check=is_not_null_and_is_in_list("col2", [1, 2, 3]) + ), + DQRule( + criticality="error", + check=is_not_null_and_not_empty_array("col4") + ), + DQRule( + criticality="error", + check=is_in_range("col2", min_limit=1, max_limit=10) + ), + DQRule( + criticality="error", + check=is_in_range("col5", min_limit=datetime(2025, 1, 1).date(), max_limit=datetime(2025, 2, 24).date()) + ), + DQRule( + criticality="error", + check=is_in_range("col6", min_limit=datetime(2025, 1, 1, 0, 0, 0), max_limit=datetime(2025, 2, 24, 1, 0, 0)) + ), + DQRule( + criticality="error", + check=is_in_range("col3", min_limit="col2", max_limit="col2 * 2") + ), + DQRule( + criticality="error", + check=is_not_in_range("col2", min_limit=11, max_limit=20) + ), + DQRule( + criticality="error", + check=is_not_in_range("col5", min_limit=datetime(2025, 2, 25).date(), max_limit=datetime(2025, 2, 26).date()) + ), + DQRule( + criticality="error", + check=is_not_in_range("col6", min_limit=datetime(2025, 2, 25, 0, 0, 0), max_limit=datetime(2025, 2, 26, 1, 0, 0)) + ), + DQRule( + criticality="error", + check=is_not_in_range("col3", min_limit="col2 + 10", max_limit="col2 * 10") + ), + DQRule( + criticality="error", + check=is_not_less_than("col2", limit=0) + ), + DQRule( + criticality="error", + check=is_not_less_than("col5", limit=datetime(2025, 1, 1).date()) + ), + DQRule( + criticality="error", + check=is_not_less_than("col6", limit=datetime(2025, 1, 1, 1, 0, 0)) + ), + DQRule( + criticality="error", + check=is_not_less_than("col3", limit="col2 - 10") + ), + DQRule( + criticality="error", + check=is_not_greater_than("col2", limit=10) + ), + DQRule( + criticality="error", + check=is_not_greater_than("col5", limit=datetime(2025, 3, 1).date()) + ), + DQRule( + criticality="error", + check=is_not_greater_than("col6", limit=datetime(2025, 3, 24, 1, 0, 0)) + ), + DQRule( + criticality="error", + check=is_not_greater_than("col3", limit="col2 + 10") + ), + DQRule( + criticality="error", + check=is_valid_date("col5") + ), + DQRule( + criticality="error", + check=is_valid_date("col5", date_format="yyyy-MM-dd"), + name="col5_is_not_valid_date2" + ), + DQRule( + criticality="error", + check=is_valid_timestamp("col6") + ), + DQRule( + criticality="error", + check=is_valid_timestamp("col6", timestamp_format="yyyy-MM-dd HH:mm:ss"), + name="col6_is_not_valid_timestamp2" + ), + DQRule( + criticality="error", + check=is_not_in_future("col6", offset=86400) + ), + DQRule( + criticality="error", + check=is_not_in_near_future("col6", offset=36400) + ), + DQRule( + criticality="error", + check=is_older_than_n_days("col5", days=10000) + ), + DQRule( + criticality="error", + check=is_older_than_col2_for_n_days("col5", "col6", days=2) + ), + DQRule( + criticality="error", + check=is_unique("col1") + ), + DQRule( + criticality="error", + name="col1_is_not_unique2", + # default value for NULL in the time column of the window spec must be provided using coalesce() to prevent rows exclusion! 
+ check=is_unique("col1", window_spec=F.window(F.coalesce(F.col("col6"), F.lit(datetime(1970, 1, 1))), "10 minutes")) + ), + DQRule( + criticality="error", + check=regex_match("col2", regex="[0-9]+", negate=False) + ), + DQRule( + criticality="error", + check=sql_expression( + expression="col3 > col2 and col3 < 10", + msg="col3 is greater than col2 and col3 less than 10", + name="custom_output_name", + negate=False + ) + ), +] +``` +
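To actually run the class-based checks defined above, a short sketch (assuming an existing `input_df`; `_errors` and `_warnings` are the default reporting column names used elsewhere in these docs unless overridden via `ExtraParams`):

```python
from databricks.sdk import WorkspaceClient
from databricks.labs.dqx.engine import DQEngine

dq_engine = DQEngine(WorkspaceClient())

# Split valid rows from quarantined rows using the `checks` list defined above.
valid_df, quarantined_df = dq_engine.apply_checks_and_split(input_df, checks)

# Inspect which checks failed for the quarantined rows via the default reporting columns.
quarantined_df.select("_errors", "_warnings").show(truncate=False)
```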
+ +## Applying filters on checks You can apply checks to a part of the DataFrame by using a `filter`. -For example, to ensure that a column `a` is not null only when a column `b` is positive, you can define the check as follows: +For example, to check that a column `a` is not null only when a column `b` is positive, you can define the check as follows: ```yaml - criticality: error - filter: b > 0 + filter: col2 > 0 check: function: is_not_null arguments: - col_name: a + col_name: col1 ``` -## Creating your own checks +## Creating custom checks -### Use sql expression +### Custom checks with SQL Expression -If a check that you need does not exist in DQX, you can define them using sql expression rule (`sql_expression`), +You can define custom checks using SQL Expression rule (`sql_expression`), for example: ```yaml - criticality: error @@ -57,83 +496,96 @@ for example: msg: col1 ends with 'foo' ``` -Sql expression is also useful if you want to make cross-column validation, for example: +SQL Expressions are also useful if you need to make cross-column validation, for example: ```yaml - criticality: error check: function: sql_expression arguments: - expression: a > b - msg: a is greater than b + expression: col1 > col2 + msg: col1 is greater than col2 ``` -### Define custom check functions +### Custom checks as a Python function -If you need a reusable check or want to implement more complex logic, you can define your own custom check functions. -A check function is a callable that returns a `pyspark.sql.Column`. For example: +If you need a reusable check or want to implement more complex logic which is challenging to implement with SQL, you can define your own custom check functions. +A check function is a callable that returns a `pyspark.sql.Column`. 
-```python -import pyspark.sql.functions as F -from pyspark.sql import Column -from databricks.labs.dqx.col_functions import make_condition +#### Custom check example -def ends_with_foo(col_name: str) -> Column: - column = F.col(col_name) - return make_condition(column.endswith("foo"), f"Column {col_name} ends with foo", f"{col_name}_ends_with_foo") -``` + + + ```python + import pyspark.sql.functions as F + from pyspark.sql import Column + from databricks.labs.dqx.col_functions import make_condition -You can use custom functions directly when defining checks using DQX classes: -```python -import yaml -from databricks.labs.dqx.engine import DQEngine -from databricks.sdk import WorkspaceClient -from databricks.labs.dqx.col_functions import is_not_null + def ends_with_foo(col_name: str) -> Column: + column = F.col(col_name) + return make_condition(column.endswith("foo"), f"Column {col_name} ends with foo", f"{col_name}_ends_with_foo") + ``` + + -checks = [ - DQRule(criticality="error", check=is_not_null("col1")), - DQRule(criticality="error", check=ends_with_foo("col1")), -] +#### Execution of the custom check using DQX classes -dq_engine = DQEngine(WorkspaceClient()) + + + ```python + import yaml + from databricks.labs.dqx.engine import DQEngine + from databricks.sdk import WorkspaceClient + from databricks.labs.dqx.col_functions import is_not_null -# Option 1: apply quality rules on the dataframe and provide valid and invalid (quarantined) dataframes -valid_df, quarantined_df = dq_engine.apply_checks_and_split(input_df, checks) + checks = [ + DQRule(criticality="error", check=is_not_null("col1")), + DQRule(criticality="error", check=ends_with_foo("col1")), + ] -# Option 2: apply quality rules on the dataframe and report issues as additional columns -valid_and_quarantined_df = dq_engine.apply_checks_by_metadata(input_df, checks) -``` + dq_engine = DQEngine(WorkspaceClient()) -You can use custom functions as follows when defining checks using metadata (yaml): -```python -import yaml -from databricks.labs.dqx.engine import DQEngine -from databricks.sdk import WorkspaceClient + # Option 1: apply quality rules on the dataframe and provide valid and invalid (quarantined) dataframes + valid_df, quarantined_df = dq_engine.apply_checks_and_split(input_df, checks) -checks = yaml.safe_load(""" -- criticality: error - check: - function: ends_with_foo - arguments: - col_name: col1 -- criticality: error - check: - function: is_not_null - arguments: - col_name: col1 -""") + # Option 2: apply quality rules on the dataframe and report issues as additional columns + valid_and_quarantined_df = dq_engine.apply_checks_by_metadata(input_df, checks) + ``` + + -dq_engine = DQEngine(WorkspaceClient()) +#### Execution of the custom check using YAML definition -custom_check_functions = {"ends_with_foo": ends_with_foo} # list of custom check functions -#custom_check_functions=globals() # include all functions for simplicity + + + ```python + import yaml + from databricks.labs.dqx.engine import DQEngine + from databricks.sdk import WorkspaceClient -# Option 1: apply quality rules on the dataframe and provide valid and invalid (quarantined) dataframes -valid_df, quarantined_df = dq_engine.apply_checks_by_metadata_and_split(input_df, checks, custom_check_functions) + checks = yaml.safe_load(""" + - criticality: error + check: + function: ends_with_foo + arguments: + col_name: col1 + - criticality: error + check: + function: is_not_null + arguments: + col_name: col1 + """) -# Option 2: apply quality rules on the dataframe 
and report issues as additional columns -valid_and_quarantined_df = dq_engine.apply_checks_by_metadata(input_df, checks, custom_check_functions) -``` + dq_engine = DQEngine(WorkspaceClient()) + + custom_check_functions = {"ends_with_foo": ends_with_foo} # list of custom check functions + # or include all functions with globals() for simplicity + #custom_check_functions=globals() -You can see all existing DQX checks [here](https://github.com/databrickslabs/dqx/blob/main/src/databricks/labs/dqx/col_functions.py). + # Option 1: apply quality rules on the dataframe and provide valid and invalid (quarantined) dataframes + valid_df, quarantined_df = dq_engine.apply_checks_by_metadata_and_split(input_df, checks, custom_check_functions) -Feel free to submit a PR to DQX with your own check so that other can benefit from it (see [contribution guide](/docs/dev/contributing)). + # Option 2: apply quality rules on the dataframe and report issues as additional columns + valid_and_quarantined_df = dq_engine.apply_checks_by_metadata(input_df, checks, custom_check_functions) + ``` + + diff --git a/docs/dqx/docs/reference/testing.mdx b/docs/dqx/docs/reference/testing.mdx index 79533d63..8f2f9345 100644 --- a/docs/dqx/docs/reference/testing.mdx +++ b/docs/dqx/docs/reference/testing.mdx @@ -1,3 +1,7 @@ +import Admonition from '@theme/Admonition'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + # Testing Applications Using DQX ## Standard testing with DQEngine @@ -5,42 +9,47 @@ Testing applications that use DQEngine requires proper initialization of the Databricks workspace client. Detailed guidance on authentication for the workspace client is available [here](https://databricks-sdk-py.readthedocs.io/en/latest/authentication.html#default-authentication-flow). For testing, we recommend: -* [pytester fixtures](https://github.com/databrickslabs/pytester) to setup Databricks remote Spark session and workspace client. For pytester to be able to authenticate to a workspace you need to use [debug_env_name fixture](https://github.com/databrickslabs/pytester?tab=readme-ov-file#debug_env_name-fixture). We recommend using the `~/.databricks/debug-env.json` file to store different sets of environment variables (see more details below). +* [pytester fixtures](https://github.com/databrickslabs/pytester) is used to set up Databricks remote Spark session and workspace client. For pytester to be able to authenticate to a workspace, you need to use [debug_env_name fixture](https://github.com/databrickslabs/pytester?tab=readme-ov-file#debug_env_name-fixture). We recommend using the `~/.databricks/debug-env.json` file to store different sets of environment variables (see more details below). * [chispa](https://github.com/MrPowers/chispa) for asserting Spark DataFrames. -These libraries are also used internally for testing DQX. +These libraries are also used internally to test DQX. -Example test: -```python -from chispa.dataframe_comparer import assert_df_equality -from databricks.labs.dqx.col_functions import is_not_null_and_not_empty -from databricks.labs.dqx.engine import DQEngine -from databricks.labs.dqx.rule import DQRule +Below is an example test. 
+ + + ```python + from chispa.dataframe_comparer import assert_df_equality + from databricks.labs.dqx.col_functions import is_not_null_and_not_empty + from databricks.labs.dqx.engine import DQEngine + from databricks.labs.dqx.rule import DQRule -@pytest.fixture -def debug_env_name(): - return "ws" # Specify the name of the target environment from ~/.databricks/debug-env.json + @pytest.fixture + def debug_env_name(): + return "ws" # Specify the name of the target environment from ~/.databricks/debug-env.json -def test_dq(ws, spark): # use ws and spark pytester fixtures to initialize workspace client and spark session - schema = "a: int, b: int, c: int" - expected_schema = schema + ", _errors: map, _warnings: map" - test_df = spark.createDataFrame([[1, 3, 3]], schema) - checks = [ - DQRule(name="col_a_is_null_or_empty", criticality="warn", check=is_not_null_and_not_empty("a")), - DQRule(name="col_b_is_null_or_empty", criticality="error", check=is_not_null_and_not_empty("b")), - ] + def test_dq(ws, spark): # use ws and spark pytester fixtures to initialize workspace client and spark session + schema = "a: int, b: int, c: int" + expected_schema = schema + ", _errors: map, _warnings: map" + test_df = spark.createDataFrame([[1, 3, 3]], schema) - dq_engine = DQEngine(ws) - df = dq_engine.apply_checks(test_df, checks) + checks = [ + DQRule(name="col_a_is_null_or_empty", criticality="warn", check=is_not_null_and_not_empty("a")), + DQRule(name="col_b_is_null_or_empty", criticality="error", check=is_not_null_and_not_empty("b")), + ] - expected_df = spark.createDataFrame([[1, 3, 3, None, None]], expected_schema) - assert_df_equality(df, expected_df) -``` + dq_engine = DQEngine(ws) + df = dq_engine.apply_checks(test_df, checks) + + expected_df = spark.createDataFrame([[1, 3, 3, None, None]], expected_schema) + assert_df_equality(df, expected_df) + ``` + + -### Setting up Databricks workspace client authentication in a terminal +### Setting up Databricks Workspace Client authentication in a terminal If you want to run the tests from your local machine in the terminal, you need to set up the following environment variables: ```shell @@ -58,7 +67,7 @@ export DATABRICKS_SERVERLESS_COMPUTE_ID=auto We recommend using [OAuth access token](https://docs.databricks.com/en/dev-tools/auth/oauth-m2m.html) generated for a service principal to authenticate with Databricks as presented above. Alternatively, you can authenticate using [PAT token](https://docs.databricks.com/en/dev-tools/auth/pat.html) by setting the `DATABRICKS_TOKEN` environment variable. However, we do not recommend this method, as it is less secure than OAuth. -### Setting up Databricks workspace client authentication in an IDE +### Setting up Databricks Workspace Client authentication in an IDE If you want to run the tests from your IDE, you must setup `.env` or `~/.databricks/debug-env.json` file (see [instructions](https://github.com/databrickslabs/pytester?tab=readme-ov-file#debug_env_name-fixture)). @@ -78,7 +87,7 @@ Create the `~/.databricks/debug-env.json` with the following content, replacing } ``` -You must provide an existing cluster which will be auto-started for you as part of the tests. +You must provide an existing cluster. It will auto-start for you as part of the tests. We recommend using [OAuth access token](https://docs.databricks.com/en/dev-tools/auth/oauth-m2m.html) generated for a service principal to authenticate with Databricks as presented above. 
Alternatively, you can authenticate using [PAT token](https://docs.databricks.com/en/dev-tools/auth/pat.html) by providing the `DATABRICKS_TOKEN` field. However, we do not recommend this method, as it is less secure than OAuth. @@ -97,44 +106,50 @@ To run the integration tests on serverless compute, add the `DATABRICKS_SERVERLE } } ``` -When `DATABRICKS_SERVERLESS_COMPUTE_ID` is set the `DATABRICKS_CLUSTER_ID` is ignored, and tests run on serverless compute. +When `DATABRICKS_SERVERLESS_COMPUTE_ID` is set, the `DATABRICKS_CLUSTER_ID` is ignored, and tests run on serverless compute. ### Local testing with DQEngine If workspace-level access is unavailable in your testing environment, you can perform local testing by installing the latest `pyspark` package and mocking the workspace client. +Below is an example test. -**Note: This approach should be treated as experimental!** It does not offer the same level of testing as the standard approach and it is only applicable to selected methods. + +**This approach should be treated as experimental!** It does not offer the same level of testing as the standard approach, and it is only applicable to selected methods. We strongly recommend following the standard testing procedure outlined above, which includes proper initialization of the workspace client. - -Example test: -```python -from unittest.mock import MagicMock -from databricks.sdk import WorkspaceClient -from pyspark.sql import SparkSession -from chispa.dataframe_comparer import assert_df_equality -from databricks.labs.dqx.col_functions import is_not_null_and_not_empty -from databricks.labs.dqx.engine import DQEngine -from databricks.labs.dqx.rule import DQRule - - -def test_dq(): - spark = SparkSession.builder.master("local[*]").getOrCreate() # create spark local session - ws = MagicMock(spec=WorkspaceClient, **{"catalogs.list.return_value": []}) # mock the workspace client - - schema = "a: int, b: int, c: int" - expected_schema = schema + ", _errors: map, _warnings: map" - test_df = spark.createDataFrame([[1, None, 3]], schema) - - checks = [ - DQRule(name="col_a_is_null_or_empty", criticality="warn", check=is_not_null_and_not_empty("a")), - DQRule(name="col_b_is_null_or_empty", criticality="error", check=is_not_null_and_not_empty("b")), - ] - - dq_engine = DQEngine(ws) - df = dq_engine.apply_checks(test_df, checks) - - expected_df = spark.createDataFrame( - [[1, None, 3, {"col_b_is_null_or_empty": "Column b is null or empty"}, None]], expected_schema - ) - assert_df_equality(df, expected_df) -``` + + + + + ```python + from unittest.mock import MagicMock + from databricks.sdk import WorkspaceClient + from pyspark.sql import SparkSession + from chispa.dataframe_comparer import assert_df_equality + from databricks.labs.dqx.col_functions import is_not_null_and_not_empty + from databricks.labs.dqx.engine import DQEngine + from databricks.labs.dqx.rule import DQRule + + + def test_dq(): + spark = SparkSession.builder.master("local[*]").getOrCreate() # create spark local session + ws = MagicMock(spec=WorkspaceClient, **{"catalogs.list.return_value": []}) # mock the workspace client + + schema = "a: int, b: int, c: int" + expected_schema = schema + ", _errors: map, _warnings: map" + test_df = spark.createDataFrame([[1, None, 3]], schema) + + checks = [ + DQRule(name="col_a_is_null_or_empty", criticality="warn", check=is_not_null_and_not_empty("a")), + DQRule(name="col_b_is_null_or_empty", criticality="error", check=is_not_null_and_not_empty("b")), + ] + + dq_engine = DQEngine(ws) + df = 
dq_engine.apply_checks(test_df, checks) + + expected_df = spark.createDataFrame( + [[1, None, 3, {"col_b_is_null_or_empty": "Column b is null or empty"}, None]], expected_schema + ) + assert_df_equality(df, expected_df) + ``` + + diff --git a/docs/dqx/src/pages/index.tsx b/docs/dqx/src/pages/index.tsx index f22582f7..8c74ecde 100644 --- a/docs/dqx/src/pages/index.tsx +++ b/docs/dqx/src/pages/index.tsx @@ -7,7 +7,7 @@ const CallToAction = () => { return (

- Improve your data quality now 🚀 + Improve your Data Quality now 🚀

Follow our comprehensive guide to get up and running with DQX in no time. @@ -33,11 +33,11 @@ const Capabilities = () => { }, { title: 'Data Format Agnostic', - description: 'Works seamlessly with Spark DataFrames.', + description: 'Works seamlessly with PySpark DataFrames.', icon: FileText, }, { - title: 'Spark Batch & Streaming Support', + title: 'Spark Batch & Spark Structured Streaming Support', description: 'Includes Delta Live Tables (DLT) integration.', icon: Activity, }, @@ -57,8 +57,8 @@ const Capabilities = () => { icon: Grid, }, { - title: 'Profiling & Rule Generation', - description: 'Automatically profile and generate data quality rule candidates.', + title: 'Profiling & Quality Rules Generation', + description: 'Automatically profile input data and generate data quality rule candidates.', icon: BarChart2, }, { @@ -67,7 +67,7 @@ const Capabilities = () => { icon: Code, }, { - title: 'Validation Summary & Dashboard', + title: 'Validation Summary & Quality Dashboard', description: 'Track and identify data quality issues effectively.', icon: PieChart, }, @@ -116,7 +116,7 @@ const Hero = () => {

DQX is a data quality framework for Apache Spark that enables you to define, monitor, and - react to data quality issues in your data pipelines. + address data quality issues in your Python-based data pipelines.

{/* Call to Action Buttons */} diff --git a/src/databricks/labs/dqx/col_functions.py b/src/databricks/labs/dqx/col_functions.py index 43f18c6f..f8dbfdfd 100644 --- a/src/databricks/labs/dqx/col_functions.py +++ b/src/databricks/labs/dqx/col_functions.py @@ -3,6 +3,7 @@ import pyspark.sql.functions as F from pyspark.sql import Column +from pyspark.sql.window import Window def make_condition(condition: Column, message: Column | str, alias: str) -> Column: @@ -22,13 +23,8 @@ def make_condition(condition: Column, message: Column | str, alias: str) -> Colu return (F.when(condition, msg_col).otherwise(F.lit(None).cast("string"))).alias(_cleanup_alias_name(alias)) -def _cleanup_alias_name(col_name: str) -> str: - # avoid issues with structs - return col_name.replace(".", "_") - - -def is_not_null_and_not_empty(col_name: str, trim_strings: bool = False) -> Column: - """Creates a condition column to check if value is null or empty. +def is_not_null_and_not_empty(col_name: str, trim_strings: bool | None = False) -> Column: + """Checks whether the values in the input column are not null and not empty. :param col_name: column name to check :param trim_strings: boolean flag to trim spaces from strings @@ -42,7 +38,7 @@ def is_not_null_and_not_empty(col_name: str, trim_strings: bool = False) -> Colu def is_not_empty(col_name: str) -> Column: - """Creates a condition column to check if value is empty (but could be null). + """Checks whether the values in the input column are not empty (but may be null). :param col_name: column name to check :return: Column object for condition @@ -53,7 +49,7 @@ def is_not_empty(col_name: str) -> Column: def is_not_null(col_name: str) -> Column: - """Creates a condition column to check if value is null. + """Checks whether the values in the input column are not null. :param col_name: column name to check :return: Column object for condition @@ -62,13 +58,16 @@ def is_not_null(col_name: str) -> Column: return make_condition(column.isNull(), f"Column {col_name} is null", f"{col_name}_is_null") -def value_is_not_null_and_is_in_list(col_name: str, allowed: list) -> Column: - """Creates a condition column to check if value is null or not in the list of allowed values. +def is_not_null_and_is_in_list(col_name: str, allowed: list) -> Column: + """Checks whether the values in the input column are not null and present in the list of allowed values. :param col_name: column name to check :param allowed: list of allowed values (actual values or Column objects) :return: Column object for condition """ + if not allowed: + raise ValueError("allowed list is not provided.") + allowed_cols = [item if isinstance(item, Column) else F.lit(item) for item in allowed] column = F.col(col_name) condition = column.isNull() | ~column.isin(*allowed_cols) @@ -78,21 +77,25 @@ def value_is_not_null_and_is_in_list(col_name: str, allowed: list) -> Column: "", F.lit("Value "), F.when(column.isNull(), F.lit("null")).otherwise(column.cast("string")), - F.lit(" is not in the allowed list: ["), + F.lit(" is null or not in the allowed list: ["), F.concat_ws(", ", *allowed_cols), F.lit("]"), ), - f"{col_name}_value_is_not_in_the_list", + f"{col_name}_is_null_or_is_not_in_the_list", ) -def value_is_in_list(col_name: str, allowed: list) -> Column: - """Creates a condition column to check if value not in the list of allowed values (could be null). 
+def is_in_list(col_name: str, allowed: list) -> Column: + """Checks whether the values in the input column are present in the list of allowed values + (null values are allowed). :param col_name: column name to check :param allowed: list of allowed values (actual values or Column objects) :return: Column object for condition """ + if not allowed: + raise ValueError("allowed list is not provided.") + allowed_cols = [item if isinstance(item, Column) else F.lit(item) for item in allowed] column = F.col(col_name) condition = ~column.isin(*allowed_cols) @@ -106,7 +109,7 @@ def value_is_in_list(col_name: str, allowed: list) -> Column: F.concat_ws(", ", *allowed_cols), F.lit("]"), ), - f"{col_name}_value_is_not_in_the_list", + f"{col_name}_is_not_in_the_list", ) @@ -114,7 +117,7 @@ def value_is_in_list(col_name: str, allowed: list) -> Column: def sql_expression(expression: str, msg: str | None = None, name: str | None = None, negate: bool = False) -> Column: - """Creates a condition column from the SQL expression. + """Checks whether the condition provided as an SQL expression is met. :param expression: SQL expression :param msg: optional message of the `Column` type, automatically generated if None @@ -137,8 +140,8 @@ def sql_expression(expression: str, msg: str | None = None, name: str | None = N return make_condition(expr_col, F.concat_ws("", F.lit(f"Value matches expression: {expression_msg}")), name) -def is_older_than_col2_for_n_days(col_name1: str, col_name2: str, days: int) -> Column: - """Creates a condition column for case when one date or timestamp column is older than another column by N days. +def is_older_than_col2_for_n_days(col_name1: str, col_name2: str, days: int = 0) -> Column: + """Checks whether the values in one input column are at least N days older than the values in another column. :param col_name1: first column :param col_name2: second column @@ -159,13 +162,12 @@ def is_older_than_col2_for_n_days(col_name1: str, col_name2: str, days: int) -> col2_date, F.lit(f"' for more than {days} days"), ), - f"is_col_{col_name1}_older_than_{col_name2}_for_N_days", + f"is_col_{col_name1}_older_than_{col_name2}_for_n_days", ) def is_older_than_n_days(col_name: str, days: int, curr_date: Column | None = None) -> Column: - """Creates a condition column for case when specified date or timestamp column is older (compared to current date) - than N days. + """Checks whether the values in the input column are at least N days older than the current date. :param col_name: name of the column to check :param days: number of days @@ -188,13 +190,13 @@ def is_older_than_n_days(col_name: str, days: int, curr_date: Column | None = No curr_date, F.lit(f"' for more than {days} days"), ), - f"is_col_{col_name}_older_than_N_days", + f"is_col_{col_name}_older_than_n_days", ) -def not_in_future(col_name: str, offset: int = 0, curr_timestamp: Column | None = None) -> Column: - """Creates a condition column that checks if specified date or timestamp column is in the future. - Future is considered as grater than current timestamp plus `offset` seconds. +def is_not_in_future(col_name: str, offset: int = 0, curr_timestamp: Column | None = None) -> Column: + """Checks whether the values in the input column contain a timestamp that is not in the future, + where 'future' is defined as current_timestamp + offset (in seconds). 
:param col_name: column name :param offset: offset (in seconds) to add to the current timestamp at time of execution @@ -216,9 +218,10 @@ def not_in_future(col_name: str, offset: int = 0, curr_timestamp: Column | None ) -def not_in_near_future(col_name: str, offset: int = 0, curr_timestamp: Column | None = None) -> Column: - """Creates a condition column that checks if specified date or timestamp column is in the near future. - Near future is considered as grater than current timestamp but less than current timestamp plus `offset` seconds. +def is_not_in_near_future(col_name: str, offset: int = 0, curr_timestamp: Column | None = None) -> Column: + """Checks whether the values in the input column contain a timestamp that is not in the near future, + where 'near future' is defined as greater than the current timestamp + but less than the current_timestamp + offset (in seconds). :param col_name: column name :param offset: offset (in seconds) to add to the current timestamp at time of execution @@ -247,87 +250,59 @@ def not_in_near_future(col_name: str, offset: int = 0, curr_timestamp: Column | ) -def not_less_than(col_name: str, limit: int | datetime.date | datetime.datetime) -> Column: - """Creates a condition column that checks if a value is less than specified limit. +def is_not_less_than( + col_name: str, limit: int | datetime.date | datetime.datetime | str | Column | None = None +) -> Column: + """Checks whether the values in the input column are not less than the provided limit. :param col_name: column name - :param limit: limit to use in the condition + :param limit: limit to use in the condition as number, date, timestamp, column name or expression :return: new Column """ - limit_expr = F.lit(limit) + limit_expr = _get_column_expr_limit(limit) condition = F.col(col_name) < limit_expr return make_condition( condition, - F.concat_ws(" ", F.lit("Value"), F.col(col_name), F.lit("is less than limit:"), F.lit(limit).cast("string")), + F.concat_ws(" ", F.lit("Value"), F.col(col_name), F.lit("is less than limit:"), limit_expr.cast("string")), f"{col_name}_less_than_limit", ) -def not_greater_than(col_name: str, limit: int | datetime.date | datetime.datetime) -> Column: - """Creates a condition column that checks if a value is greater than specified limit. +def is_not_greater_than( + col_name: str, limit: int | datetime.date | datetime.datetime | str | Column | None = None +) -> Column: + """Checks whether the values in the input column are not greater than the provided limit. :param col_name: column name - :param limit: limit to use in the condition + :param limit: limit to use in the condition as number, date, timestamp, column name or expression :return: new Column """ - limit_expr = F.lit(limit) + limit_expr = _get_column_expr_limit(limit) condition = F.col(col_name) > limit_expr return make_condition( condition, - F.concat_ws(" ", F.lit("Value"), F.col(col_name), F.lit("is greater than limit:"), F.lit(limit).cast("string")), + F.concat_ws(" ", F.lit("Value"), F.col(col_name), F.lit("is greater than limit:"), limit_expr.cast("string")), f"{col_name}_greater_than_limit", ) -def _get_min_max_column_expr( - min_limit: int | datetime.date | datetime.datetime | str | None = None, - max_limit: int | datetime.date | datetime.datetime | str | None = None, - min_limit_col_expr: str | Column | None = None, - max_limit_col_expr: str | Column | None = None, -) -> tuple[Column, Column]: - """Helper function to create a condition for the is_(not)_in_range functions. 
- - :param min_limit: min limit value - :param max_limit: max limit value - :param min_limit_col_expr: min limit column name or expr - :param max_limit_col_expr: max limit column name or expr - :return: tuple containing min_limit_expr and max_limit_expr - :raises: ValueError when both min_limit/min_limit_col_expr or max_limit/max_limit_col_expr are null - """ - if (min_limit is None and min_limit_col_expr is None) or (max_limit is None and max_limit_col_expr is None): - raise ValueError('Either min_limit / min_limit_col_expr or max_limit / max_limit_col_expr is empty') - if min_limit_col_expr is None: - min_limit_expr = F.lit(min_limit) - else: - min_limit_expr = F.col(min_limit_col_expr) if isinstance(min_limit_col_expr, str) else min_limit_col_expr - if max_limit_col_expr is None: - max_limit_expr = F.lit(max_limit) - else: - max_limit_expr = F.col(max_limit_col_expr) if isinstance(max_limit_col_expr, str) else max_limit_col_expr - return (min_limit_expr, max_limit_expr) - - def is_in_range( col_name: str, - min_limit: int | datetime.date | datetime.datetime | str | None = None, - max_limit: int | datetime.date | datetime.datetime | str | None = None, - min_limit_col_expr: str | Column | None = None, - max_limit_col_expr: str | Column | None = None, + min_limit: int | datetime.date | datetime.datetime | str | Column | None = None, + max_limit: int | datetime.date | datetime.datetime | str | Column | None = None, ) -> Column: - """Creates a condition column that checks if a value is smaller than min limit or greater than max limit. + """Checks whether the values in the input column are in the provided limits (inclusive of both boundaries). :param col_name: column name - :param min_limit: min limit value - :param max_limit: max limit value - :param min_limit_col_expr: min limit column name or expr - :param max_limit_col_expr: max limit column name or expr + :param min_limit: min limit to use in the condition as number, date, timestamp, column name or expression + :param max_limit: max limit to use in the condition as number, date, timestamp, column name or expression :return: new Column """ - min_limit_expr, max_limit_expr = _get_min_max_column_expr( - min_limit, max_limit, min_limit_col_expr, max_limit_col_expr - ) + min_limit_expr = _get_column_expr_limit(min_limit) + max_limit_expr = _get_column_expr_limit(max_limit) + condition = (F.col(col_name) < min_limit_expr) | (F.col(col_name) > max_limit_expr) return make_condition( @@ -348,24 +323,20 @@ def is_in_range( def is_not_in_range( col_name: str, - min_limit: int | datetime.date | datetime.datetime | str | None = None, - max_limit: int | datetime.date | datetime.datetime | str | None = None, - min_limit_col_expr: str | Column | None = None, - max_limit_col_expr: str | Column | None = None, + min_limit: int | datetime.date | datetime.datetime | str | Column | None = None, + max_limit: int | datetime.date | datetime.datetime | str | Column | None = None, ) -> Column: - """Creates a condition column that checks if a value is within min and max limits. + """Checks whether the values in the input column are outside the provided limits (inclusive of both boundaries). 
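The range checks now accept the same limit types through a single pair of parameters, so the previous `min_limit_col_expr`/`max_limit_col_expr` arguments are no longer needed. A hedged sketch of the new call style, mirroring the updated integration tests (made-up DataFrame and column names):

```python
# Sketch of is_in_range with literal, Column, and string-expression boundaries.
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

from databricks.labs.dqx.col_functions import is_in_range

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(0, -1, 5, 6), (2, 2, 7, 3)], "a: int, d: int, e: int, f: int")

df.select(
    is_in_range("a", 1, 3),                         # literal boundaries
    is_in_range("d", F.col("a"), F.expr("e - 1")),  # Column / expression boundaries
    is_in_range("f", "a", 5),                       # string min limit is parsed with F.expr
).show(truncate=False)
```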
:param col_name: column name - :param min_limit: min limit value - :param max_limit: max limit value - :param min_limit_col_expr: min limit column name or expr - :param max_limit_col_expr: max limit column name or expr + :param min_limit: min limit to use in the condition as number, date, timestamp, column name or expression + :param max_limit: max limit to use in the condition as number, date, timestamp, column name or expression :return: new Column """ - min_limit_expr, max_limit_expr = _get_min_max_column_expr( - min_limit, max_limit, min_limit_col_expr, max_limit_col_expr - ) - condition = (F.col(col_name) > min_limit_expr) & (F.col(col_name) < max_limit_expr) + min_limit_expr = _get_column_expr_limit(min_limit) + max_limit_expr = _get_column_expr_limit(max_limit) + + condition = (F.col(col_name) >= min_limit_expr) & (F.col(col_name) <= max_limit_expr) return make_condition( condition, @@ -384,7 +355,7 @@ def is_not_in_range( def regex_match(col_name: str, regex: str, negate: bool = False) -> Column: - """Creates a condition column to check if value not matches given regex. + """Checks whether the values in the input column match a given regex. :param col_name: column name to check :param regex: regex to check @@ -402,8 +373,8 @@ def regex_match(col_name: str, regex: str, negate: bool = False) -> Column: def is_not_null_and_not_empty_array(col_name: str) -> Column: - """ - Creates a condition column to check if an array is null and or empty. + """Checks whether the values in the array input column are not null and not empty. + :param col_name: column name to check :return: Column object for condition """ @@ -413,8 +384,8 @@ def is_not_null_and_not_empty_array(col_name: str) -> Column: def is_valid_date(col_name: str, date_format: str | None = None) -> Column: - """ - Creates a condition column to check if a string is a valid date. + """Checks whether the values in the input column have valid date formats. + :param col_name: column name to check :param date_format: date format (e.g. 'yyyy-mm-dd') :return: Column object for condition @@ -433,8 +404,8 @@ def is_valid_date(col_name: str, date_format: str | None = None) -> Column: def is_valid_timestamp(col_name: str, timestamp_format: str | None = None) -> Column: - """ - Creates a condition column to check if a string is a valid timestamp. + """Checks whether the values in the input column have valid timestamp formats. + :param col_name: column name to check :param timestamp_format: timestamp format (e.g. 'yyyy-mm-dd HH:mm:ss') :return: Column object for condition @@ -452,3 +423,52 @@ def is_valid_timestamp(col_name: str, timestamp_format: str | None = None) -> Co F.concat_ws("", F.lit("Value '"), column, F.lit(condition_str)), f"{col_name}_is_not_valid_timestamp", ) + + +def is_unique(col_name: str, window_spec: str | Column | None = None) -> Column: + """Checks whether the values in the input column are unique + and reports an issue for each row that contains a duplicate value. + Null values are not considered duplicates, following the ANSI SQL standard. + It should be used carefully in a streaming context, + as the uniqueness check will only be performed on individual micro-batches. + + :param col_name: column name to check + :param window_spec: window specification for the partition by clause. Default value for NULL in the time column + of the window spec must be provided using coalesce() to prevent rows from being excluded! + e.g.
"window(coalesce(b, '1970-01-01'), '2 hours')" + :return: Column object for condition + """ + column = F.col(col_name) + if window_spec is None: + partition_by_spec = Window.partitionBy(column) + else: + if isinstance(window_spec, str): + window_spec = F.expr(window_spec) + partition_by_spec = Window.partitionBy(window_spec) + + condition = F.when(column.isNotNull(), F.count(column).over(partition_by_spec) == 1) + return make_condition(~condition, f"Column {col_name} has duplicate values", f"{col_name}_is_not_unique") + + +def _cleanup_alias_name(col_name: str) -> str: + # avoid issues with structs + return col_name.replace(".", "_") + + +def _get_column_expr_limit( + limit: int | datetime.date | datetime.datetime | str | Column | None = None, +) -> Column: + """Helper function to generate a column expression limit based on the provided limit value. + + :param limit: limit to use in the condition (literal value or Column expression) + :return: column expression. + :raises ValueError: if limit is not provided. + """ + if limit is None: + raise ValueError("Limit is not provided.") + + if isinstance(limit, str): + return F.expr(limit) + if isinstance(limit, Column): + return limit + return F.lit(limit) diff --git a/src/databricks/labs/dqx/profiler/generator.py b/src/databricks/labs/dqx/profiler/generator.py index 2c51b518..2d1e6489 100644 --- a/src/databricks/labs/dqx/profiler/generator.py +++ b/src/databricks/labs/dqx/profiler/generator.py @@ -48,7 +48,7 @@ def dq_generate_is_in(col_name: str, level: str = "error", **params: dict): :return: A dictionary representing the data quality rule. """ return { - "check": {"function": "value_is_in_list", "arguments": {"col_name": col_name, "allowed": params["in"]}}, + "check": {"function": "is_in_list", "arguments": {"col_name": col_name, "allowed": params["in"]}}, "name": f"{col_name}_other_value", "criticality": level, } @@ -86,7 +86,7 @@ def dq_generate_min_max(col_name: str, level: str = "error", **params: dict): if max_limit is not None: return { "check": { - "function": "not_greater_than", + "function": "is_not_greater_than", "arguments": { "col_name": col_name, "val": val_maybe_to_str(max_limit, include_sql_quotes=False), @@ -99,7 +99,7 @@ def dq_generate_min_max(col_name: str, level: str = "error", **params: dict): if min_limit is not None: return { "check": { - "function": "not_less_than", + "function": "is_not_less_than", "arguments": { "col_name": col_name, "val": val_maybe_to_str(min_limit, include_sql_quotes=False), diff --git a/tests/conftest.py b/tests/conftest.py index f8dcda26..1c79fa9a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -22,7 +22,7 @@ def checks_yml_content(): trim_strings: true - criticality: warn check: - function: value_is_in_list + function: is_in_list arguments: col_name: col4 allowed: @@ -67,7 +67,7 @@ def checks_json_content(): { "criticality": "warn", "check": { - "function": "value_is_in_list", + "function": "is_in_list", "arguments": { "col_name": "col4", "allowed": [1, 2] @@ -130,7 +130,7 @@ def expected_checks(): }, { "criticality": "warn", - "check": {"function": "value_is_in_list", "arguments": {"col_name": "col4", "allowed": [1, 2]}}, + "check": {"function": "is_in_list", "arguments": {"col_name": "col4", "allowed": [1, 2]}}, }, { "criticality": "error", diff --git a/tests/integration/test_apply_checks.py b/tests/integration/test_apply_checks.py index 9b3a65d3..48c194e4 100644 --- a/tests/integration/test_apply_checks.py +++ b/tests/integration/test_apply_checks.py @@ -1,3 +1,6 @@ +from datetime 
import datetime + +import yaml import pyspark.sql.functions as F import pytest from pyspark.sql import Column @@ -213,14 +216,14 @@ def test_apply_checks_and_split_by_metadata(ws, spark): "check": {"function": "is_not_null_and_not_empty", "arguments": {"col_name": "c"}}, }, { - "name": "col_a_value_is_not_in_the_list", + "name": "col_a_is_not_in_the_list", "criticality": "warn", - "check": {"function": "value_is_in_list", "arguments": {"col_name": "a", "allowed": [1, 3, 4]}}, + "check": {"function": "is_in_list", "arguments": {"col_name": "a", "allowed": [1, 3, 4]}}, }, { - "name": "col_c_value_is_not_in_the_list", + "name": "col_c_is_not_in_the_list", "criticality": "warn", - "check": {"function": "value_is_in_list", "arguments": {"col_name": "c", "allowed": [1, 3, 4]}}, + "check": {"function": "is_in_list", "arguments": {"col_name": "c", "allowed": [1, 3, 4]}}, }, ] @@ -236,7 +239,7 @@ def test_apply_checks_and_split_by_metadata(ws, spark): None, 4, {"col_b_is_null_or_empty": "Column b is null or empty"}, - {"col_a_value_is_not_in_the_list": "Value 2 is not in the allowed list: [1, 3, 4]"}, + {"col_a_is_not_in_the_list": "Value 2 is not in the allowed list: [1, 3, 4]"}, ], [ None, @@ -280,7 +283,7 @@ def test_apply_checks_and_split_by_metadata_with_autogenerated_col_names(ws, spa }, { "criticality": "warn", - "check": {"function": "value_is_in_list", "arguments": {"col_names": ["a", "c"], "allowed": [1, 3, 4]}}, + "check": {"function": "is_in_list", "arguments": {"col_names": ["a", "c"], "allowed": [1, 3, 4]}}, }, ] @@ -296,7 +299,7 @@ def test_apply_checks_and_split_by_metadata_with_autogenerated_col_names(ws, spa None, 4, {"col_b_is_null_or_empty": "Column b is null or empty"}, - {"col_a_value_is_not_in_the_list": "Value 2 is not in the allowed list: [1, 3, 4]"}, + {"col_a_is_not_in_the_list": "Value 2 is not in the allowed list: [1, 3, 4]"}, ], [ None, @@ -340,7 +343,7 @@ def test_apply_checks_by_metadata(ws, spark): }, { "criticality": "warn", - "check": {"function": "value_is_in_list", "arguments": {"col_names": ["a", "c"], "allowed": [1, 3, 4]}}, + "check": {"function": "is_in_list", "arguments": {"col_names": ["a", "c"], "allowed": [1, 3, 4]}}, }, ] @@ -354,7 +357,7 @@ def test_apply_checks_by_metadata(ws, spark): None, 4, {"col_b_is_null_or_empty": "Column b is null or empty"}, - {"col_a_value_is_not_in_the_list": "Value 2 is not in the allowed list: [1, 3, 4]"}, + {"col_a_is_not_in_the_list": "Value 2 is not in the allowed list: [1, 3, 4]"}, ], [ None, @@ -751,3 +754,343 @@ def test_apply_checks_with_sql_expression(ws, spark): expected_schema, ) assert_df_equality(checked, expected, ignore_nullable=True) + + +def test_apply_checks_with_is_unique(ws, spark, set_utc_timezone): + schema = "col1: int, col2: timestamp" + test_df = spark.createDataFrame([[1, datetime(2025, 1, 1)], [1, datetime(2025, 1, 2)], [None, None]], schema) + + checks = [ + { + "criticality": "error", + "check": {"function": "is_unique", "arguments": {"col_name": "col1"}}, + }, + { + "criticality": "error", + "name": "col_col1_is_not_unique2", + "check": { + "function": "is_unique", + "arguments": {"col_name": "col1", "window_spec": "window(coalesce(col2, '1970-01-01'), '30 days')"}, + }, + }, + ] + + dq_engine = DQEngine(ws) + checked = dq_engine.apply_checks_by_metadata(test_df, checks) + + expected_schema = schema + REPORTING_COLUMNS + expected = spark.createDataFrame( + [ + [None, None, None, None], + [ + 1, + datetime(2025, 1, 1), + { + "col_col1_is_not_unique": "Column col1 has duplicate values", + 
"col_col1_is_not_unique2": "Column col1 has duplicate values", + }, + None, + ], + [ + 1, + datetime(2025, 1, 2), + { + "col_col1_is_not_unique": "Column col1 has duplicate values", + "col_col1_is_not_unique2": "Column col1 has duplicate values", + }, + None, + ], + ], + expected_schema, + ) + assert_df_equality(checked, expected, ignore_nullable=True) + + +def test_apply_checks_all_checks_as_yaml(ws, spark): + checks = yaml.safe_load( + """ + # is_not_null check + - criticality: error + check: + function: is_not_null + arguments: + col_name: col1 + + # is_not_empty check + - criticality: error + check: + function: is_not_empty + arguments: + col_name: col1 + + # is_not_null_and_not_empty check + - criticality: error + check: + function: is_not_null_and_not_empty + arguments: + col_name: col1 + trim_strings: true + + # is_in_list check + - criticality: error + check: + function: is_in_list + arguments: + col_name: col2 + allowed: + - 1 + - 2 + - 3 + + # is_not_null_and_is_in_list check + - criticality: error + check: + function: is_not_null_and_is_in_list + arguments: + col_name: col2 + allowed: + - 1 + - 2 + - 3 + + # is_not_null_and_not_empty_array check + - criticality: error + check: + function: is_not_null_and_not_empty_array + arguments: + col_name: col4 + + # is_in_range check + - criticality: error + check: + function: is_in_range + arguments: + col_name: col2 + min_limit: 1 + max_limit: 10 + - criticality: error + check: + function: is_in_range + arguments: + col_name: col5 + min_limit: 2025-01-01 + max_limit: 2025-02-24 + - criticality: error + check: + function: is_in_range + arguments: + col_name: col6 + min_limit: 2025-01-01 00:00:00 + max_limit: 2025-02-24 01:00:00 + - criticality: error + check: + function: is_in_range + arguments: + col_name: col3 + min_limit: col2 + max_limit: col2 * 2 + + # is_not_in_range check + - criticality: error + check: + function: is_not_in_range + arguments: + col_name: col2 + min_limit: 11 + max_limit: 20 + - criticality: error + check: + function: is_not_in_range + arguments: + col_name: col5 + min_limit: 2025-02-25 + max_limit: 2025-02-26 + - criticality: error + check: + function: is_not_in_range + arguments: + col_name: col6 + min_limit: 2025-02-25 00:00:00 + max_limit: 2025-02-26 01:00:00 + - criticality: error + check: + function: is_not_in_range + arguments: + col_name: col3 + min_limit: col2 + 10 + max_limit: col2 * 10 + + # is_not_less_than check + - criticality: error + check: + function: is_not_less_than + arguments: + col_name: col2 + limit: 0 + - criticality: error + check: + function: is_not_less_than + arguments: + col_name: col5 + limit: 2025-01-01 + - criticality: error + check: + function: is_not_less_than + arguments: + col_name: col6 + limit: 2025-01-01 01:00:00 + - criticality: error + check: + function: is_not_less_than + arguments: + col_name: col3 + limit: col2 - 10 + + # is_not_greater_than check + - criticality: error + check: + function: is_not_greater_than + arguments: + col_name: col2 + limit: 10 + - criticality: error + check: + function: is_not_greater_than + arguments: + col_name: col5 + limit: 2025-03-01 + - criticality: error + check: + function: is_not_greater_than + arguments: + col_name: col6 + limit: 2025-03-24 01:00:00 + - criticality: error + check: + function: is_not_greater_than + arguments: + col_name: col3 + limit: col2 + 10 + + # is_valid_date check + - criticality: error + check: + function: is_valid_date + arguments: + col_name: col5 + - criticality: error + name: col5_is_not_valid_date2 + check: + 
function: is_valid_date + arguments: + col_name: col5 + date_format: yyyy-MM-dd + + # is_valid_timestamp check + - criticality: error + check: + function: is_valid_timestamp + arguments: + col_name: col6 + timestamp_format: yyyy-MM-dd HH:mm:ss + - criticality: error + name: col6_is_not_valid_timestamp2 + check: + function: is_valid_timestamp + arguments: + col_name: col6 + + # is_not_in_future check + - criticality: error + check: + function: is_not_in_future + arguments: + col_name: col6 + offset: 86400 + + # is_not_in_near_future check + - criticality: error + check: + function: is_not_in_near_future + arguments: + col_name: col6 + offset: 36400 + + # is_older_than_n_days check + - criticality: error + check: + function: is_older_than_n_days + arguments: + col_name: col5 + days: 10000 + + # is_older_than_col2_for_n_days check + - criticality: error + check: + function: is_older_than_col2_for_n_days + arguments: + col_name1: col5 + col_name2: col6 + days: 2 + + # is_unique check + - criticality: error + check: + function: is_unique + arguments: + col_name: col1 + - criticality: error + name: col1_is_not_unique2 + check: + function: is_unique + arguments: + col_name: col1 + window_spec: window(coalesce(col6, '1970-01-01'), '10 minutes') + + # regex_match check + - criticality: error + check: + function: regex_match + arguments: + col_name: col2 + regex: '[0-9]+' + negate: false + + # sql_expression check + - criticality: error + check: + function: sql_expression + arguments: + expression: col3 > col2 and col3 < 10 + msg: col3 is greater than col2 and col3 less than 10 + name: custom_output_name + negate: false + """ + ) + + dq_engine = DQEngine(ws) + status = dq_engine.validate_checks(checks) + assert not status.has_errors + + schema = "col1: string, col2: int, col3: int, col4 array, col5: date, col6: timestamp" + test_df = spark.createDataFrame( + [ + ["val1", 1, 1, [1], datetime(2025, 1, 2).date(), datetime(2025, 1, 2, 1, 0, 0)], + ["val2", 2, 2, [2], datetime(2025, 1, 2).date(), datetime(2025, 1, 2, 2, 0, 0)], + ["val3", 3, 3, [3], datetime(2025, 1, 2).date(), datetime(2025, 1, 2, 3, 0, 0)], + ], + schema, + ) + + checked = dq_engine.apply_checks_by_metadata(test_df, checks) + + expected_schema = schema + REPORTING_COLUMNS + expected = spark.createDataFrame( + [ + ["val1", 1, 1, [1], datetime(2025, 1, 2).date(), datetime(2025, 1, 2, 1, 0, 0), None, None], + ["val2", 2, 2, [2], datetime(2025, 1, 2).date(), datetime(2025, 1, 2, 2, 0, 0), None, None], + ["val3", 3, 3, [3], datetime(2025, 1, 2).date(), datetime(2025, 1, 2, 3, 0, 0), None, None], + ], + expected_schema, + ) + assert_df_equality(checked, expected, ignore_nullable=True) diff --git a/tests/integration/test_col_functions.py b/tests/integration/test_col_functions.py index f1e3e568..d5959f62 100644 --- a/tests/integration/test_col_functions.py +++ b/tests/integration/test_col_functions.py @@ -1,6 +1,9 @@ from datetime import datetime +from decimal import Decimal import pyspark.sql.functions as F from chispa.dataframe_comparer import assert_df_equality # type: ignore + + from databricks.labs.dqx.col_functions import ( is_in_range, is_not_empty, @@ -9,17 +12,18 @@ is_not_null_and_not_empty, is_older_than_col2_for_n_days, is_older_than_n_days, - not_in_future, - not_in_near_future, - not_less_than, - not_greater_than, + is_not_in_future, + is_not_in_near_future, + is_not_less_than, + is_not_greater_than, regex_match, sql_expression, - value_is_in_list, - value_is_not_null_and_is_in_list, + is_in_list, + is_not_null_and_is_in_list, 
is_not_null_and_not_empty_array, is_valid_date, is_valid_timestamp, + is_unique, ) SCHEMA = "a: string, b: int" @@ -60,19 +64,20 @@ def test_col_is_not_null(spark): assert_df_equality(actual, expected, ignore_nullable=True) -def test_col_value_is_not_null_and_is_in_list(spark): +def test_col_is_not_null_and_is_in_list(spark): test_df = spark.createDataFrame([["str1", 1], ["str2", None], ["", 3]], SCHEMA) - actual = test_df.select( - value_is_not_null_and_is_in_list("a", ["str1"]), value_is_not_null_and_is_in_list("b", [F.lit(3)]) - ) + actual = test_df.select(is_not_null_and_is_in_list("a", ["str1"]), is_not_null_and_is_in_list("b", [F.lit(3)])) - checked_schema = "a_value_is_not_in_the_list: string, b_value_is_not_in_the_list: string" + checked_schema = "a_is_null_or_is_not_in_the_list: string, b_is_null_or_is_not_in_the_list: string" expected = spark.createDataFrame( [ - [None, "Value 1 is not in the allowed list: [3]"], - ["Value str2 is not in the allowed list: [str1]", "Value null is not in the allowed list: [3]"], - ["Value is not in the allowed list: [str1]", None], + [None, "Value 1 is null or not in the allowed list: [3]"], + [ + "Value str2 is null or not in the allowed list: [str1]", + "Value null is null or not in the allowed list: [3]", + ], + ["Value is null or not in the allowed list: [str1]", None], ], checked_schema, ) @@ -80,12 +85,12 @@ def test_col_value_is_not_null_and_is_in_list(spark): assert_df_equality(actual, expected, ignore_nullable=True) -def test_col_value_is_not_in_list(spark): +def test_col_is_not_in_list(spark): test_df = spark.createDataFrame([["str1", 1], ["str2", None], ["", 3]], SCHEMA) - actual = test_df.select(value_is_in_list("a", ["str1"]), value_is_in_list("b", [F.lit(3)])) + actual = test_df.select(is_in_list("a", ["str1"]), is_in_list("b", [F.lit(3)])) - checked_schema = "a_value_is_not_in_the_list: string, b_value_is_not_in_the_list: string" + checked_schema = "a_is_not_in_the_list: string, b_is_not_in_the_list: string" expected = spark.createDataFrame( [ [None, "Value 1 is not in the allowed list: [3]"], @@ -136,7 +141,7 @@ def test_is_col_older_than_col2_for_n_days(spark): actual = test_df.select(is_older_than_col2_for_n_days("a", "b", 2)) - checked_schema = "is_col_a_older_than_b_for_N_days: string" + checked_schema = "is_col_a_older_than_b_for_n_days: string" expected = spark.createDataFrame( [ ["Value of a: '2023-01-10' less than value of b: '2023-01-13' for more than 2 days"], @@ -157,7 +162,7 @@ def test_is_col_older_than_n_days(spark): actual = test_df.select(is_older_than_n_days("a", 2, F.lit("2023-01-13"))) - checked_schema = "is_col_a_older_than_N_days: string" + checked_schema = "is_col_a_older_than_n_days: string" expected = spark.createDataFrame( [["Value of a: '2023-01-10' less than current date: '2023-01-13' for more than 2 days"], [None], [None]], checked_schema, @@ -166,11 +171,11 @@ def test_is_col_older_than_n_days(spark): assert_df_equality(actual, expected, ignore_nullable=True) -def test_col_not_in_future(spark): +def test_col_is_not_in_future(spark): schema_dates = "a: string" test_df = spark.createDataFrame([["2023-01-10 11:08:37"], ["2023-01-10 11:08:43"], [None]], schema_dates) - actual = test_df.select(not_in_future("a", 2, F.lit("2023-01-10 11:08:40"))) + actual = test_df.select(is_not_in_future("a", 2, F.lit("2023-01-10 11:08:40"))) checked_schema = "a_in_future: string" expected = spark.createDataFrame( @@ -180,13 +185,13 @@ def test_col_not_in_future(spark): assert_df_equality(actual, expected, 
ignore_nullable=True) -def test_col_not_in_near_future(spark): +def test_col_is_not_in_near_future(spark): schema_dates = "a: string" test_df = spark.createDataFrame( [["2023-01-10 11:08:40"], ["2023-01-10 11:08:41"], ["2023-01-10 11:08:42"], [None]], schema_dates ) - actual = test_df.select(not_in_near_future("a", 2, F.lit("2023-01-10 11:08:40"))) + actual = test_df.select(is_not_in_near_future("a", 2, F.lit("2023-01-10 11:08:40"))) checked_schema = "a_in_near_future: string" expected = spark.createDataFrame( @@ -210,7 +215,7 @@ def test_is_col_older_than_n_days_cur(spark): actual = test_df.select(is_older_than_n_days("a", 2, None)) - checked_schema = "is_col_a_older_than_N_days: string" + checked_schema = "is_col_a_older_than_n_days: string" expected = spark.createDataFrame( [[f"Value of a: '2023-01-10' less than current date: '{cur_date}' for more than 2 days"], [None]], @@ -220,33 +225,59 @@ def test_is_col_older_than_n_days_cur(spark): assert_df_equality(actual, expected, ignore_nullable=True) -def test_col_not_less_than(spark, set_utc_timezone): - schema_num = "a: int, b: date, c: timestamp" +def test_col_is_not_less_than(spark, set_utc_timezone): + schema_num = "a: int, b: int, c: date, d: timestamp, e: decimal(10,2)" test_df = spark.createDataFrame( [ - [1, datetime(2025, 1, 1).date(), datetime(2025, 1, 1)], - [2, datetime(2025, 2, 1).date(), datetime(2025, 2, 1)], - [None, None, None], + [1, 1, datetime(2025, 1, 1).date(), datetime(2025, 1, 1), Decimal("1.00")], + [2, 4, datetime(2025, 2, 1).date(), datetime(2025, 2, 1), Decimal("1.99")], + [4, 3, None, None, Decimal("2.01")], + [None, None, None, None, None], ], schema_num, ) actual = test_df.select( - not_less_than("a", 2), - not_less_than("b", datetime(2025, 2, 1).date()), - not_less_than("c", datetime(2025, 2, 1)), + is_not_less_than("a", 2), + is_not_less_than("a", F.col("b") * 2), + is_not_less_than("b", "a"), + is_not_less_than("c", datetime(2025, 2, 1).date()), + is_not_less_than("d", datetime(2025, 2, 1)), + is_not_less_than("e", 2), + ) + + checked_schema = ( + "a_less_than_limit: string, a_less_than_limit: string, b_less_than_limit: string, " + "c_less_than_limit: string, d_less_than_limit: string, e_less_than_limit: string" ) - checked_schema = "a_less_than_limit: string, b_less_than_limit: string, c_less_than_limit: string" expected = spark.createDataFrame( [ [ "Value 1 is less than limit: 2", + None, + None, "Value 2025-01-01 is less than limit: 2025-02-01", "Value 2025-01-01 00:00:00 is less than limit: 2025-02-01 00:00:00", + "Value 1.00 is less than limit: 2", + ], + [ + None, + "Value 2 is less than limit: 8", + None, + None, + None, + "Value 1.99 is less than limit: 2", + ], + [ + None, + "Value 4 is less than limit: 6", + "Value 3 is less than limit: 4", + None, + None, + None, ], - [None, None, None], - [None, None, None], + [None, None, None, None, None, None], ], checked_schema, ) @@ -254,33 +285,44 @@ def test_col_not_less_than(spark, set_utc_timezone): assert_df_equality(actual, expected, ignore_nullable=True) -def test_col_not_greater_than(spark, set_utc_timezone): - schema_num = "a: int, b: date, c: timestamp" +def test_col_is_not_greater_than(spark, set_utc_timezone): + schema_num = "a: int, b: int, c: date, d: timestamp, e: decimal(10,2)" test_df = spark.createDataFrame( [ - [1, datetime(2025, 1, 1).date(), datetime(2025, 1, 1)], - [2, datetime(2025, 2, 1).date(), datetime(2025, 2, 1)], - [None, None, None], + [1, 1, datetime(2025, 1, 1).date(), datetime(2025, 1, 1), Decimal("1.00")], + [2, 4, 
datetime(2025, 2, 1).date(), datetime(2025, 2, 1), Decimal("1.01")], + [8, 3, None, None, Decimal("0.99")], + [None, None, None, None, None], ], schema_num, ) actual = test_df.select( - not_greater_than("a", 1), - not_greater_than("b", datetime(2025, 1, 1).date()), - not_greater_than("c", datetime(2025, 1, 1)), + is_not_greater_than("a", 1), + is_not_greater_than("a", F.col("b") * 2), + is_not_greater_than("b", "a"), + is_not_greater_than("c", datetime(2025, 1, 1).date()), + is_not_greater_than("d", datetime(2025, 1, 1)), + is_not_greater_than("e", 1), ) - checked_schema = "a_greater_than_limit: string, b_greater_than_limit: string, c_greater_than_limit: string" + checked_schema = ( + "a_greater_than_limit: string, a_greater_than_limit: string, b_greater_than_limit: string, " + "c_greater_than_limit: string, d_greater_than_limit: string, e_greater_than_limit: string" + ) expected = spark.createDataFrame( [ - [None, None, None], + [None, None, None, None, None, None], [ "Value 2 is greater than limit: 1", + None, + "Value 4 is greater than limit: 2", "Value 2025-02-01 is greater than limit: 2025-01-01", "Value 2025-02-01 00:00:00 is greater than limit: 2025-01-01 00:00:00", + "Value 1.01 is greater than limit: 1", ], - [None, None, None], + ["Value 8 is greater than limit: 1", "Value 8 is greater than limit: 6", None, None, None, None], + [None, None, None, None, None, None], ], checked_schema, ) @@ -289,15 +331,15 @@ def test_col_not_greater_than(spark, set_utc_timezone): def test_col_is_in_range(spark, set_utc_timezone): - schema_num = "a: int, b: date, c: timestamp, d: int, e: int, f: int" + schema_num = "a: int, b: date, c: timestamp, d: int, e: int, f: int, g: decimal(10,2)" test_df = spark.createDataFrame( [ - [0, datetime(2024, 12, 1).date(), datetime(2024, 12, 1), -1, 5, 6], - [1, datetime(2025, 1, 1).date(), datetime(2025, 1, 1), 2, 6, 3], - [2, datetime(2025, 2, 1).date(), datetime(2025, 2, 1), 2, 7, 3], - [3, datetime(2025, 3, 1).date(), datetime(2025, 3, 1), 3, 8, 3], - [4, datetime(2025, 4, 1).date(), datetime(2025, 4, 1), 2, 9, 3], - [None, None, None, None, None, None], + [0, datetime(2024, 12, 1).date(), datetime(2024, 12, 1), -1, 5, 6, Decimal("2.00")], + [1, datetime(2025, 1, 1).date(), datetime(2025, 1, 1), 2, 6, 3, Decimal("1.00")], + [2, datetime(2025, 2, 1).date(), datetime(2025, 2, 1), 2, 7, 3, Decimal("3.00")], + [3, datetime(2025, 3, 1).date(), datetime(2025, 3, 1), 3, 8, 3, Decimal("1.01")], + [4, datetime(2025, 4, 1).date(), datetime(2025, 4, 1), 2, 9, 3, Decimal("3.01")], + [None, None, None, None, None, None, None], ], schema_num, ) @@ -308,10 +350,15 @@ def test_col_is_in_range(spark, set_utc_timezone): is_in_range("a", 1, 3), is_in_range("b", start_date.date(), end_date.date()), is_in_range("c", start_date, end_date), - is_in_range("d", min_limit_col_expr=F.col("a"), max_limit_col_expr=F.expr("e - 1")), - is_in_range("f", min_limit_col_expr="a", max_limit=5), + is_in_range("d", F.col("a"), F.expr("e - 1")), + is_in_range("f", "a", 5), + is_in_range("g", 1, 3), + ) + + checked_schema = ( + "a_not_in_range: string, b_not_in_range: string, c_not_in_range: string, " + "d_not_in_range: string, f_not_in_range: string, g_not_in_range: string" ) - checked_schema = "a_not_in_range: string, b_not_in_range: string, c_not_in_range: string, d_not_in_range: string, f_not_in_range: string" expected = spark.createDataFrame( [ [ @@ -320,18 +367,20 @@ def test_col_is_in_range(spark, set_utc_timezone): "Value 2024-12-01 00:00:00 not in range: [ 2025-01-01 00:00:00 , 2025-03-01 
00:00:00 ]", "Value -1 not in range: [ 0 , 4 ]", "Value 6 not in range: [ 0 , 5 ]", + None, ], - [None, None, None, None, None], - [None, None, None, None, None], - [None, None, None, None, None], + [None, None, None, None, None, None], + [None, None, None, None, None, None], + [None, None, None, None, None, None], [ "Value 4 not in range: [ 1 , 3 ]", "Value 2025-04-01 not in range: [ 2025-01-01 , 2025-03-01 ]", "Value 2025-04-01 00:00:00 not in range: [ 2025-01-01 00:00:00 , 2025-03-01 00:00:00 ]", "Value 2 not in range: [ 4 , 8 ]", "Value 3 not in range: [ 4 , 5 ]", + "Value 3.01 not in range: [ 1 , 3 ]", ], - [None, None, None, None, None], + [None, None, None, None, None, None], ], checked_schema, ) @@ -340,40 +389,48 @@ def test_col_is_in_range(spark, set_utc_timezone): def test_col_is_not_in_range(spark, set_utc_timezone): - schema_num = "a: int, b: date, c: timestamp, d: timestamp" + schema_num = "a: int, b: date, c: timestamp, d: timestamp, e: decimal(10,2)" test_df = spark.createDataFrame( [ - [1, datetime(2025, 1, 1).date(), datetime(2024, 1, 1), datetime(2024, 1, 1)], - [2, datetime(2025, 2, 1).date(), datetime(2025, 2, 1), datetime(2025, 2, 2)], - [3, datetime(2025, 3, 1).date(), datetime(2025, 3, 1), datetime(2025, 3, 1)], - [None, None, None, None], + [0, datetime(2024, 12, 31).date(), datetime(2025, 1, 4), datetime(2025, 1, 7), Decimal("0.99")], + [1, datetime(2025, 1, 1).date(), datetime(2025, 1, 3), datetime(2025, 1, 1), Decimal("1.00")], + [3, datetime(2025, 2, 1).date(), datetime(2025, 2, 1), datetime(2025, 2, 3), Decimal("3.00")], + [None, None, None, None, None], ], schema_num, ) start_date = datetime(2025, 1, 1) - end_date = datetime(2025, 3, 1) + end_date = datetime(2025, 1, 3) actual = test_df.select( is_not_in_range("a", 1, 3), is_not_in_range("b", start_date.date(), end_date.date()), is_not_in_range("c", start_date, end_date), - is_not_in_range( - "d", min_limit_col_expr="c", max_limit_col_expr=F.expr("cast(b as timestamp) + INTERVAL 2 DAY") - ), + is_not_in_range("d", "c", F.expr("cast(b as timestamp) + INTERVAL 2 DAY")), + is_not_in_range("e", 1, 3), ) - checked_schema = "a_in_range: string, b_in_range: string, c_in_range: string, d_in_range: string" + checked_schema = ( + "a_in_range: string, b_in_range: string, c_in_range: string, d_in_range: string, e_in_range: string" + ) expected = spark.createDataFrame( [ - [None, None, None, None], + [None, None, None, None, None], [ - "Value 2 in range: [ 1 , 3 ]", - "Value 2025-02-01 in range: [ 2025-01-01 , 2025-03-01 ]", - "Value 2025-02-01 00:00:00 in range: [ 2025-01-01 00:00:00 , 2025-03-01 00:00:00 ]", - "Value 2025-02-02 00:00:00 in range: [ 2025-02-01 00:00:00 , 2025-02-03 00:00:00 ]", + "Value 1 in range: [ 1 , 3 ]", + "Value 2025-01-01 in range: [ 2025-01-01 , 2025-01-03 ]", + "Value 2025-01-03 00:00:00 in range: [ 2025-01-01 00:00:00 , 2025-01-03 00:00:00 ]", + None, + "Value 1.00 in range: [ 1 , 3 ]", ], - [None, None, None, None], - [None, None, None, None], + [ + "Value 3 in range: [ 1 , 3 ]", + None, + None, + "Value 2025-02-03 00:00:00 in range: [ 2025-02-01 00:00:00 , 2025-02-03 00:00:00 ]", + "Value 3.00 in range: [ 1 , 3 ]", + ], + [None, None, None, None, None], ], checked_schema, ) @@ -409,12 +466,12 @@ def test_col_struct(spark): assert_df_equality(actual, expected, ignore_nullable=True) -def test_col_not_in_future_cur(spark): +def test_col_is_not_in_future_cur(spark): schema_dates = "a: string" test_df = spark.createDataFrame([["9999-12-31 23:59:59"]], schema_dates) - actual = 
test_df.select(not_in_future("a", 0, None)) + actual = test_df.select(is_not_in_future("a", 0, None)) checked_schema = "a_in_future: string" @@ -423,12 +480,12 @@ def test_col_not_in_future_cur(spark): assert actual.select("a_in_future") != expected.select("a_in_future") -def test_col_not_in_near_future_cur(spark): +def test_col_is_not_in_near_future_cur(spark): schema_dates = "a: string" test_df = spark.createDataFrame([["1900-01-01 23:59:59"], ["9999-12-31 23:59:59"], [None]], schema_dates) - actual = test_df.select(not_in_near_future("a", 2, None)) + actual = test_df.select(is_not_in_near_future("a", 2, None)) checked_schema = "a_in_near_future: string" expected = spark.createDataFrame( @@ -586,3 +643,147 @@ def test_col_is_valid_timestamp(spark, set_utc_timezone): expected = spark.createDataFrame(checked_data, checked_schema) assert_df_equality(actual, expected, ignore_nullable=True) + + +def test_col_is_unique(spark): + test_df = spark.createDataFrame([["str1", 1], ["str2", 1], ["str2", 2], ["str3", 3]], SCHEMA) + + actual = test_df.select(is_unique("a"), is_unique("b")) + + checked_schema = "a_is_not_unique: string, b_is_not_unique: string" + expected = spark.createDataFrame( + [ + [None, "Column b has duplicate values"], + ["Column a has duplicate values", "Column b has duplicate values"], + ["Column a has duplicate values", None], + [None, None], + ], + checked_schema, + ) + + assert_df_equality(actual, expected, ignore_nullable=True) + + +def test_col_is_unique_handle_nulls(spark): + test_df = spark.createDataFrame([["", None], ["", None], ["str1", 1], [None, None]], SCHEMA) + + actual = test_df.select(is_unique("a"), is_unique("b")) + + checked_schema = "a_is_not_unique: string, b_is_not_unique: string" + expected = spark.createDataFrame( + [ + ["Column a has duplicate values", None], # Null values are not considered duplicates as they are unknown + ["Column a has duplicate values", None], + [None, None], + [None, None], + ], + checked_schema, + ) + + assert_df_equality(actual, expected, ignore_nullable=True, ignore_row_order=True) + + +def test_col_is_unique_custom_window_spec(spark): + schema_num = "a: int, b: timestamp" + test_df = spark.createDataFrame( + [ + [0, datetime(2025, 1, 1)], + [0, datetime(2025, 1, 2)], + [0, datetime(2025, 1, 3)], # duplicate but not within the first window + [1, None], # considered duplicate with "b" as "1970-01-01" + [1, None], # considered duplicate with "b" as "1970-01-01" + [None, datetime(2025, 1, 6)], + [None, None], + ], + schema_num, + ) + + actual = test_df.select( + # must use coalesce to handle nulls, otherwise records with null for the time column b will be dropped + is_unique("a", window_spec=F.window(F.coalesce(F.col("b"), F.lit(datetime(1970, 1, 1))), "2 days")) + ) + + checked_schema = "a_is_not_unique: string" + expected = spark.createDataFrame( + [ + ["Column a has duplicate values"], + ["Column a has duplicate values"], + ["Column a has duplicate values"], + ["Column a has duplicate values"], + [None], + [None], + [None], + ], + checked_schema, + ) + + assert_df_equality(actual, expected, ignore_nullable=True, ignore_row_order=True) + + +def test_col_is_unique_custom_window_spec_without_handling_nulls(spark): + schema_num = "a: int, b: timestamp" + test_df = spark.createDataFrame( + [ + [0, datetime(2025, 1, 1)], + [0, datetime(2025, 1, 2)], + [0, datetime(2025, 1, 3)], # duplicate but not within the first window + [1, None], # considered duplicate with "b" as "1970-01-01" + [1, None], # considered duplicate with "b" as 
"1970-01-01" + [None, datetime(2025, 1, 6)], + [None, None], + ], + schema_num, + ) + + actual = test_df.select( + # window functions do not handle nulls by default + # incorrect implementation of the window_spec will result in rows being dropped!!! + is_unique("a", window_spec=F.window(F.col("b"), "2 days")) + ) + + checked_schema = "a_is_not_unique: string" + expected = spark.createDataFrame( + [ + ["Column a has duplicate values"], + ["Column a has duplicate values"], + [None], + [None], + ], + checked_schema, + ) + + assert_df_equality(actual, expected, ignore_nullable=True, ignore_row_order=True) + + +def test_col_is_unique_custom_window_as_string(spark): + schema_num = "a: int, b: timestamp" + test_df = spark.createDataFrame( + [ + [0, datetime(2025, 1, 1)], + [0, datetime(2025, 1, 2)], + [0, datetime(2025, 1, 3)], # duplicate but not within the first window + [1, None], # considered duplicate with "b" as "1970-01-01" + [1, None], # considered duplicate with "b" as "1970-01-01" + [None, datetime(2025, 1, 6)], + [None, None], + ], + schema_num, + ) + + actual = test_df.select(is_unique("a", window_spec="window(coalesce(b, '1970-01-01'), '2 days')")) + + checked_schema = "a_is_not_unique: string" + expected = spark.createDataFrame( + [ + ["Column a has duplicate values"], + ["Column a has duplicate values"], + ["Column a has duplicate values"], + ["Column a has duplicate values"], + [None], + [None], + [None], + ], + checked_schema, + ) + + assert_df_equality(actual, expected, ignore_nullable=True, ignore_row_order=True) diff --git a/tests/integration/test_rules_generator.py b/tests/integration/test_rules_generator.py index 8b57b1ec..89bcd9fa 100644 --- a/tests/integration/test_rules_generator.py +++ b/tests/integration/test_rules_generator.py @@ -49,7 +49,7 @@ def test_generate_dq_rules(ws): }, { "check": { - "function": "value_is_in_list", + "function": "is_in_list", "arguments": {"col_name": "vendor_id", "allowed": ["1", "4", "2"]}, }, "name": "vendor_id_other_value", @@ -86,7 +86,7 @@ def test_generate_dq_rules_warn(ws): }, { "check": { - "function": "value_is_in_list", + "function": "is_in_list", "arguments": {"col_name": "vendor_id", "allowed": ["1", "4", "2"]}, }, "name": "vendor_id_other_value", diff --git a/tests/unit/test_build_rules.py b/tests/unit/test_build_rules.py index d7a40a9d..3e473b8b 100644 --- a/tests/unit/test_build_rules.py +++ b/tests/unit/test_build_rules.py @@ -5,7 +5,7 @@ from databricks.labs.dqx.col_functions import ( is_not_null_and_not_empty, sql_expression, - value_is_in_list, + is_in_list, is_not_null_and_not_empty_array, ) from databricks.labs.dqx.engine import ( @@ -31,11 +31,11 @@ def test_get_rules(): DQRuleColSet(columns=["a", "b"], check_func=is_not_null_and_not_empty).get_rules() # with check function params provided as positional arguments + DQRuleColSet( - columns=["c", "d"], criticality="error", check_func=value_is_in_list, check_func_args=[[1, 2]] + columns=["c", "d"], criticality="error", check_func=is_in_list, check_func_args=[[1, 2]] ).get_rules() # with check function params provided as named arguments + DQRuleColSet( - columns=["e"], criticality="warn", check_func=value_is_in_list, check_func_kwargs={"allowed": [3]} + columns=["e"], criticality="warn", check_func=is_in_list, check_func_kwargs={"allowed": [3]} ).get_rules() # should be skipped + DQRuleColSet(columns=[], criticality="error", check_func=is_not_null_and_not_empty).get_rules() @@ -46,9 +46,9 @@ def test_get_rules(): expected_rules = [ DQRule(name="col_a_is_null_or_empty", 
criticality="error", check=is_not_null_and_not_empty("a")), DQRule(name="col_b_is_null_or_empty", criticality="error", check=is_not_null_and_not_empty("b")), - DQRule(name="col_c_value_is_not_in_the_list", criticality="error", check=value_is_in_list("c", allowed=[1, 2])), - DQRule(name="col_d_value_is_not_in_the_list", criticality="error", check=value_is_in_list("d", allowed=[1, 2])), - DQRule(name="col_e_value_is_not_in_the_list", criticality="warn", check=value_is_in_list("e", allowed=[3])), + DQRule(name="col_c_is_not_in_the_list", criticality="error", check=is_in_list("c", allowed=[1, 2])), + DQRule(name="col_d_is_not_in_the_list", criticality="error", check=is_in_list("d", allowed=[1, 2])), + DQRule(name="col_e_is_not_in_the_list", criticality="warn", check=is_in_list("e", allowed=[3])), DQRule(name="col_a_is_null_or_empty_array", criticality="error", check=is_not_null_and_not_empty_array("a")), DQRule(name="col_b_is_null_or_empty_array", criticality="error", check=is_not_null_and_not_empty_array("b")), ] @@ -62,11 +62,9 @@ def test_build_rules(): DQRuleColSet(columns=["a", "b"], criticality="error", filter="c>0", check_func=is_not_null_and_not_empty), DQRuleColSet(columns=["c"], criticality="warn", check_func=is_not_null_and_not_empty), # with check function params provided as positional arguments - DQRuleColSet(columns=["d", "e"], criticality="error", check_func=value_is_in_list, check_func_args=[[1, 2]]), + DQRuleColSet(columns=["d", "e"], criticality="error", check_func=is_in_list, check_func_args=[[1, 2]]), # with check function params provided as named arguments - DQRuleColSet( - columns=["f"], criticality="warn", check_func=value_is_in_list, check_func_kwargs={"allowed": [3]} - ), + DQRuleColSet(columns=["f"], criticality="warn", check_func=is_in_list, check_func_kwargs={"allowed": [3]}), # should be skipped DQRuleColSet(columns=[], criticality="error", check_func=is_not_null_and_not_empty), # set of columns for the same check @@ -74,21 +72,21 @@ def test_build_rules(): DQRuleColSet(columns=["c"], criticality="warn", check_func=is_not_null_and_not_empty_array), ) + [ DQRule(name="col_g_is_null_or_empty", criticality="warn", filter="a=0", check=is_not_null_and_not_empty("g")), - DQRule(criticality="warn", check=value_is_in_list("h", allowed=[1, 2])), + DQRule(criticality="warn", check=is_in_list("h", allowed=[1, 2])), ] expected_rules = [ DQRule(name="col_a_is_null_or_empty", criticality="error", filter="c>0", check=is_not_null_and_not_empty("a")), DQRule(name="col_b_is_null_or_empty", criticality="error", filter="c>0", check=is_not_null_and_not_empty("b")), DQRule(name="col_c_is_null_or_empty", criticality="warn", check=is_not_null_and_not_empty("c")), - DQRule(name="col_d_value_is_not_in_the_list", criticality="error", check=value_is_in_list("d", allowed=[1, 2])), - DQRule(name="col_e_value_is_not_in_the_list", criticality="error", check=value_is_in_list("e", allowed=[1, 2])), - DQRule(name="col_f_value_is_not_in_the_list", criticality="warn", check=value_is_in_list("f", allowed=[3])), + DQRule(name="col_d_is_not_in_the_list", criticality="error", check=is_in_list("d", allowed=[1, 2])), + DQRule(name="col_e_is_not_in_the_list", criticality="error", check=is_in_list("e", allowed=[1, 2])), + DQRule(name="col_f_is_not_in_the_list", criticality="warn", check=is_in_list("f", allowed=[3])), DQRule(name="col_a_is_null_or_empty_array", criticality="error", check=is_not_null_and_not_empty_array("a")), DQRule(name="col_b_is_null_or_empty_array", criticality="error", 
check=is_not_null_and_not_empty_array("b")), DQRule(name="col_c_is_null_or_empty_array", criticality="warn", check=is_not_null_and_not_empty_array("c")), DQRule(name="col_g_is_null_or_empty", criticality="warn", filter="a=0", check=is_not_null_and_not_empty("g")), - DQRule(name="col_h_value_is_not_in_the_list", criticality="warn", check=value_is_in_list("h", allowed=[1, 2])), + DQRule(name="col_h_is_not_in_the_list", criticality="warn", check=is_in_list("h", allowed=[1, 2])), ] assert pprint.pformat(actual_rules) == pprint.pformat(expected_rules) @@ -107,11 +105,11 @@ def test_build_rules_by_metadata(): { "criticality": "error", "filter": "c=0", - "check": {"function": "value_is_in_list", "arguments": {"col_names": ["d", "e"], "allowed": [1, 2]}}, + "check": {"function": "is_in_list", "arguments": {"col_names": ["d", "e"], "allowed": [1, 2]}}, }, { "criticality": "warn", - "check": {"function": "value_is_in_list", "arguments": {"col_names": ["f"], "allowed": [3]}}, + "check": {"function": "is_in_list", "arguments": {"col_names": ["f"], "allowed": [3]}}, }, { "name": "col_g_is_null_or_empty", @@ -120,7 +118,7 @@ def test_build_rules_by_metadata(): }, { "criticality": "warn", - "check": {"function": "value_is_in_list", "arguments": {"col_name": "h", "allowed": [1, 2]}}, + "check": {"function": "is_in_list", "arguments": {"col_name": "h", "allowed": [1, 2]}}, }, { "name": "d_not_in_a", @@ -146,20 +144,20 @@ def test_build_rules_by_metadata(): DQRule(name="col_b_is_null_or_empty", criticality="error", check=is_not_null_and_not_empty("b")), DQRule(name="col_c_is_null_or_empty", criticality="warn", filter="a>0", check=is_not_null_and_not_empty("c")), DQRule( - name="col_d_value_is_not_in_the_list", + name="col_d_is_not_in_the_list", criticality="error", filter="c=0", - check=value_is_in_list("d", allowed=[1, 2]), + check=is_in_list("d", allowed=[1, 2]), ), DQRule( - name="col_e_value_is_not_in_the_list", + name="col_e_is_not_in_the_list", criticality="error", filter="c=0", - check=value_is_in_list("e", allowed=[1, 2]), + check=is_in_list("e", allowed=[1, 2]), ), - DQRule(name="col_f_value_is_not_in_the_list", criticality="warn", check=value_is_in_list("f", allowed=[3])), + DQRule(name="col_f_is_not_in_the_list", criticality="warn", check=is_in_list("f", allowed=[3])), DQRule(name="col_g_is_null_or_empty", criticality="warn", check=is_not_null_and_not_empty("g")), - DQRule(name="col_h_value_is_not_in_the_list", criticality="warn", check=value_is_in_list("h", allowed=[1, 2])), + DQRule(name="col_h_is_not_in_the_list", criticality="warn", check=is_in_list("h", allowed=[1, 2])), DQRule( name="d_not_in_a", criticality="error", diff --git a/tests/unit/test_checks_validation.py b/tests/unit/test_checks_validation.py index 17817524..78f605ef 100644 --- a/tests/unit/test_checks_validation.py +++ b/tests/unit/test_checks_validation.py @@ -30,9 +30,9 @@ def test_valid_multiple_checks(): "check": {"function": "is_not_null_and_not_empty", "arguments": {"col_name": "b"}}, }, { - "name": "col_a_value_is_not_in_the_list", + "name": "col_a_is_not_in_the_list", "criticality": "warn", - "check": {"function": "value_is_in_list", "arguments": {"col_name": "a", "allowed": [1, 3, 4]}}, + "check": {"function": "is_in_list", "arguments": {"col_name": "a", "allowed": [1, 3, 4]}}, }, { "name": "col_a_is_null_or_empty_array", @@ -57,9 +57,9 @@ def test_invalid_multiple_checks(): "check": {"function": "is_not_null_and_not_empty", "arguments": {"col_name": "b"}}, }, { - "name": "col_a_value_is_not_in_the_list", + "name": 
"col_a_is_not_in_the_list", "criticality": "warn", - "check": {"function": "value_is_in_list", "arguments": {"col_name": "a", "allowed": 2}}, + "check": {"function": "is_in_list", "arguments": {"col_name": "a", "allowed": 2}}, }, { "name": "col_b_is_null_or_empty", @@ -77,7 +77,7 @@ def test_invalid_multiple_checks(): expected_errors = [ "No arguments provided for function 'is_not_null_and_not_empty' in the 'arguments' block", "Invalid value for 'criticality' field", - "Argument 'allowed' should be of type 'list' for function 'value_is_in_list' in the 'arguments' block", + "Argument 'allowed' should be of type 'list' for function 'is_in_list' in the 'arguments' block", "'check' field is missing", ] assert len(status.errors) == len(expected_errors) @@ -176,7 +176,7 @@ def test_col_names_argument_type_list(): checks = [ { "criticality": "warn", - "check": {"function": "value_is_in_list", "arguments": {"col_names": ["a", "b"], "allowed": [1, 3, 4]}}, + "check": {"function": "is_in_list", "arguments": {"col_names": ["a", "b"], "allowed": [1, 3, 4]}}, } ] status = DQEngine.validate_checks(checks) @@ -187,13 +187,12 @@ def test_col_functions_argument_mismtach_type(): checks = [ { "criticality": "warn", - "check": {"function": "value_is_in_list", "arguments": {"col_name": "a", "allowed": 2}}, + "check": {"function": "is_in_list", "arguments": {"col_name": "a", "allowed": 2}}, } ] status = DQEngine.validate_checks(checks) - assert ( - "Argument 'allowed' should be of type 'list' for function 'value_is_in_list' in the 'arguments' block" - in str(status) + assert "Argument 'allowed' should be of type 'list' for function 'is_in_list' in the 'arguments' block" in str( + status ) diff --git a/tests/unit/test_col_functions.py b/tests/unit/test_col_functions.py new file mode 100644 index 00000000..5a315d67 --- /dev/null +++ b/tests/unit/test_col_functions.py @@ -0,0 +1,43 @@ +import pytest +from databricks.labs.dqx.col_functions import ( + is_in_range, + is_not_in_range, + is_not_greater_than, + is_not_less_than, + is_in_list, + is_not_null_and_is_in_list, +) + +LIMIT_VALUE_ERROR = "Limit is not provided" + + +@pytest.mark.parametrize("min_limit, max_limit", [(None, 1), (1, None)]) +def test_col_is_in_range_missing_limits(min_limit, max_limit): + with pytest.raises(ValueError, match=LIMIT_VALUE_ERROR): + is_in_range("a", min_limit, max_limit) + + +@pytest.mark.parametrize("min_limit, max_limit", [(None, 1), (1, None)]) +def test_col_is_not_in_range_missing_limits(min_limit, max_limit): + with pytest.raises(ValueError, match=LIMIT_VALUE_ERROR): + is_not_in_range("a", min_limit, max_limit) + + +def test_col_not_greater_than_missing_limit(): + with pytest.raises(ValueError, match=LIMIT_VALUE_ERROR): + is_not_greater_than("a", limit=None) + + +def test_col_not_less_than_missing_limit(): + with pytest.raises(ValueError, match=LIMIT_VALUE_ERROR): + is_not_less_than("a", limit=None) + + +def test_col_is_not_null_and_is_in_list_missing_allowed_list(): + with pytest.raises(ValueError, match="allowed list is not provided"): + is_not_null_and_is_in_list("a", allowed=[]) + + +def test_col_is_in_list_missing_allowed_list(): + with pytest.raises(ValueError, match="allowed list is not provided"): + is_in_list("a", allowed=[]) diff --git a/tests/unit/resolve_check_function.py b/tests/unit/test_resolve_check_function.py similarity index 100% rename from tests/unit/resolve_check_function.py rename to tests/unit/test_resolve_check_function.py