From e9c196d1c1b38fd76f27a8db81b160813b3fc081 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Wed, 15 Oct 2025 12:20:26 -0400 Subject: [PATCH 01/28] fix(db-data-type): handle null values --- .../flavors/mssql/data_chars/schema_ddf_query_mssql.sql | 8 ++++---- .../postgresql/data_chars/schema_ddf_query_postgresql.sql | 6 +++--- .../redshift/data_chars/schema_ddf_query_redshift.sql | 4 ++-- .../snowflake/data_chars/schema_ddf_query_snowflake.sql | 8 ++++---- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/testgen/template/flavors/mssql/data_chars/schema_ddf_query_mssql.sql b/testgen/template/flavors/mssql/data_chars/schema_ddf_query_mssql.sql index 8b113f7..6c44d4c 100644 --- a/testgen/template/flavors/mssql/data_chars/schema_ddf_query_mssql.sql +++ b/testgen/template/flavors/mssql/data_chars/schema_ddf_query_mssql.sql @@ -14,12 +14,12 @@ SELECT '{PROJECT_CODE}' as project_code, ELSE c.data_type END AS column_type, CASE WHEN c.data_type LIKE '%char' OR c.data_type LIKE '%binary' - THEN c.data_type + '(' + CAST(c.character_maximum_length AS VARCHAR) + ')' + THEN c.data_type + COALESCE('(' + CAST(c.character_maximum_length AS VARCHAR) + ')', '') WHEN c.data_type IN ('datetime2', 'datetimeoffset', 'time') - THEN c.data_type + '(' + CAST(c.datetime_precision AS VARCHAR) + ')' + THEN c.data_type + COALESCE('(' + CAST(c.datetime_precision AS VARCHAR) + ')', '') WHEN c.data_type IN ('numeric', 'decimal') - THEN c.data_type + '(' + CAST(c.numeric_precision AS VARCHAR) + ',' - + CAST(c.numeric_scale AS VARCHAR) + ')' + THEN c.data_type + COALESCE('(' + CAST(c.numeric_precision AS VARCHAR) + ',' + + CAST(c.numeric_scale AS VARCHAR) + ')', '') ELSE c.data_type END AS db_data_type, c.character_maximum_length, c.ordinal_position, diff --git a/testgen/template/flavors/postgresql/data_chars/schema_ddf_query_postgresql.sql b/testgen/template/flavors/postgresql/data_chars/schema_ddf_query_postgresql.sql index aca74a1..24d5077 100644 --- a/testgen/template/flavors/postgresql/data_chars/schema_ddf_query_postgresql.sql +++ b/testgen/template/flavors/postgresql/data_chars/schema_ddf_query_postgresql.sql @@ -17,12 +17,12 @@ SELECT '{PROJECT_CODE}' as project_code, END AS column_type, CASE WHEN c.data_type ILIKE 'char%' OR c.data_type ILIKE 'bit%' - THEN c.data_type || '(' || CAST(c.character_maximum_length AS VARCHAR) || ')' + THEN c.data_type || COALESCE('(' || CAST(c.character_maximum_length AS VARCHAR) || ')', '') WHEN c.data_type = 'numeric' - THEN 'numeric' || COALESCE( '(' || CAST(c.numeric_precision AS VARCHAR) || ',' + THEN 'numeric' || COALESCE('(' || CAST(c.numeric_precision AS VARCHAR) || ',' || CAST(c.numeric_scale AS VARCHAR) || ')', '') WHEN c.data_type ILIKE 'time%' - THEN c.data_type || '(' || CAST(c.datetime_precision AS VARCHAR) || ')' + THEN c.data_type || COALESCE('(' || CAST(c.datetime_precision AS VARCHAR) || ')', '') ELSE c.data_type END AS db_data_type, COALESCE(c.character_maximum_length, CASE WHEN c.data_type IN ('text', 'character varying') THEN 65535 END) diff --git a/testgen/template/flavors/redshift/data_chars/schema_ddf_query_redshift.sql b/testgen/template/flavors/redshift/data_chars/schema_ddf_query_redshift.sql index cf61e7c..d54ba38 100644 --- a/testgen/template/flavors/redshift/data_chars/schema_ddf_query_redshift.sql +++ b/testgen/template/flavors/redshift/data_chars/schema_ddf_query_redshift.sql @@ -14,9 +14,9 @@ SELECT '{PROJECT_CODE}' as project_code, ELSE c.data_type END AS column_type, CASE WHEN c.data_type ILIKE 'char%' - THEN c.data_type || '(' || 
CAST(c.character_maximum_length AS VARCHAR) || ')' + THEN c.data_type || COALESCE('(' || CAST(c.character_maximum_length AS VARCHAR) || ')', '') WHEN c.data_type = 'numeric' - THEN 'numeric' || COALESCE( '(' || CAST(c.numeric_precision AS VARCHAR) || ',' + THEN 'numeric' || COALESCE('(' || CAST(c.numeric_precision AS VARCHAR) || ',' || CAST(c.numeric_scale AS VARCHAR) || ')', '') ELSE c.data_type END AS db_data_type, diff --git a/testgen/template/flavors/snowflake/data_chars/schema_ddf_query_snowflake.sql b/testgen/template/flavors/snowflake/data_chars/schema_ddf_query_snowflake.sql index 6e90f89..49e6c1e 100644 --- a/testgen/template/flavors/snowflake/data_chars/schema_ddf_query_snowflake.sql +++ b/testgen/template/flavors/snowflake/data_chars/schema_ddf_query_snowflake.sql @@ -17,12 +17,12 @@ SELECT '{PROJECT_CODE}' as project_code, END AS column_type, CASE WHEN c.data_type = 'TEXT' - THEN 'VARCHAR(' || CAST(c.character_maximum_length AS VARCHAR) || ')' + THEN 'VARCHAR' || COALESCE('(' || CAST(c.character_maximum_length AS VARCHAR) || ')', '') WHEN c.data_type = 'NUMBER' - THEN c.data_type || '(' || CAST(c.numeric_precision AS VARCHAR) || ',' - || CAST(c.numeric_scale AS VARCHAR) || ')' + THEN c.data_type || COALESCE('(' || CAST(c.numeric_precision AS VARCHAR) || ',' + || CAST(c.numeric_scale AS VARCHAR) || ')', '') WHEN c.data_type ILIKE 'TIME%' - THEN c.data_type || '(' || CAST(c.datetime_precision AS VARCHAR) || ')' + THEN c.data_type || COALESCE('(' || CAST(c.datetime_precision AS VARCHAR) || ')', '') ELSE c.data_type END AS db_data_type, c.character_maximum_length, From 10e0d86f42013146852f78a36207dcea8130d7ad Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Wed, 15 Oct 2025 12:20:45 -0400 Subject: [PATCH 02/28] fix(runs): empty state css --- testgen/ui/components/frontend/js/pages/profiling_runs.js | 5 ++--- testgen/ui/components/frontend/js/pages/test_runs.js | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/testgen/ui/components/frontend/js/pages/profiling_runs.js b/testgen/ui/components/frontend/js/pages/profiling_runs.js index 49f073c..de98152 100644 --- a/testgen/ui/components/frontend/js/pages/profiling_runs.js +++ b/testgen/ui/components/frontend/js/pages/profiling_runs.js @@ -80,12 +80,11 @@ const ProfilingRuns = (/** @type Properties */ props) => { resizeFrameHeightOnDOMChange(wrapperId); return div( - { id: wrapperId }, + { id: wrapperId, class: 'tg-profiling-runs' }, () => { const projectSummary = getValue(props.project_summary); return projectSummary.profiling_run_count > 0 ? div( - { class: 'tg-profiling-runs' }, Toolbar(props, userCanEdit), () => profilingRuns.val.length ? div( @@ -408,7 +407,7 @@ const ConditionalEmptyState = ( const stylesheet = new CSSStyleSheet(); stylesheet.replace(` .tg-profiling-runs { - min-height: 500px; + min-height: 550px; } `); diff --git a/testgen/ui/components/frontend/js/pages/test_runs.js b/testgen/ui/components/frontend/js/pages/test_runs.js index 8b148f9..04a00b1 100644 --- a/testgen/ui/components/frontend/js/pages/test_runs.js +++ b/testgen/ui/components/frontend/js/pages/test_runs.js @@ -81,12 +81,11 @@ const TestRuns = (/** @type Properties */ props) => { resizeFrameHeightOnDOMChange(wrapperId); return div( - { id: wrapperId }, + { id: wrapperId, class: 'tg-test-runs' }, () => { const projectSummary = getValue(props.project_summary); return projectSummary.test_run_count > 0 ? div( - { class: 'tg-test-runs' }, Toolbar(props, userCanEdit), () => testRuns.val.length ? 
div( @@ -406,7 +405,7 @@ const ConditionalEmptyState = ( const stylesheet = new CSSStyleSheet(); stylesheet.replace(` .tg-test-runs { - min-height: 500px; + min-height: 550px; } `); From 556e6a2e1623597384decd34a5e9928fda7d983c Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Wed, 15 Oct 2025 12:21:47 -0400 Subject: [PATCH 03/28] misc(docker): upgrade base image --- deploy/testgen-base.dockerfile | 6 +++--- deploy/testgen.dockerfile | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/deploy/testgen-base.dockerfile b/deploy/testgen-base.dockerfile index f04aa3b..08976d9 100644 --- a/deploy/testgen-base.dockerfile +++ b/deploy/testgen-base.dockerfile @@ -1,4 +1,4 @@ -FROM python:3.12.7-alpine3.20 +FROM python:3.12-alpine3.22 ENV LANG=C.UTF-8 ENV LC_ALL=C.UTF-8 @@ -14,7 +14,7 @@ RUN apk update && apk upgrade && apk add --no-cache \ cmake \ musl-dev \ gfortran \ - linux-headers=6.6-r0 \ + linux-headers=6.14.2-r0 \ # Tools needed for installing the MSSQL ODBC drivers \ curl \ gpg \ @@ -25,7 +25,7 @@ RUN apk update && apk upgrade && apk add --no-cache \ unixodbc=2.3.12-r0 \ unixodbc-dev=2.3.12-r0 \ # Pinned versions for security - xz=5.6.2-r1 + xz=5.8.1-r0 RUN apk add --no-cache \ --repository https://dl-cdn.alpinelinux.org/alpine/v3.21/community \ diff --git a/deploy/testgen.dockerfile b/deploy/testgen.dockerfile index 58e15db..e759822 100644 --- a/deploy/testgen.dockerfile +++ b/deploy/testgen.dockerfile @@ -1,4 +1,4 @@ -ARG TESTGEN_BASE_LABEL=v7 +ARG TESTGEN_BASE_LABEL=v8 FROM datakitchen/dataops-testgen-base:${TESTGEN_BASE_LABEL} AS release-image From f5f0673e2b78df42b6db673edff318a1f2d7783d Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Fri, 3 Oct 2025 15:34:32 -0400 Subject: [PATCH 04/28] feat(profiling): add progress, approx row counts and error handling --- testgen/__main__.py | 12 +- testgen/commands/queries/contingency_query.py | 51 ++ .../queries/execute_cat_tests_query.py | 8 +- testgen/commands/queries/profiling_query.py | 510 +++++++----------- .../queries/refresh_data_chars_query.py | 165 ++++-- .../commands/queries/rollup_scores_query.py | 13 +- testgen/commands/run_execute_cat_tests.py | 8 +- testgen/commands/run_execute_tests.py | 17 +- testgen/commands/run_launch_db_config.py | 1 - .../run_pairwise_contingency_check.py | 147 +++++ testgen/commands/run_profiling.py | 317 +++++++++++ testgen/commands/run_profiling_bridge.py | 499 ----------------- testgen/commands/run_refresh_data_chars.py | 102 ++-- .../run_refresh_score_cards_results.py | 18 +- testgen/commands/run_rollup_scores.py | 6 +- testgen/commands/run_upgrade_db_config.py | 2 - testgen/common/database/database_service.py | 138 ++--- testgen/common/get_pipeline_parms.py | 28 - testgen/common/models/profiling_run.py | 90 +++- testgen/common/models/table_group.py | 83 ++- testgen/common/read_yaml_metadata_records.py | 1 - .../contingency_columns.sql | 0 .../contingency_counts.sql | 0 .../data_chars/data_chars_staging_delete.sql | 3 +- .../template/data_chars/data_chars_update.sql | 35 +- .../030_initialize_new_schema_structure.sql | 13 +- .../dbsetup/060_create_standard_views.sql | 77 --- .../dbupgrade/0157_incremental_upgrade.sql | 25 + ..._query_bigquery.sql => get_schema_ddf.sql} | 10 +- ...roject_get_table_sample_count_bigquery.sql | 30 -- ...uery.yaml => project_profiling_query.yaml} | 103 ++-- ... 
=> project_secondary_profiling_query.sql} | 0 ...uery_databricks.sql => get_schema_ddf.sql} | 9 +- ...ject_get_table_sample_count_databricks.sql | 23 - ...icks.yaml => project_profiling_query.yaml} | 95 ++-- ... => project_secondary_profiling_query.sql} | 0 ...ddf_query_mssql.sql => get_schema_ddf.sql} | 20 +- .../project_get_table_sample_count_mssql.sql | 23 - ...ssql.yaml => project_profiling_query.yaml} | 96 ++-- ... => project_secondary_profiling_query.sql} | 0 ...uery_postgresql.sql => get_schema_ddf.sql} | 12 +- ...ject_get_table_sample_count_postgresql.sql | 23 - ...esql.yaml => project_profiling_query.yaml} | 97 ++-- ... => project_secondary_profiling_query.sql} | 0 ..._query_redshift.sql => get_schema_ddf.sql} | 14 +- ...roject_get_table_sample_count_redshift.sql | 23 - ...hift.yaml => project_profiling_query.yaml} | 96 ++-- ... => project_secondary_profiling_query.sql} | 0 ...dshift_spectrum.sql => get_schema_ddf.sql} | 12 +- ...t_table_sample_count_redshift_spectrum.sql | 23 - ...trum.yaml => project_profiling_query.yaml} | 96 ++-- ... => project_secondary_profiling_query.sql} | 0 ...query_snowflake.sql => get_schema_ddf.sql} | 10 +- ...oject_get_table_sample_count_snowflake.sql | 23 - ...lake.yaml => project_profiling_query.yaml} | 96 ++-- ... => project_secondary_profiling_query.sql} | 0 .../project_get_table_sample_count_trino.sql | 23 - ...rino.yaml => project_profiling_query.yaml} | 96 ++-- testgen/template/parms/parms_profiling.sql | 28 - .../profiling/functional_datatype.sql | 26 + .../profiling/functional_tabletype_update.sql | 24 + .../project_profile_run_record_insert.sql | 8 - .../project_profile_run_record_update.sql | 5 - ...ct_update_profile_results_to_estimates.sql | 4 +- .../template/profiling/refresh_anomalies.sql | 23 +- testgen/ui/components/frontend/css/shared.css | 16 +- .../frontend/js/components/score_issues.js | 2 +- .../frontend/js/components/select.js | 2 +- .../js/components/table_group_stats.js | 130 +++++ .../js/components/table_group_test.js | 83 +-- .../js/data_profiling/column_distribution.js | 64 ++- .../frontend/js/data_profiling/data_issues.js | 14 +- .../js/data_profiling/data_profiling_utils.js | 8 +- .../frontend/js/data_profiling/table_size.js | 22 +- .../frontend/js/pages/profiling_runs.js | 186 +++++-- .../frontend/js/pages/project_dashboard.js | 22 +- .../frontend/js/pages/run_profiling_dialog.js | 131 +++-- .../frontend/js/pages/table_group_wizard.js | 20 +- .../components/frontend/js/pages/test_runs.js | 2 +- testgen/ui/queries/profiling_queries.py | 10 +- testgen/ui/queries/table_group_queries.py | 183 ++++--- testgen/ui/views/connections.py | 18 +- testgen/ui/views/data_catalog.py | 4 +- .../ui/views/dialogs/run_profiling_dialog.py | 118 ++-- testgen/ui/views/profiling_results.py | 7 +- testgen/ui/views/profiling_runs.py | 4 +- testgen/ui/views/table_groups.py | 76 +-- testgen/ui/views/test_definitions.py | 2 +- testgen/utils/__init__.py | 4 + tests/unit/test_profiling_query.py | 68 --- tests/unit/test_refresh_data_chars_query.py | 62 +++ 91 files changed, 2361 insertions(+), 2437 deletions(-) create mode 100644 testgen/commands/queries/contingency_query.py create mode 100644 testgen/commands/run_pairwise_contingency_check.py create mode 100644 testgen/commands/run_profiling.py delete mode 100644 testgen/commands/run_profiling_bridge.py rename testgen/template/{profiling => contingency}/contingency_columns.sql (100%) rename testgen/template/{flavors/generic/profiling => contingency}/contingency_counts.sql (100%) create mode 100644 
testgen/template/dbupgrade/0157_incremental_upgrade.sql rename testgen/template/flavors/bigquery/data_chars/{schema_ddf_query_bigquery.sql => get_schema_ddf.sql} (85%) delete mode 100644 testgen/template/flavors/bigquery/profiling/project_get_table_sample_count_bigquery.sql rename testgen/template/flavors/bigquery/profiling/{project_profiling_query_bigquery.yaml => project_profiling_query.yaml} (82%) rename testgen/template/flavors/bigquery/profiling/{project_secondary_profiling_query_bigquery.sql => project_secondary_profiling_query.sql} (100%) rename testgen/template/flavors/databricks/data_chars/{schema_ddf_query_databricks.sql => get_schema_ddf.sql} (86%) delete mode 100644 testgen/template/flavors/databricks/profiling/project_get_table_sample_count_databricks.sql rename testgen/template/flavors/databricks/profiling/{project_profiling_query_databricks.yaml => project_profiling_query.yaml} (83%) rename testgen/template/flavors/databricks/profiling/{project_secondary_profiling_query_databricks.sql => project_secondary_profiling_query.sql} (100%) rename testgen/template/flavors/mssql/data_chars/{schema_ddf_query_mssql.sql => get_schema_ddf.sql} (78%) delete mode 100644 testgen/template/flavors/mssql/profiling/project_get_table_sample_count_mssql.sql rename testgen/template/flavors/mssql/profiling/{project_profiling_query_mssql.yaml => project_profiling_query.yaml} (82%) rename testgen/template/flavors/mssql/profiling/{project_secondary_profiling_query_mssql.sql => project_secondary_profiling_query.sql} (100%) rename testgen/template/flavors/postgresql/data_chars/{schema_ddf_query_postgresql.sql => get_schema_ddf.sql} (87%) delete mode 100644 testgen/template/flavors/postgresql/profiling/project_get_table_sample_count_postgresql.sql rename testgen/template/flavors/postgresql/profiling/{project_profiling_query_postgresql.yaml => project_profiling_query.yaml} (80%) rename testgen/template/flavors/postgresql/profiling/{project_secondary_profiling_query_postgresql.sql => project_secondary_profiling_query.sql} (100%) rename testgen/template/flavors/redshift/data_chars/{schema_ddf_query_redshift.sql => get_schema_ddf.sql} (85%) delete mode 100644 testgen/template/flavors/redshift/profiling/project_get_table_sample_count_redshift.sql rename testgen/template/flavors/redshift/profiling/{project_profiling_query_redshift.yaml => project_profiling_query.yaml} (78%) rename testgen/template/flavors/redshift/profiling/{project_secondary_profiling_query_redshift.sql => project_secondary_profiling_query.sql} (100%) rename testgen/template/flavors/redshift_spectrum/data_chars/{schema_ddf_query_redshift_spectrum.sql => get_schema_ddf.sql} (76%) delete mode 100644 testgen/template/flavors/redshift_spectrum/profiling/project_get_table_sample_count_redshift_spectrum.sql rename testgen/template/flavors/redshift_spectrum/profiling/{project_profiling_query_redshift_spectrum.yaml => project_profiling_query.yaml} (78%) rename testgen/template/flavors/redshift_spectrum/profiling/{project_secondary_profiling_query_redshift_spectrum.sql => project_secondary_profiling_query.sql} (100%) rename testgen/template/flavors/snowflake/data_chars/{schema_ddf_query_snowflake.sql => get_schema_ddf.sql} (89%) delete mode 100644 testgen/template/flavors/snowflake/profiling/project_get_table_sample_count_snowflake.sql rename testgen/template/flavors/snowflake/profiling/{project_profiling_query_snowflake.yaml => project_profiling_query.yaml} (79%) rename 
testgen/template/flavors/snowflake/profiling/{project_secondary_profiling_query_snowflake.sql => project_secondary_profiling_query.sql} (100%) delete mode 100644 testgen/template/flavors/trino/profiling/project_get_table_sample_count_trino.sql rename testgen/template/flavors/trino/profiling/{project_profiling_query_trino.yaml => project_profiling_query.yaml} (81%) delete mode 100644 testgen/template/parms/parms_profiling.sql delete mode 100644 testgen/template/profiling/project_profile_run_record_insert.sql delete mode 100644 testgen/template/profiling/project_profile_run_record_update.sql create mode 100644 testgen/ui/components/frontend/js/components/table_group_stats.js delete mode 100644 tests/unit/test_profiling_query.py create mode 100644 tests/unit/test_refresh_data_chars_query.py diff --git a/testgen/__main__.py b/testgen/__main__.py index c0d7a7f..a6578e7 100644 --- a/testgen/__main__.py +++ b/testgen/__main__.py @@ -29,7 +29,7 @@ ) from testgen.commands.run_launch_db_config import run_launch_db_config from testgen.commands.run_observability_exporter import run_observability_exporter -from testgen.commands.run_profiling_bridge import run_profiling_queries +from testgen.commands.run_profiling import run_profiling from testgen.commands.run_quick_start import run_quick_start, run_quick_start_increment from testgen.commands.run_test_metadata_exporter import run_test_metadata_exporter from testgen.commands.run_upgrade_db_config import get_schema_revision, is_db_revision_up_to_date, run_upgrade_db_config @@ -124,10 +124,7 @@ def cli(ctx: Context, verbose: bool): ) def run_profile(configuration: Configuration, table_group_id: str): click.echo(f"run-profile with table_group_id: {table_group_id}") - spinner = None - if not configuration.verbose: - spinner = MoonSpinner("Processing ... ") - message = run_profiling_queries(table_group_id, spinner=spinner) + message = run_profiling(table_group_id) click.echo("\n" + message) @@ -374,10 +371,7 @@ def quick_start( table_group_id="0ea85e17-acbe-47fe-8394-9970725ad37d" click.echo(f"run-profile with table_group_id: {table_group_id}") - spinner = None - if not configuration.verbose: - spinner = MoonSpinner("Processing ... 
") - message = run_profiling_queries(table_group_id, spinner=spinner, minutes_offset=minutes_offset) + message = run_profiling(table_group_id, minutes_offset=minutes_offset) click.echo("\n" + message) LOG.info(f"run-test-generation with table_group_id: {table_group_id} test_suite: {settings.DEFAULT_TEST_SUITE_KEY}") diff --git a/testgen/commands/queries/contingency_query.py b/testgen/commands/queries/contingency_query.py new file mode 100644 index 0000000..0a8437c --- /dev/null +++ b/testgen/commands/queries/contingency_query.py @@ -0,0 +1,51 @@ +# UNUSED CODE - TO BE REVIVED LATER + +import dataclasses +from uuid import UUID + +from testgen.common import read_template_sql_file +from testgen.common.database.database_service import quote_csv_items, replace_params + + +@dataclasses.dataclass +class ContingencyTable: + schema_name: str + table_name: str + contingency_columns: str + + +class ContingencySQL: + + contingency_max_values = 6 + + def _get_query( + self, + template_file_name: str, + sub_directory: str | None = "contingency", + params: dict | None = None, + ) -> tuple[str | None, dict]: + query = read_template_sql_file(template_file_name, sub_directory) + query = replace_params(query, params or {}) + + return query, params + + def get_contingency_columns(self, profiling_run_id: UUID) -> tuple[str, dict]: + # Runs on App database + return self._get_query( + "contingency_columns.sql", + params={ + "PROFILE_RUN_ID": profiling_run_id, + "CONTINGENCY_MAX_VALUES": self.contingency_max_values, + }, + ) + + def get_contingency_counts(self, contingency_table: ContingencyTable) -> tuple[str, dict]: + # Runs on Target database + return self._get_query( + "contingency_counts.sql", + params={ + "DATA_SCHEMA": contingency_table.schema_name, + "DATA_TABLE": contingency_table.table_name, + "CONTINGENCY_COLUMNS": quote_csv_items(contingency_table.contingency_columns), + }, + ) diff --git a/testgen/commands/queries/execute_cat_tests_query.py b/testgen/commands/queries/execute_cat_tests_query.py index 5f70a59..7ff5347 100644 --- a/testgen/commands/queries/execute_cat_tests_query.py +++ b/testgen/commands/queries/execute_cat_tests_query.py @@ -1,6 +1,6 @@ from typing import ClassVar, TypedDict -from testgen.commands.queries.rollup_scores_query import CRollupScoresSQL +from testgen.commands.queries.rollup_scores_query import RollupScoresSQL from testgen.common import date_service, read_template_sql_file from testgen.common.database.database_service import get_flavor_service, replace_params from testgen.common.read_file import replace_templated_functions @@ -27,7 +27,7 @@ class CCATExecutionSQL: target_table = "" cat_test_params: ClassVar[CATTestParams] = {} - _rollup_scores_sql: CRollupScoresSQL = None + _rollup_scores_sql: RollupScoresSQL = None def __init__(self, strProjectCode, strTestSuiteId, strTestSuite, strSQLFlavor, max_query_chars, minutes_offset=0): # Defaults @@ -40,9 +40,9 @@ def __init__(self, strProjectCode, strTestSuiteId, strTestSuite, strSQLFlavor, m self.today = date_service.get_now_as_string_with_offset(minutes_offset) self.minutes_offset = minutes_offset - def _get_rollup_scores_sql(self) -> CRollupScoresSQL: + def _get_rollup_scores_sql(self) -> RollupScoresSQL: if not self._rollup_scores_sql: - self._rollup_scores_sql = CRollupScoresSQL(self.test_run_id, self.table_groups_id) + self._rollup_scores_sql = RollupScoresSQL(self.test_run_id, self.table_groups_id) return self._rollup_scores_sql diff --git a/testgen/commands/queries/profiling_query.py 
b/testgen/commands/queries/profiling_query.py index 93dbe03..3d8c0e6 100644 --- a/testgen/commands/queries/profiling_query.py +++ b/testgen/commands/queries/profiling_query.py @@ -1,158 +1,139 @@ +import dataclasses import re -import typing - -from testgen.commands.queries.refresh_data_chars_query import CRefreshDataCharsSQL -from testgen.commands.queries.rollup_scores_query import CRollupScoresSQL -from testgen.common import date_service, read_template_sql_file, read_template_yaml_file -from testgen.common.database.database_service import get_flavor_service, replace_params +from uuid import UUID + +from testgen.commands.queries.refresh_data_chars_query import ColumnChars +from testgen.common import read_template_sql_file, read_template_yaml_file +from testgen.common.database.database_service import replace_params +from testgen.common.models.connection import Connection +from testgen.common.models.profiling_run import ProfilingRun +from testgen.common.models.table_group import TableGroup from testgen.common.read_file import replace_templated_functions -class CProfilingSQL: - dctSnippetTemplate: typing.ClassVar = {} - - project_code = "" - connection_id = "" - table_groups_id = "" - flavor = "" - run_date = "" - data_schema = "" - data_table = "" - - col_name = "" - col_gen_type = "" - col_type = "" - db_data_type = "" - col_ordinal_position = "0" - col_is_decimal = "" - col_top_freq_update = "" - - parm_table_set = None - parm_table_include_mask = None - parm_table_exclude_mask = None - parm_do_patterns = "Y" - parm_max_pattern_length = 25 - parm_do_freqs = "Y" - parm_do_sample = "N" - parm_sample_size = 0 - profile_run_id = "" - profile_id_column_mask = "" - profile_sk_column_mask = "" - profile_use_sampling = "" - profile_flag_cdes = False - profile_sample_percent = "" - profile_sample_min_count = "" - - sampling_table = "" - sample_ratio = "" - sample_percent_calc = "" - - process_id = None - - contingency_max_values = "4" - contingency_columns = "" - - exception_message = "" - minutes_offset = 0 - - _data_chars_sql: CRefreshDataCharsSQL = None - _rollup_scores_sql: CRollupScoresSQL = None - - def __init__(self, strProjectCode, flavor, minutes_offset=0): - self.flavor = flavor - self.project_code = strProjectCode - # Defaults - self.run_date = date_service.get_now_as_string_with_offset(minutes_offset) - self.today = date_service.get_now_as_string_with_offset(minutes_offset) +@dataclasses.dataclass +class TableSampling: + table_name: str + sample_count: int + sample_ratio: float + sample_percent: float + + +@dataclasses.dataclass +class HygieneIssueType: + id: str + anomaly_type: str + data_object: str + anomaly_criteria: str + detail_expression: str + dq_score_prevalence_formula: str + dq_score_risk_factor: str + + +class ProfilingSQL: + + profiling_results_table = "profile_results" + frequency_staging_table = "stg_secondary_profile_updates" + error_columns = ( + "project_code", + "connection_id", + "table_groups_id", + "schema_name", + "profile_run_id", + "run_date", + "table_name", + "column_name", + "position", + "column_type", + "general_type", + "db_data_type", + "record_ct", + "query_error", + ) + + max_pattern_length = 25 + max_error_length = 2000 + + def __init__( + self, + connection: Connection, + table_group: TableGroup, + profiling_run: ProfilingRun, + minutes_offset: int = 0, + ): + self.connection = connection + self.table_group = table_group + self.profiling_run = profiling_run + self.flavor = connection.sql_flavor self.minutes_offset = minutes_offset + 
self._profiling_template: dict = None - def _get_data_chars_sql(self) -> CRefreshDataCharsSQL: - if not self._data_chars_sql: - params = { - "project_code": self.project_code, - "sql_flavor": self.flavor, - "table_group_schema": self.data_schema, - "table_groups_id": self.table_groups_id, - "max_query_chars": None, - "profiling_table_set": self.parm_table_set, - "profiling_include_mask": self.parm_table_include_mask, - "profiling_exclude_mask": self.parm_table_exclude_mask, - } - self._data_chars_sql = CRefreshDataCharsSQL(params, self.run_date, "v_latest_profile_results") - - return self._data_chars_sql - - def _get_rollup_scores_sql(self) -> CRollupScoresSQL: - if not self._rollup_scores_sql: - self._rollup_scores_sql = CRollupScoresSQL(self.profile_run_id, self.table_groups_id) - - return self._rollup_scores_sql - - def _get_params(self) -> dict: - return { - "PROJECT_CODE": self.project_code, - "CONNECTION_ID": self.connection_id, - "TABLE_GROUPS_ID": self.table_groups_id, - "RUN_DATE": self.run_date, - "DATA_SCHEMA": self.data_schema, - "DATA_TABLE": self.data_table, - "COL_NAME": self.col_name, - "COL_NAME_SANITIZED": self.col_name.replace("'", "''"), - "COL_GEN_TYPE": self.col_gen_type, - "COL_TYPE": self.col_type or "", - "DB_DATA_TYPE": self.db_data_type or "", - "COL_POS": self.col_ordinal_position, - "TOP_FREQ": self.col_top_freq_update, - "PROFILE_RUN_ID": self.profile_run_id, - "PROFILE_ID_COLUMN_MASK": self.profile_id_column_mask, - "PROFILE_SK_COLUMN_MASK": self.profile_sk_column_mask, - "START_TIME": self.today, - "NOW_TIMESTAMP": date_service.get_now_as_string_with_offset(minutes_offset=self.minutes_offset), - "EXCEPTION_MESSAGE": self.exception_message, - "SAMPLING_TABLE": self.sampling_table, - "SAMPLE_SIZE": int(self.parm_sample_size), - "PROFILE_USE_SAMPLING": self.profile_use_sampling, - "PROFILE_SAMPLE_PERCENT": self.profile_sample_percent, - "PROFILE_SAMPLE_MIN_COUNT": self.profile_sample_min_count, - "PROFILE_SAMPLE_RATIO": self.sample_ratio, - "SAMPLE_PERCENT_CALC": self.sample_percent_calc, - "PARM_MAX_PATTERN_LENGTH": self.parm_max_pattern_length, - "CONTINGENCY_COLUMNS": self.contingency_columns, - "CONTINGENCY_MAX_VALUES": self.contingency_max_values, - "PROCESS_ID": self.process_id, + def _get_params(self, column_chars: ColumnChars | None = None, table_sampling: TableSampling | None = None) -> dict: + params = { + "PROJECT_CODE": self.table_group.project_code, + "CONNECTION_ID": self.connection.connection_id, + "TABLE_GROUPS_ID": self.table_group.id, + "PROFILE_RUN_ID": self.profiling_run.id, + "RUN_DATE": self.profiling_run.profiling_starttime, "SQL_FLAVOR": self.flavor, - "QUOTE": get_flavor_service(self.flavor).quote_character + "DATA_SCHEMA": self.table_group.table_group_schema, + "PROFILE_ID_COLUMN_MASK": self.table_group.profile_id_column_mask, + "PROFILE_SK_COLUMN_MASK": self.table_group.profile_sk_column_mask, + "MAX_PATTERN_LENGTH": self.max_pattern_length, } + if column_chars: + params.update({ + "DATA_TABLE": column_chars.table_name, + "COL_NAME": column_chars.column_name, + "COL_NAME_SANITIZED": column_chars.column_name.replace("'", "''"), + "COL_GEN_TYPE": column_chars.general_type, + "COL_TYPE": column_chars.column_type, + "DB_DATA_TYPE": column_chars.db_data_type, + "COL_POS": column_chars.ordinal_position, + }) + if table_sampling: + params.update({ + "SAMPLING_TABLE": table_sampling.table_name, + "SAMPLE_SIZE": table_sampling.sample_count, + "PROFILE_SAMPLE_RATIO": table_sampling.sample_ratio, + "SAMPLE_PERCENT_CALC": 
table_sampling.sample_percent, + }) + return params def _get_query( self, template_file_name: str, sub_directory: str | None = "profiling", extra_params: dict | None = None, + column_chars: ColumnChars | None = None, + table_sampling: TableSampling | None = None, ) -> tuple[str | None, dict]: query = read_template_sql_file(template_file_name, sub_directory) params = {} if query: - query = self._process_conditionals(query) + query = self._process_conditionals(query, extra_params) + params.update(self._get_params(column_chars, table_sampling)) if extra_params: params.update(extra_params) - params.update(self._get_params()) query = replace_params(query, params) query = replace_templated_functions(query, self.flavor) return query, params - def _process_conditionals(self, query: str): + def _process_conditionals(self, query: str, extra_params: dict | None = None) -> str: re_pattern = re.compile(r"^--\s+TG-(IF|ELSE|ENDIF)(?:\s+(\w+))?\s*$") condition = None updated_query = [] for line in query.splitlines(True): if re_match := re_pattern.match(line): match re_match.group(1): - case "IF" if condition is None and re_match.group(2) is not None: - condition = bool(getattr(self, re_match.group(2))) + case "IF" if condition is None and (variable := re_match.group(2)) is not None: + result = extra_params.get(variable) + if result is None: + result = getattr(self, variable, None) + condition = bool(result) case "ELSE" if condition is not None: condition = not condition case "ENDIF" if condition is not None: @@ -166,68 +147,56 @@ def _process_conditionals(self, query: str): raise ValueError("Template conditional misused") return "".join(updated_query) + + def _get_profiling_template(self) -> dict: + if not self._profiling_template: + self._profiling_template = read_template_yaml_file( + "project_profiling_query.yaml", + sub_directory=f"flavors/{self.flavor}/profiling", + ) + return self._profiling_template - @property - def do_sample_bool(self): - return self.parm_do_sample == "Y" - - def GetSecondProfilingColumnsQuery(self) -> tuple[str, dict]: + def get_frequency_analysis_columns(self) -> tuple[str, dict]: # Runs on App database return self._get_query("secondary_profiling_columns.sql") - def GetSecondProfilingUpdateQuery(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("secondary_profiling_update.sql") - - def GetSecondProfilingStageDeleteQuery(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("secondary_profiling_delete.sql") - - def GetDataTypeSuggestionUpdateQuery(self) -> tuple[str, dict]: + def update_frequency_analysis_results(self) -> list[tuple[str, dict]]: # Runs on App database - return self._get_query("datatype_suggestions.sql") + return [ + self._get_query("secondary_profiling_update.sql"), + self._get_query("secondary_profiling_delete.sql"), + ] - def GetFunctionalDataTypeUpdateQuery(self) -> tuple[str, dict]: + def update_profiling_results(self) -> list[tuple[str, dict]]: # Runs on App database - return self._get_query("functional_datatype.sql") - - def GetFunctionalTableTypeStageQuery(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("functional_tabletype_stage.sql") - - def GetFunctionalTableTypeUpdateQuery(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("functional_tabletype_update.sql") - - def GetPIIFlagUpdateQuery(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("pii_flag.sql") - - def GetAnomalyStatsRefreshQuery(self) -> tuple[str, dict]: + 
queries = [ + self._get_query("datatype_suggestions.sql"), + self._get_query("functional_datatype.sql"), + self._get_query("functional_tabletype_stage.sql"), + self._get_query("functional_tabletype_update.sql"), + self._get_query("pii_flag.sql"), + ] + if self.table_group.profile_flag_cdes: + queries.append(self._get_query("cde_flagger_query.sql")) + return queries + + def update_hygiene_issue_counts(self) -> tuple[str, dict]: # Runs on App database return self._get_query("refresh_anomalies.sql") - def GetAnomalyScoringRollupRunQuery(self) -> tuple[str, dict]: - # Runs on App database - return self._get_rollup_scores_sql().GetRollupScoresProfileRunQuery() - - def GetAnomalyScoringRollupTableGroupQuery(self) -> tuple[str, dict]: - # Runs on App database - return self._get_rollup_scores_sql().GetRollupScoresProfileTableGroupQuery() - - def GetAnomalyTestTypesQuery(self) -> tuple[str, dict]: + def get_hygiene_issue_types(self) -> tuple[str, dict]: # Runs on App database return self._get_query("profile_anomaly_types_get.sql") - def GetAnomalyTestQuery(self, test_type: dict) -> tuple[str, dict] | None: + def detect_hygiene_issue(self, issue_type: HygieneIssueType) -> tuple[str, dict] | None: # Runs on App database extra_params = { - "ANOMALY_ID": test_type["id"], - "DETAIL_EXPRESSION": test_type["detail_expression"], - "ANOMALY_CRITERIA": test_type["anomaly_criteria"], + "ANOMALY_ID": issue_type.id, + "DETAIL_EXPRESSION": issue_type.detail_expression, + "ANOMALY_CRITERIA": issue_type.anomaly_criteria, } - match test_type["data_object"]: + match issue_type.data_object: case "Column": query, params = self._get_query("profile_anomalies_screen_column.sql", extra_params=extra_params) case "Multi-Col": @@ -243,157 +212,90 @@ def GetAnomalyTestQuery(self, test_type: dict) -> tuple[str, dict] | None: return query, params - def GetAnomalyScoringQuery(self, test_type: dict) -> tuple[str, dict]: + def update_hygiene_issue_prevalence(self, issue_type: HygieneIssueType) -> tuple[str, dict]: # Runs on App database query = read_template_sql_file("profile_anomaly_scoring.sql", sub_directory="profiling") params = { - "PROFILE_RUN_ID": self.profile_run_id, - "ANOMALY_ID": test_type["id"], - "PREV_FORMULA": test_type["dq_score_prevalence_formula"], - "RISK": test_type["dq_score_risk_factor"], + "PROFILE_RUN_ID": self.profiling_run.id, + "ANOMALY_ID": issue_type.id, + "PREV_FORMULA": issue_type.dq_score_prevalence_formula, + "RISK": issue_type.dq_score_risk_factor, } query = replace_params(query, params) return query, params - def GetDataCharsRefreshQuery(self) -> tuple[str, dict]: - # Runs on App database - return self._get_data_chars_sql().GetDataCharsUpdateQuery() - - def GetCDEFlaggerQuery(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("cde_flagger_query.sql") - - def GetProfileRunInfoRecordsQuery(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("project_profile_run_record_insert.sql") - - def GetProfileRunInfoRecordUpdateQuery(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("project_profile_run_record_update.sql") - - def GetDDFQuery(self) -> tuple[str, dict]: - # Runs on Target database - return self._get_data_chars_sql().GetDDFQuery() - - def GetProfilingQuery(self) -> tuple[str, dict]: + def run_column_profiling(self, column_chars: ColumnChars, table_sampling: TableSampling | None = None) -> tuple[str, dict]: # Runs on Target database - if not self.dctSnippetTemplate: - self.dctSnippetTemplate = read_template_yaml_file( 
- f"project_profiling_query_{self.flavor}.yaml", sub_directory=f"flavors/{self.flavor}/profiling" - ) - - dctSnippetTemplate = self.dctSnippetTemplate - - # Assemble in function - strQ = "" - - if self.parm_do_sample == "Y": - strQ += dctSnippetTemplate["strTemplate01_sampling"] - else: - strQ += dctSnippetTemplate["strTemplate01_else"] - - strQ += dctSnippetTemplate["strTemplate01_5"] - - if self.col_gen_type == "X": - strQ += dctSnippetTemplate["strTemplate02_X"] - else: - strQ += dctSnippetTemplate["strTemplate02_else"] - - if self.col_gen_type in ["A", "D", "N"]: - strQ += dctSnippetTemplate["strTemplate03_ADN"] - else: - strQ += dctSnippetTemplate["strTemplate03_else"] - - if self.col_gen_type == "A": - strQ += dctSnippetTemplate["strTemplate04_A"] - elif self.col_gen_type == "N": - strQ += dctSnippetTemplate["strTemplate04_N"] - else: - strQ += dctSnippetTemplate["strTemplate04_else"] - - if self.col_gen_type == "A": - strQ += dctSnippetTemplate["strTemplate05_A"] - else: - strQ += dctSnippetTemplate["strTemplate05_else"] - - if self.col_gen_type == "A" and self.parm_do_patterns == "Y": - strQ += dctSnippetTemplate["strTemplate06_A_patterns"] + template = self._get_profiling_template() + general_type = column_chars.general_type + + query = "" + query += template["01_sampling" if table_sampling else "01_else"] + query += template["01_all"] + query += template["02_X" if general_type == "X" else "02_else"] + query += template["03_ADN" if general_type in ["A", "D", "N"] else "03_else"] + + if general_type == "A": + query += template["04_A"] + elif general_type == "N": + query += template["04_N"] else: - strQ += dctSnippetTemplate["strTemplate06_else"] - - strQ += dctSnippetTemplate["strTemplate07_else"] - - if self.col_gen_type == "N": - strQ += dctSnippetTemplate["strTemplate08_N"] + query += template["04_else"] + + query += template["05_A" if general_type == "A" else "05_else"] + query += template["06_A" if general_type == "A" else "06_else"] + query += template["08_N" if general_type == "N" else "08_else"] + query += template["10_N_dec" if general_type == "N" and column_chars.is_decimal == True else "10_else"] + query += template["11_D" if general_type == "D" else "11_else"] + query += template["12_B" if general_type == "B" else "12_else"] + query += template["14_A" if general_type == "A" else "14_else"] + query += template["16_all"] + query += template["98_sampling" if table_sampling else "98_else"] + + if general_type == "N": + query += template["99_N_sampling" if table_sampling else "99_N"] else: - strQ += dctSnippetTemplate["strTemplate08_else"] + query += template["99_else"] - if self.col_gen_type == "N" and self.col_is_decimal == True: - strQ += dctSnippetTemplate["strTemplate10_N_dec"] - else: - strQ += dctSnippetTemplate["strTemplate10_else"] + if table_sampling: + query += template["100_sampling"] - if self.col_gen_type == "D": - strQ += dctSnippetTemplate["strTemplate11_D"] - else: - strQ += dctSnippetTemplate["strTemplate11_else"] - if self.col_gen_type == "B": - strQ += dctSnippetTemplate["strTemplate12_B"] - else: - strQ += dctSnippetTemplate["strTemplate12_else"] - - strQ += dctSnippetTemplate["strTemplate13_ALL"] - - if self.col_gen_type == "A": - if self.parm_do_patterns == "Y": - strQ += dctSnippetTemplate["strTemplate14_A_do_patterns"] - else: - strQ += dctSnippetTemplate["strTemplate14_A_no_patterns"] - else: - strQ += dctSnippetTemplate["strTemplate14_else"] - - strQ += dctSnippetTemplate["strTemplate15_ALL"] - - strQ += dctSnippetTemplate["strTemplate16_ALL"] - - 
if self.parm_do_sample == "Y": - strQ += dctSnippetTemplate["strTemplate98_sampling"] - else: - strQ += dctSnippetTemplate["strTemplate98_else"] - - if self.col_gen_type == "N": - if self.parm_do_sample == "Y": - strQ += dctSnippetTemplate["strTemplate99_N_sampling"] - else: - strQ += dctSnippetTemplate["strTemplate99_N"] - else: - strQ += dctSnippetTemplate["strTemplate99_else"] - - if self.parm_do_sample == "Y": - strQ += dctSnippetTemplate["strTemplate100_sampling"] - - params = self._get_params() - query = replace_params(strQ, params) + params = self._get_params(column_chars, table_sampling) + query = replace_params(query, params) query = replace_templated_functions(query, self.flavor) return query, params - - def GetSecondProfilingQuery(self) -> tuple[str, dict]: - # Runs on Target database - return self._get_query(f"project_secondary_profiling_query_{self.flavor}.sql", f"flavors/{self.flavor}/profiling") - - def GetTableSampleCount(self) -> tuple[str, dict]: - # Runs on Target database - return self._get_query(f"project_get_table_sample_count_{self.flavor}.sql", f"flavors/{self.flavor}/profiling") - - def GetContingencyColumns(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("contingency_columns.sql") - - def GetContingencyCounts(self) -> tuple[str, dict]: + + def get_profiling_errors(self, column_errors: list[tuple[ColumnChars, str]]) -> list[list[str | UUID | int]]: + return [ + [ + self.table_group.project_code, + self.connection.connection_id, + self.table_group.id, + self.table_group.table_group_schema, + self.profiling_run.id, + self.profiling_run.profiling_starttime, + column_chars.table_name, + column_chars.column_name.replace("'", "''"), + column_chars.ordinal_position, + column_chars.column_type, + "X", + column_chars.db_data_type, + column_chars.record_ct, + error[:self.max_error_length], + ] for column_chars, error in column_errors + ] + + def run_frequency_analysis(self, column_chars: ColumnChars, table_sampling: TableSampling | None = None) -> tuple[str, dict]: # Runs on Target database - return self._get_query("contingency_counts.sql", "flavors/generic/profiling") - - def UpdateProfileResultsToEst(self) -> tuple[str, dict]: + return self._get_query( + "project_secondary_profiling_query.sql", + f"flavors/{self.flavor}/profiling", + extra_params={"do_sample_bool": table_sampling is not None}, + column_chars=column_chars, + table_sampling=table_sampling, + ) + + def update_sampled_profiling_results(self, table_sampling: TableSampling) -> tuple[str, dict]: # Runs on App database - return self._get_query("project_update_profile_results_to_estimates.sql") + return self._get_query("project_update_profile_results_to_estimates.sql", table_sampling=table_sampling) diff --git a/testgen/commands/queries/refresh_data_chars_query.py b/testgen/commands/queries/refresh_data_chars_query.py index d6a0359..325c61e 100644 --- a/testgen/commands/queries/refresh_data_chars_query.py +++ b/testgen/commands/queries/refresh_data_chars_query.py @@ -1,99 +1,154 @@ +import dataclasses +from collections.abc import Iterable + from testgen.common import read_template_sql_file from testgen.common.database.database_service import get_flavor_service, replace_params -from testgen.common.database.flavor.flavor_service import SQLFlavor +from testgen.common.models.connection import Connection +from testgen.common.models.table_group import TableGroup from testgen.utils import chunk_queries -class CRefreshDataCharsSQL: - run_date: str - source_table: str - - project_code: str - 
sql_flavor: SQLFlavor - table_group_schema: str - table_group_id: str +@dataclasses.dataclass +class ColumnChars: + schema_name: str + table_name: str + column_name: str + ordinal_position: int = None + general_type: str = None + column_type: str = None + db_data_type: str = None + is_decimal: bool = False + approx_record_ct: int = None + record_ct: int = None - max_query_chars: int - profiling_table_set: str - profiling_include_mask: str - profiling_exclude_mask: str - def __init__(self, params: dict, run_date: str, source_table: str): - self.run_date = run_date - self.source_table = source_table +class RefreshDataCharsSQL: - self.project_code = params["project_code"] - self.sql_flavor = params["sql_flavor"] - self.table_group_schema = params["table_group_schema"] - self.table_group_id = params["table_groups_id"] + staging_table = "stg_data_chars_updates" + staging_columns = ( + "table_groups_id", + "run_date", + "schema_name", + "table_name", + "column_name", + "position", + "general_type", + "column_type", + "db_data_type", + "approx_record_ct", + "record_ct", + ) - self.max_query_chars = params["max_query_chars"] - self.profiling_table_set = params["profiling_table_set"] - self.profiling_include_mask = params["profiling_include_mask"] - self.profiling_exclude_mask = params["profiling_exclude_mask"] + def __init__(self, connection: Connection, table_group: TableGroup): + self.connection = connection + self.table_group = table_group + self.flavor = connection.sql_flavor + self.flavor_service = get_flavor_service(self.flavor) - def _get_query(self, template_file_name: str, sub_directory: str | None = "data_chars") -> tuple[str, dict]: + def _get_query( + self, + template_file_name: str, + sub_directory: str | None = "data_chars", + extra_params: dict | None = None, + ) -> tuple[str, dict]: query = read_template_sql_file(template_file_name, sub_directory) params = { - "PROJECT_CODE": self.project_code, - "DATA_SCHEMA": self.table_group_schema, - "TABLE_GROUPS_ID": self.table_group_id, - "RUN_DATE": self.run_date, - "SOURCE_TABLE": self.source_table, + "DATA_SCHEMA": self.table_group.table_group_schema, + "TABLE_GROUPS_ID": self.table_group.id, } + if extra_params: + params.update(extra_params) query = replace_params(query, params) return query, params def _get_table_criteria(self) -> str: table_criteria = "" - flavor_service = get_flavor_service(self.sql_flavor) - - if self.profiling_table_set: - table_criteria += f" AND c.{flavor_service.ddf_table_ref} IN ({self.profiling_table_set})" + ddf_table_ref = self.flavor_service.ddf_table_ref + escaped_underscore = self.flavor_service.escaped_underscore + escape_clause = self.flavor_service.escape_clause + + if self.table_group.profiling_table_set: + quoted_table_names = ",".join( + [f"'{item.strip()}'" for item in self.table_group.profiling_table_set.split(",")] + ) + table_criteria += f" AND c.{ddf_table_ref} IN ({quoted_table_names})" - if self.profiling_include_mask: + if self.table_group.profiling_include_mask: include_table_names = [ - item.strip().replace("_", flavor_service.escaped_underscore) - for item in self.profiling_include_mask.split(",") + item.strip().replace("_", escaped_underscore) + for item in self.table_group.profiling_include_mask.split(",") ] table_criteria += f""" AND ( - {" OR ".join([ f"(c.{flavor_service.ddf_table_ref} LIKE '{item}' {flavor_service.escape_clause})" for item in include_table_names ])} + {" OR ".join([ f"(c.{ddf_table_ref} LIKE '{item}' {escape_clause})" for item in include_table_names ])} ) """ - if 
self.profiling_exclude_mask: + if self.table_group.profiling_exclude_mask: exclude_table_names = [ - item.strip().replace("_", flavor_service.escaped_underscore) - for item in self.profiling_exclude_mask.split(",") + item.strip().replace("_", escaped_underscore) + for item in self.table_group.profiling_exclude_mask.split(",") ] table_criteria += f""" AND NOT ( - {" OR ".join([ f"(c.{flavor_service.ddf_table_ref} LIKE '{item}' {flavor_service.escape_clause})" for item in exclude_table_names ])} + {" OR ".join([ f"(c.{ddf_table_ref} LIKE '{item}' {escape_clause})" for item in exclude_table_names ])} ) """ return table_criteria - def GetDDFQuery(self) -> tuple[str, dict]: + def get_schema_ddf(self) -> tuple[str, dict]: # Runs on Target database - query, params = self._get_query(f"schema_ddf_query_{self.sql_flavor}.sql", f"flavors/{self.sql_flavor}/data_chars") - query = query.replace("{TABLE_CRITERIA}", self._get_table_criteria()) - return query, params + return self._get_query( + "get_schema_ddf.sql", + f"flavors/{self.flavor}/data_chars", + extra_params={"TABLE_CRITERIA": self._get_table_criteria()}, + ) - def GetRecordCountQueries(self, schema_tables: list[str]) -> list[tuple[str, None]]: + def get_row_counts(self, table_names: Iterable[str]) -> list[tuple[str, None]]: # Runs on Target database + schema = self.table_group.table_group_schema + quote = self.flavor_service.quote_character count_queries = [ - f"SELECT '{item}', COUNT(*) FROM {item}" - for item in schema_tables + f"SELECT '{table}', COUNT(*) FROM {quote}{schema}{quote}.{quote}{table}{quote}" + for table in table_names ] - chunked_queries = chunk_queries(count_queries, " UNION ALL ", self.max_query_chars) + chunked_queries = chunk_queries(count_queries, " UNION ALL ", self.connection.max_query_chars) return [ (query, None) for query in chunked_queries ] - def GetDataCharsUpdateQuery(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("data_chars_update.sql") + def verify_access(self, table_name: str) -> tuple[str, None]: + # Runs on Target database + schema = self.table_group.table_group_schema + quote = self.flavor_service.quote_character + query = ( + f"SELECT 1 FROM {quote}{schema}{quote}.{quote}{table_name}{quote} LIMIT 1" + if not self.flavor_service.use_top + else f"SELECT TOP 1 * FROM {quote}{schema}{quote}.{quote}{table_name}{quote}" + ) + return (query, None) - def GetStagingDeleteQuery(self) -> tuple[str, dict]: + def get_staging_data_chars(self, data_chars: list[ColumnChars], run_date: str) -> list[list[str | bool | int]]: + return [ + [ + self.table_group.id, + run_date, + column.schema_name, + column.table_name, + column.column_name, + column.ordinal_position, + column.general_type, + column.column_type, + column.db_data_type, + column.approx_record_ct, + column.record_ct, + ] + for column in data_chars + ] + + def update_data_chars(self, run_date: str) -> list[tuple[str, dict]]: # Runs on App database - return self._get_query("data_chars_staging_delete.sql") + params = {"RUN_DATE": run_date} + return [ + self._get_query("data_chars_update.sql", extra_params=params), + self._get_query("data_chars_staging_delete.sql", extra_params=params), + ] diff --git a/testgen/commands/queries/rollup_scores_query.py b/testgen/commands/queries/rollup_scores_query.py index dde0d55..7255ec6 100644 --- a/testgen/commands/queries/rollup_scores_query.py +++ b/testgen/commands/queries/rollup_scores_query.py @@ -4,7 +4,7 @@ from testgen.common.database.database_service import replace_params -class 
CRollupScoresSQL: +class RollupScoresSQL: run_id: str table_group_id: str @@ -21,13 +21,12 @@ def _get_query(self, template_file_name: str, sub_directory: str | None = "rollu query = replace_params(query, params) return query, params - def GetRollupScoresProfileRunQuery(self) -> tuple[str, dict]: + def rollup_profiling_scores(self) -> list[tuple[str, dict]]: # Runs on App database - return self._get_query("rollup_scores_profile_run.sql") - - def GetRollupScoresProfileTableGroupQuery(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("rollup_scores_profile_table_group.sql") + return [ + self._get_query("rollup_scores_profile_run.sql"), + self._get_query("rollup_scores_profile_table_group.sql"), + ] def GetRollupScoresTestRunQuery(self) -> tuple[str, dict]: # Runs on App database diff --git a/testgen/commands/run_execute_cat_tests.py b/testgen/commands/run_execute_cat_tests.py index 15d30a1..0f6935a 100644 --- a/testgen/commands/run_execute_cat_tests.py +++ b/testgen/commands/run_execute_cat_tests.py @@ -113,8 +113,8 @@ def run_cat_test_queries( if lstCATQueries: LOG.info("CurrentStep: Performing CAT Tests") - lstAllResults, lstResultColumnNames, intErrors = fetch_from_db_threaded( - lstCATQueries, use_target_db=True, max_threads=params["max_threads"], spinner=spinner + lstAllResults, lstResultColumnNames, errors = fetch_from_db_threaded( + lstCATQueries, use_target_db=True, max_threads=params["max_threads"], ) if lstAllResults: @@ -125,9 +125,9 @@ def run_cat_test_queries( # Parses aggregate results to individual test_result records at dk db execute_db_queries([clsCATExecute.GetCATResultsParseSQL()]) LOG.info("Test results successfully parsed.") - if intErrors > 0: + if errors: has_errors = True - cat_error_msg = f"Errors were encountered executing aggregate tests. ({intErrors} errors occurred.) Please check log." + cat_error_msg = f"Errors were encountered executing aggregate tests. ({len(errors)} errors occurred.) Please check log." 
LOG.warning(cat_error_msg) clsCATExecute.exception_message += cat_error_msg else: diff --git a/testgen/commands/run_execute_tests.py b/testgen/commands/run_execute_tests.py index af9dd9f..9a16bb2 100644 --- a/testgen/commands/run_execute_tests.py +++ b/testgen/commands/run_execute_tests.py @@ -21,10 +21,11 @@ from testgen.common.get_pipeline_parms import TestExecutionParams from testgen.common.models import with_database_session from testgen.common.models.connection import Connection +from testgen.common.models.table_group import TableGroup from testgen.ui.session import session from .run_execute_cat_tests import run_cat_test_queries -from .run_refresh_data_chars import run_refresh_data_chars_queries +from .run_refresh_data_chars import run_data_chars_refresh from .run_test_parameter_validation import run_parameter_validation_queries LOG = logging.getLogger("testgen") @@ -90,18 +91,18 @@ def run_test_queries( # Execute list, returning test results LOG.info("CurrentStep: Executing Non-CAT Test Queries") - lstTestResults, colResultNames, intErrors = fetch_from_db_threaded( - lstTestQueries, use_target_db=True, max_threads=params["max_threads"], spinner=spinner + lstTestResults, colResultNames, errors = fetch_from_db_threaded( + lstTestQueries, use_target_db=True, max_threads=params["max_threads"], ) # Copy test results to DK DB LOG.info("CurrentStep: Saving Non-CAT Test Results") if lstTestResults: write_to_app_db(lstTestResults, colResultNames, "test_results") - if intErrors > 0: + if errors: has_errors = True error_msg = ( - f"Errors were encountered executing Referential Tests. ({intErrors} errors occurred.) " + f"Errors were encountered executing Referential Tests. ({len(errors)} errors occurred.) " "Please check log. " ) LOG.warning(error_msg) @@ -166,15 +167,15 @@ def run_execution_steps( ) LOG.info("CurrentStep: Assigning Connection Parameters") - connection = Connection.get_by_table_group(test_exec_params["table_groups_id"]) + table_group = TableGroup.get(test_exec_params["table_groups_id"]) + connection = Connection.get(table_group.connection_id) set_target_db_params(connection.__dict__) test_exec_params["sql_flavor"] = connection.sql_flavor test_exec_params["max_query_chars"] = connection.max_query_chars test_exec_params["max_threads"] = connection.max_threads try: - LOG.info("CurrentStep: Execute Step - Data Characteristics Refresh") - run_refresh_data_chars_queries(test_exec_params, test_time, spinner) + run_data_chars_refresh(connection, table_group, test_time) except Exception: LOG.warning("Data Characteristics Refresh failed", exc_info=True, stack_info=True) pass diff --git a/testgen/commands/run_launch_db_config.py b/testgen/commands/run_launch_db_config.py index f65a80e..65ae07d 100644 --- a/testgen/commands/run_launch_db_config.py +++ b/testgen/commands/run_launch_db_config.py @@ -86,7 +86,6 @@ def run_launch_db_config(delete_db: bool, drop_users_and_roles: bool = True) -> user_override=params_mapping["TESTGEN_ADMIN_USER"], password_override=params_mapping["TESTGEN_ADMIN_PASSWORD"], user_type="schema_admin", - suppress_logs=True, ) import_metadata_records_from_yaml(params_mapping) diff --git a/testgen/commands/run_pairwise_contingency_check.py b/testgen/commands/run_pairwise_contingency_check.py new file mode 100644 index 0000000..9df9840 --- /dev/null +++ b/testgen/commands/run_pairwise_contingency_check.py @@ -0,0 +1,147 @@ +# UNUSED CODE - TO BE REVIVED LATER + +from uuid import UUID + +import pandas as pd + +from testgen.commands.queries.contingency_query import 
ContingencySQL +from testgen.commands.queries.profiling_query import ContingencyTable +from testgen.common.database.database_service import fetch_dict_from_db, write_to_app_db + + +def run_pairwise_contingency_check(profiling_run_id: UUID, threshold_ratio: float) -> None: + # Goal: identify pairs of values that represent IF X=A THEN Y=B rules + + threshold_ratio = threshold_ratio / 100.0 if threshold_ratio else 0.95 + + sql_generator = ContingencySQL() + table_columns = fetch_dict_from_db(*sql_generator.get_contingency_columns(profiling_run_id)) + + if not table_columns: + return + + table_columns = [ContingencyTable(item) for item in table_columns] + df_merged = None + for table in table_columns: + counts = fetch_dict_from_db( + *sql_generator.get_contingency_counts(table), + use_target_db=True, + ) + if counts: + df = pd.DataFrame(counts) + columns = table.contingency_columns.lower().split(",") + overall_counts = {col: df.groupby(col)["freq_ct"].sum() for col in columns} + + contingency_table = [] + for i, col1 in enumerate(columns): + for col2 in columns[i + 1 :]: + # Create a pivot table for each pair + pivot = df.pivot_table(index=col1, columns=col2, values="freq_ct", aggfunc="sum", fill_value=0) + pivot = pivot.stack().reset_index() + pivot.rename(columns={0: "pair_count"}, inplace=True) + + pivot["first_column_overall_count"] = pivot[col1].map(overall_counts[col1]) + pivot["second_column_overall_count"] = pivot[col2].map(overall_counts[col2]) + + pivot["first_column_name"] = col1 + pivot["second_column_name"] = col2 + + contingency_table.append(pivot) + + # Combine all pairs into a single DataFrame + contingency_table = pd.concat(contingency_table, ignore_index=True) + + contingency_table["pair_to_first_ratio"] = ( + contingency_table["pair_count"] / contingency_table["first_column_overall_count"] + ) + contingency_table["pair_to_second_ratio"] = ( + contingency_table["pair_count"] / contingency_table["second_column_overall_count"] + ) + + # Include rows where both cols meet minimum threshold count (max of 30 or 5%) + total_observations = contingency_table["pair_count"].sum() + threshold_min = max(total_observations * 0.05, 30) + contingency_table = contingency_table[ + (contingency_table["first_column_overall_count"] >= threshold_min) + & (contingency_table["second_column_overall_count"] >= threshold_min) + ] + # Drop rows where neither ratio meets the threshold ratio (keep if either meets it) + # -- note we still have to check individual columns when saving pairs + contingency_table = contingency_table[ + ~( + (contingency_table["pair_to_first_ratio"] < threshold_ratio) + & (contingency_table["pair_to_second_ratio"] < threshold_ratio) + ) + ] + + contingency_table["profiling_run_id"] = profiling_run_id + contingency_table["schema_name"] = table.schema_name + contingency_table["table_name"] = table.table_name + + if df_merged is None: + df_merged = contingency_table + else: + df_merged = pd.concat([df_merged, contingency_table], ignore_index=True) + + save_contingency_rules(df_merged, threshold_ratio) + + +def save_contingency_rules(df: pd.DataFrame, threshold_ratio: float) -> None: + if df is None or df.empty: + return + + contingency_rules = [] + for row in df.itertuples(): + # First causes second: almost all of first coincide with second value + if row.pair_to_first_ratio >= threshold_ratio: + contingency_rules.append( + [ + row.profiling_run_id, + row.schema_name, + row.table_name, + row.first_column_name, + getattr(row, row.first_column_name), + row.second_column_name, + 
getattr(row, row.second_column_name), + row.pair_count, + row.first_column_overall_count, + row.second_column_overall_count, + row.pair_to_first_ratio, + ] + ) + + # Second causes first: almost all of second coincide with first value + if row.pair_to_second_ratio >= threshold_ratio: + contingency_rules.append( + [ + row.profiling_run_id, + row.schema_name, + row.table_name, + row.second_column_name, + getattr(row, row.second_column_name), + row.first_column_name, + getattr(row, row.first_column_name), + row.pair_count, + row.second_column_overall_count, + row.first_column_overall_count, + row.pair_to_second_ratio, + ] + ) + + write_to_app_db( + contingency_rules, + [ + "profile_run_id", + "schema_name", + "table_name", + "cause_column_name", + "cause_column_value", + "effect_column_name", + "effect_column_value", + "pair_count", + "cause_column_total", + "effect_column_total", + "rule_ratio", + ], + "profile_pair_rules", + ) diff --git a/testgen/commands/run_profiling.py b/testgen/commands/run_profiling.py new file mode 100644 index 0000000..344e437 --- /dev/null +++ b/testgen/commands/run_profiling.py @@ -0,0 +1,317 @@ +import logging +import subprocess +import threading +from datetime import UTC, datetime + +import testgen.common.process_service as process_service +from testgen import settings +from testgen.commands.queries.profiling_query import HygieneIssueType, ProfilingSQL, TableSampling +from testgen.commands.queries.refresh_data_chars_query import ColumnChars +from testgen.commands.queries.rollup_scores_query import RollupScoresSQL +from testgen.commands.run_execute_tests import run_execution_steps_in_background +from testgen.commands.run_generate_tests import run_test_gen_queries +from testgen.commands.run_refresh_data_chars import run_data_chars_refresh +from testgen.commands.run_refresh_score_cards_results import run_refresh_score_cards_results +from testgen.common import ( + date_service, + execute_db_queries, + fetch_dict_from_db, + fetch_from_db_threaded, + set_target_db_params, + write_to_app_db, +) +from testgen.common.database.database_service import ThreadedProgress, empty_cache +from testgen.common.mixpanel_service import MixpanelService +from testgen.common.models import with_database_session +from testgen.common.models.connection import Connection +from testgen.common.models.profiling_run import ProfilingRun +from testgen.common.models.table_group import TableGroup +from testgen.common.models.test_suite import TestSuite +from testgen.ui.session import session +from testgen.utils import get_exception_message + +LOG = logging.getLogger("testgen") + + +def run_profiling_in_background(table_group_id): + msg = f"Triggering profiling run for table group {table_group_id}" + if settings.IS_DEBUG: + LOG.info(msg + ". 
Running in debug mode (new thread instead of new process).") + empty_cache() + background_thread = threading.Thread( + target=run_profiling, + args=(table_group_id, session.auth.user_display if session.auth else None), + ) + background_thread.start() + else: + LOG.info(msg) + script = ["testgen", "run-profile", "-tg", str(table_group_id)] + subprocess.Popen(script) # NOQA S603 + + +@with_database_session +def run_profiling(table_group_id: str, username: str | None = None, minutes_offset: int = 0): + if table_group_id is None: + raise ValueError("Table Group ID was not specified") + + LOG.info(f"Starting profiling run for table group {table_group_id}") + + LOG.info("Retrieving connection and table group parameters") + table_group = TableGroup.get(table_group_id) + connection = Connection.get(table_group.connection_id) + set_target_db_params(connection.__dict__) + + LOG.info("Creating profiling run record") + profiling_run = ProfilingRun( + project_code=table_group.project_code, + connection_id=connection.connection_id, + table_groups_id=table_group.id, + profiling_starttime=date_service.get_now_as_string_with_offset(minutes_offset), + process_id=process_service.get_current_process_id(), + ) + profiling_run.init_progress() + profiling_run.set_progress("data_chars", "Running") + profiling_run.save() + + LOG.info(f"Profiling run: {profiling_run.id}, Connection: {connection.connection_name}, Table group: {table_group.table_groups_name}") + try: + data_chars = run_data_chars_refresh(connection, table_group, profiling_run.profiling_starttime) + distinct_tables = {(column.table_name, column.record_ct) for column in data_chars} + + profiling_run.set_progress("data_chars", "Completed") + profiling_run.table_ct = len(distinct_tables) + profiling_run.column_ct = len(data_chars) + profiling_run.record_ct = sum(table[1] for table in distinct_tables) + profiling_run.data_point_ct = sum(column.record_ct for column in data_chars) + + if data_chars: + sql_generator = ProfilingSQL(connection, table_group, profiling_run, minutes_offset=minutes_offset) + + _run_column_profiling(sql_generator, data_chars) + _run_frequency_analysis(sql_generator) + _run_hygiene_issue_detection(sql_generator) + + # if table_group.profile_do_pair_rules == "Y": + # LOG.info("Compiling pairwise contingency rules") + # run_pairwise_contingency_check(profiling_run.id, table_group.profile_pair_rule_pct) + else: + LOG.info("No columns were selected to profile.") + except Exception as e: + LOG.exception("Profiling encountered an error.") + LOG.info("Updating profiling run record") + profiling_run.log_message = get_exception_message(e) + profiling_run.profiling_endtime = date_service.get_now_as_string_with_offset(minutes_offset) + profiling_run.status = "Error" + profiling_run.save() + else: + LOG.info("Updating profiling run record") + profiling_run.profiling_endtime = date_service.get_now_as_string_with_offset(minutes_offset) + profiling_run.status = "Complete" + profiling_run.save() + + LOG.info("Rolling up profiling scores") + execute_db_queries( + RollupScoresSQL(profiling_run.id, table_group.id).rollup_profiling_scores(), + ) + run_refresh_score_cards_results( + project_code=table_group.project_code, + add_history_entry=True, + refresh_date=date_service.parse_now(profiling_run.profiling_starttime), + ) + + if bool(table_group.monitor_test_suite_id) and not table_group.last_complete_profile_run_id: + _generate_monitor_tests(table_group.project_code, table_group_id, table_group.monitor_test_suite_id) + finally: + if not 
minutes_offset: + end_time = date_service.parse_now(profiling_run.profiling_endtime) + MixpanelService().send_event( + "run-profiling", + source=settings.ANALYTICS_JOB_SOURCE, + username=username, + sql_flavor=connection.sql_flavor_code, + sampling=table_group.profile_use_sampling, + table_count=profiling_run.table_ct or 0, + column_count=profiling_run.column_ct or 0, + run_duration=(end_time - date_service.parse_now(profiling_run.profiling_starttime)).total_seconds(), + scoring_duration=(datetime.now(UTC) - end_time).total_seconds(), + ) + + return f""" + {"Profiling encountered an error. Check log for details." if profiling_run.status == "Error" else "Profiling completed."} + Run ID: {profiling_run.id} + """ + + +def _run_column_profiling(sql_generator: ProfilingSQL, data_chars: list[ColumnChars]) -> None: + profiling_run = sql_generator.profiling_run + profiling_run.set_progress("col_profiling", "Running") + profiling_run.save() + + LOG.info("Running column profiling queries") + table_group = sql_generator.table_group + sampling_params: dict[str, TableSampling] = {} + sample_percent = ( + float(table_group.profile_sample_percent) + if str(table_group.profile_sample_percent).replace(".", "", 1).isdigit() + else 30 + ) + if table_group.profile_use_sampling and 0 < sample_percent < 100: + min_sample = table_group.profile_sample_min_count + max_sample = 999000 + for column in data_chars: + if not sampling_params.get(column.table_name) and column.record_ct > min_sample: + calc_sample = round(sample_percent * column.record_ct / 100) + sample_count = min(max(calc_sample, min_sample), max_sample) + + sampling_params[column.table_name] = TableSampling( + table_name=column.table_name, + sample_count=sample_count, + sample_ratio=column.record_ct / sample_count, + sample_percent=round(100 * sample_count / column.record_ct, 4), + ) + + def update_column_progress(progress: ThreadedProgress) -> None: + profiling_run.set_progress( + "col_profiling", + "Running", + detail=f"{progress['processed']} of {progress['total']}", + error=f"{progress['errors']} column{'s' if progress['errors'] > 1 else ''} had errors" + if progress["errors"] + else None, + ) + profiling_run.save() + + profiling_results, result_columns, error_data = fetch_from_db_threaded( + [sql_generator.run_column_profiling(column, sampling_params.get(column.table_name)) for column in data_chars], + use_target_db=True, + max_threads=sql_generator.connection.max_threads, + progress_callback=update_column_progress, + ) + + if error_count := len(error_data): + LOG.warning(f"Errors running column profiling queries: {error_count}") + LOG.info("Writing column profiling errors") + error_results = sql_generator.get_profiling_errors( + [(data_chars[index], error) for index, error in error_data.items()] + ) + write_to_app_db(error_results, sql_generator.error_columns, sql_generator.profiling_results_table) + + if not profiling_results: # All queries failed, so stop the process + raise RuntimeError(f"{error_count} errors during column profiling. See details in results.") + + LOG.info("Writing column profiling results") + write_to_app_db(profiling_results, result_columns, sql_generator.profiling_results_table) + + if sampling_params: + try: + LOG.info("Updating sampled profiling results") + execute_db_queries( + [ + sql_generator.update_sampled_profiling_results(table_sampling) + for table_sampling in sampling_params.values() + ] + ) + except Exception as e: + raise RuntimeError(f"Error updating sampled profiling results. 
{get_exception_message(e)}") from e + + profiling_run.set_progress( + "col_profiling", + "Warning" if error_count else "Completed", + error=f"{error_count} column{'s' if error_count > 1 else ''} had errors. See details in results." + if error_count + else None, + ) + + +def _run_frequency_analysis(sql_generator: ProfilingSQL) -> None: + profiling_run = sql_generator.profiling_run + profiling_run.set_progress("freq_analysis", "Running") + profiling_run.save() + + error_data = None + try: + LOG.info("Selecting columns for frequency analysis") + frequency_columns = fetch_dict_from_db(*sql_generator.get_frequency_analysis_columns()) + + if frequency_columns: + LOG.info("Running frequency analysis queries") + + def update_frequency_progress(progress: ThreadedProgress) -> None: + profiling_run.set_progress( + "freq_analysis", "Running", detail=f"{progress['processed']} of {progress['total']}" + ) + profiling_run.save() + + frequency_results, result_columns, error_data = fetch_from_db_threaded( + [sql_generator.run_frequency_analysis(ColumnChars(**column)) for column in frequency_columns], + use_target_db=True, + max_threads=sql_generator.connection.max_threads, + progress_callback=update_frequency_progress, + ) + if error_data: + LOG.warning(f"Errors running frequency analysis queries: {len(error_data)}") + + if frequency_results: + LOG.info("Writing frequency results to staging") + write_to_app_db(frequency_results, result_columns, sql_generator.frequency_staging_table) + + LOG.info("Updating profiling results with frequency analysis and deleting staging") + execute_db_queries(sql_generator.update_frequency_analysis_results()) + except Exception as e: + profiling_run.set_progress("freq_analysis", "Warning", error=f"Error encountered. {get_exception_message(e)}") + else: + if error_data: + profiling_run.set_progress( + "freq_analysis", "Warning", error=f"Error encountered. {next(iter(error_data.values()))}" + ) + else: + profiling_run.set_progress("freq_analysis", "Completed") + + +def _run_hygiene_issue_detection(sql_generator: ProfilingSQL) -> None: + profiling_run = sql_generator.profiling_run + profiling_run.set_progress("hygiene_issues", "Running") + profiling_run.save() + + try: + LOG.info("Detecting functional data types and critical data elements") + execute_db_queries(sql_generator.update_profiling_results()) + + LOG.info("Retrieving hygiene issue types") + hygiene_issue_types = fetch_dict_from_db(*sql_generator.get_hygiene_issue_types()) + hygiene_issue_types = [HygieneIssueType(**item) for item in hygiene_issue_types] + + LOG.info("Detecting hygiene issues and updating prevalence and counts") + execute_db_queries( + [ + *[ + query + for issue_type in hygiene_issue_types + if (query := sql_generator.detect_hygiene_issue(issue_type)) + ], + *[ + sql_generator.update_hygiene_issue_prevalence(issue_type) + for issue_type in hygiene_issue_types + if issue_type.dq_score_prevalence_formula + ], + sql_generator.update_hygiene_issue_counts(), + ] + ) + except Exception as e: + profiling_run.set_progress("hygiene_issues", "Warning", error=f"Error encountered. 
{get_exception_message(e)}") + else: + profiling_run.set_progress("hygiene_issues", "Completed") + + +@with_database_session +def _generate_monitor_tests(project_code: str, table_group_id: str, test_suite_id: str) -> None: + try: + monitor_test_suite = TestSuite.get(test_suite_id) + if not monitor_test_suite: + LOG.info("Skipping test generation on missing monitor test suite") + else: + LOG.info("Generating monitor tests") + run_test_gen_queries(table_group_id, monitor_test_suite.test_suite, "Monitor") + run_execution_steps_in_background(project_code, monitor_test_suite.test_suite) + except Exception: + LOG.exception("Error generating monitor tests") diff --git a/testgen/commands/run_profiling_bridge.py b/testgen/commands/run_profiling_bridge.py deleted file mode 100644 index 236985b..0000000 --- a/testgen/commands/run_profiling_bridge.py +++ /dev/null @@ -1,499 +0,0 @@ -import logging -import subprocess -import threading -import uuid -from datetime import UTC, datetime - -import pandas as pd -from progress.spinner import Spinner - -import testgen.common.process_service as process_service -from testgen import settings -from testgen.commands.queries.profiling_query import CProfilingSQL -from testgen.commands.run_execute_tests import run_execution_steps_in_background -from testgen.commands.run_generate_tests import run_test_gen_queries -from testgen.commands.run_refresh_score_cards_results import run_refresh_score_cards_results -from testgen.common import ( - date_service, - execute_db_queries, - fetch_dict_from_db, - fetch_from_db_threaded, - get_profiling_params, - quote_csv_items, - set_target_db_params, - write_to_app_db, -) -from testgen.common.database.database_service import empty_cache, get_flavor_service -from testgen.common.mixpanel_service import MixpanelService -from testgen.common.models import with_database_session -from testgen.common.models.connection import Connection -from testgen.common.models.test_suite import TestSuite -from testgen.ui.session import session - -LOG = logging.getLogger("testgen") - - -def save_contingency_rules(df_merged, threshold_ratio): - # Prep rows to save - lst_rules = [] - for row in df_merged.itertuples(): - # First causes second: almost all of first coincide with second value - if row.pair_to_first_ratio >= threshold_ratio: - profiling_run_id = row.profiling_run_id - schema_name = row.schema_name - table_name = row.table_name - cause_column_name = row.first_column_name - cause_column_value = getattr(row, row.first_column_name) - effect_column_name = row.second_column_name - effect_column_value = getattr(row, row.second_column_name) - pair_count = row.pair_count - cause_column_total = row.first_column_overall_count - effect_column_total = row.second_column_overall_count - rule_ratio = row.pair_to_first_ratio - lst_rules.append( - [ - profiling_run_id, - schema_name, - table_name, - cause_column_name, - cause_column_value, - effect_column_name, - effect_column_value, - pair_count, - cause_column_total, - effect_column_total, - rule_ratio, - ] - ) - - # Second causes first: almost all of second coincide with first value - if row.pair_to_second_ratio >= threshold_ratio: - profiling_run_id = row.profiling_run_id - schema_name = row.schema_name - table_name = row.table_name - cause_column_name = row.second_column_name - cause_column_value = getattr(row, row.second_column_name) - effect_column_name = row.first_column_name - effect_column_value = getattr(row, row.first_column_name) - pair_count = row.pair_count - cause_column_total = 
row.second_column_overall_count - effect_column_total = row.first_column_overall_count - rule_ratio = row.pair_to_second_ratio - lst_rules.append( - [ - profiling_run_id, - schema_name, - table_name, - cause_column_name, - cause_column_value, - effect_column_name, - effect_column_value, - pair_count, - cause_column_total, - effect_column_total, - rule_ratio, - ] - ) - - write_to_app_db( - lst_rules, - [ - "profile_run_id", - "schema_name", - "table_name", - "cause_column_name", - "cause_column_value", - "effect_column_name", - "effect_column_value", - "pair_count", - "cause_column_total", - "effect_column_total", - "rule_ratio", - ], - "profile_pair_rules", - ) - - -def RunPairwiseContingencyCheck(clsProfiling: CProfilingSQL, threshold_ratio: float): - # Goal: identify pairs of values that represent IF X=A THEN Y=B rules - - # Define the threshold percent -- should be high - if threshold_ratio: - threshold_ratio = threshold_ratio / 100.0 - else: - threshold_ratio = 0.95 - str_max_values = "6" - - # Retrieve columns to include in list from profiing results - clsProfiling.contingency_max_values = str_max_values - lst_tables = fetch_dict_from_db(*clsProfiling.GetContingencyColumns()) - - # Retrieve record counts per column combination - df_merged = None - if lst_tables: - for dct_table in lst_tables: - df_merged = None - clsProfiling.data_schema = dct_table["schema_name"] - clsProfiling.data_table = dct_table["table_name"] - clsProfiling.contingency_columns = quote_csv_items(dct_table["contingency_columns"]) - lst_counts = fetch_dict_from_db(*clsProfiling.GetContingencyCounts(), use_target_db=True) - if lst_counts: - df = pd.DataFrame(lst_counts) - # Get list of columns - columns = dct_table["contingency_columns"].lower().split(",") - - # Calculate overall counts for each column - overall_counts = {col: df.groupby(col)["freq_ct"].sum() for col in columns} - - # Prepare to aggregate the data - contingency_table = [] - for i, col1 in enumerate(columns): - for col2 in columns[i + 1 :]: - # Create a pivot table for each pair - pivot = df.pivot_table(index=col1, columns=col2, values="freq_ct", aggfunc="sum", fill_value=0) - pivot = pivot.stack().reset_index() - pivot.rename(columns={0: "pair_count"}, inplace=True) - - # Add overall counts - pivot["first_column_overall_count"] = pivot[col1].map(overall_counts[col1]) - pivot["second_column_overall_count"] = pivot[col2].map(overall_counts[col2]) - - # Add column names - pivot["first_column_name"] = col1 - pivot["second_column_name"] = col2 - - contingency_table.append(pivot) - - # Combine all pairs into a single DataFrame - contingency_table = pd.concat(contingency_table, ignore_index=True) - - # Calculate the ratios - contingency_table["pair_to_first_ratio"] = ( - contingency_table["pair_count"] / contingency_table["first_column_overall_count"] - ) - contingency_table["pair_to_second_ratio"] = ( - contingency_table["pair_count"] / contingency_table["second_column_overall_count"] - ) - - # Include rows where both cols meet minimum threshold count (max of 30 or 5%) - total_observations = contingency_table["pair_count"].sum() - threshold_min = max(total_observations * 0.05, 30) - contingency_table = contingency_table[ - (contingency_table["first_column_overall_count"] >= threshold_min) - & (contingency_table["second_column_overall_count"] >= threshold_min) - ] - # Drop rows where neither ratio meets the threshold ratio (keep if either meets it) - # -- note we still have to check individual columns when saving pairs - contingency_table = 
contingency_table[ - ~( - (contingency_table["pair_to_first_ratio"] < threshold_ratio) - & (contingency_table["pair_to_second_ratio"] < threshold_ratio) - ) - ] - - # Add table name - contingency_table["profiling_run_id"] = clsProfiling.profile_run_id - contingency_table["schema_name"] = dct_table["schema_name"] - contingency_table["table_name"] = dct_table["table_name"] - - # Combine with previous tables - if df_merged == None: - df_merged = contingency_table - else: - df_merged = pd.concat([df_merged, contingency_table], ignore_index=True) - - if df_merged is not None: - if not df_merged.empty: - save_contingency_rules(df_merged, threshold_ratio) - - -def run_profiling_in_background(table_group_id): - msg = f"Starting run_profiling_in_background against table group_id: {table_group_id}" - if settings.IS_DEBUG: - LOG.info(msg + ". Running in debug mode (new thread instead of new process).") - empty_cache() - background_thread = threading.Thread( - target=run_profiling_queries, - args=(table_group_id, session.auth.user_display if session.auth else None), - ) - background_thread.start() - else: - LOG.info(msg) - script = ["testgen", "run-profile", "-tg", str(table_group_id)] - subprocess.Popen(script) # NOQA S603 - - -@with_database_session -def run_profiling_queries(table_group_id: str, username: str | None = None, spinner: Spinner | None = None, minutes_offset: int = 0): - if table_group_id is None: - raise ValueError("Table Group ID was not specified") - - has_errors = False - - # Set Project Connection Parms in common.db_bridgers from retrieved parms - LOG.info("CurrentStep: Assigning Connection Parameters") - connection = Connection.get_by_table_group(table_group_id) - set_target_db_params(connection.__dict__) - - LOG.info("CurrentStep: Retrieving Parameters") - - # Generate UUID for Profile Run ID - profiling_run_id = str(uuid.uuid4()) - - params = get_profiling_params(table_group_id) - needs_monitor_tests_generated = ( - bool(params["monitor_test_suite_id"]) and not params["last_complete_profile_run_id"] - ) - - LOG.info("CurrentStep: Initializing Query Generator") - clsProfiling = CProfilingSQL(params["project_code"], connection.sql_flavor, minutes_offset=minutes_offset) - - # Set General Parms - clsProfiling.table_groups_id = table_group_id - clsProfiling.connection_id = connection.connection_id - clsProfiling.profile_run_id = profiling_run_id - clsProfiling.data_schema = params["table_group_schema"] - clsProfiling.parm_table_set = params["profiling_table_set"] - clsProfiling.parm_table_include_mask = params["profiling_include_mask"] - clsProfiling.parm_table_exclude_mask = params["profiling_exclude_mask"] - clsProfiling.profile_id_column_mask = params["profile_id_column_mask"] - clsProfiling.profile_sk_column_mask = params["profile_sk_column_mask"] - clsProfiling.profile_use_sampling = params["profile_use_sampling"] - clsProfiling.profile_flag_cdes = params["profile_flag_cdes"] - clsProfiling.profile_sample_percent = params["profile_sample_percent"] - clsProfiling.profile_sample_min_count = params["profile_sample_min_count"] - clsProfiling.process_id = process_service.get_current_process_id() - - # Add a record in profiling_runs table for the new profile - execute_db_queries([clsProfiling.GetProfileRunInfoRecordsQuery()]) - if spinner: - spinner.next() - - table_count = 0 - column_count = 0 - try: - # Retrieve Column Metadata - LOG.info("CurrentStep: Getting DDF from project") - - lstResult = fetch_dict_from_db(*clsProfiling.GetDDFQuery(), use_target_db=True) - column_count = 
len(lstResult) - - if lstResult: - flavor_service = get_flavor_service(connection.sql_flavor) - quote = flavor_service.quote_character - - # Get distinct tables - distinct_tables = set() - for item in lstResult: - schema_name = item["table_schema"] - table_name = item["table_name"] - distinct_tables.add(f"{quote}{schema_name}{quote}.{quote}{table_name}{quote}") - - # Convert the set to a list - distinct_tables_list = list(distinct_tables) - table_count = len(distinct_tables_list) - - if clsProfiling.profile_use_sampling == "Y": - # Sampling tables - lstQueries = [] - for parm_sampling_table in distinct_tables_list: - clsProfiling.sampling_table = parm_sampling_table - lstQueries.append(clsProfiling.GetTableSampleCount()) - - lstSampleTables, _, intErrors = fetch_from_db_threaded( - lstQueries, use_target_db=True, max_threads=connection.max_threads, spinner=spinner - ) - dctSampleTables = {x[0]: [x[1], x[2], x[3]] for x in lstSampleTables} - if intErrors > 0: - has_errors = True - LOG.warning( - f"Errors were encountered retrieving sampling table counts. ({intErrors} errors occurred.) Please check log." - ) - - # Assemble profiling queries - if spinner: - spinner.next() - LOG.info("CurrentStep: Assembling profiling queries, round 1") - lstQueries = [] - for dctColumnRecord in lstResult: - # Set Column Parms - clsProfiling.data_schema = dctColumnRecord["table_schema"] - clsProfiling.data_table = dctColumnRecord["table_name"] - clsProfiling.col_name = dctColumnRecord["column_name"] - clsProfiling.col_type = dctColumnRecord["column_type"] - clsProfiling.db_data_type = dctColumnRecord["db_data_type"] - clsProfiling.profile_run_id = profiling_run_id - clsProfiling.col_is_decimal = dctColumnRecord["is_decimal"] - clsProfiling.col_ordinal_position = dctColumnRecord["ordinal_position"] - clsProfiling.col_gen_type = dctColumnRecord["general_type"] - clsProfiling.parm_do_sample = "N" - - if clsProfiling.profile_use_sampling == "Y": - table_identifier = f"{quote}{clsProfiling.data_schema}{quote}.{quote}{clsProfiling.data_table}{quote}" - if dctSampleTables[table_identifier][0] > -1: - clsProfiling.parm_sample_size = dctSampleTables[table_identifier][0] - clsProfiling.sample_ratio = dctSampleTables[table_identifier][1] - clsProfiling.sample_percent_calc = dctSampleTables[table_identifier][2] - clsProfiling.parm_do_sample = clsProfiling.profile_use_sampling - else: - clsProfiling.parm_sample_size = 0 - clsProfiling.sample_ratio = "" - clsProfiling.sample_percent_calc = "" - - lstQueries.append(clsProfiling.GetProfilingQuery()) - - # Run Profiling Queries and save results - LOG.info("CurrentStep: Profiling Round 1") - LOG.debug("Running %s profiling queries", len(lstQueries)) - - lstProfiles, colProfileNames, intErrors = fetch_from_db_threaded( - lstQueries, use_target_db=True, max_threads=connection.max_threads, spinner=spinner - ) - if intErrors > 0: - has_errors = True - LOG.warning( - f"Errors were encountered executing profiling queries. ({intErrors} errors occurred.) Please check log." 
- ) - LOG.info("CurrentStep: Saving Round 1 profiling results to Metadata") - write_to_app_db(lstProfiles, colProfileNames, "profile_results") - - if clsProfiling.profile_use_sampling == "Y": - lstQueries = [] - for table_name, value in dctSampleTables.items(): - if value[0] > -1: - clsProfiling.sampling_table = table_name - clsProfiling.sample_ratio = value[1] - lstQueries.append(clsProfiling.UpdateProfileResultsToEst()) - - execute_db_queries(lstQueries) - - if clsProfiling.parm_do_freqs == "Y": - lstUpdates = [] - # Get secondary profiling columns - LOG.info("CurrentStep: Selecting columns for frequency analysis") - lstResult = fetch_dict_from_db(*clsProfiling.GetSecondProfilingColumnsQuery()) - - if lstResult: - # Assemble secondary profiling queries - # - Freqs for columns not already freq'd, but with max actual value length under threshold - LOG.info("CurrentStep: Generating frequency queries") - lstQueries = [] - for dctColumnRecord in lstResult: - clsProfiling.data_schema = dctColumnRecord["schema_name"] - clsProfiling.data_table = dctColumnRecord["table_name"] - clsProfiling.col_name = dctColumnRecord["column_name"] - - lstQueries.append(clsProfiling.GetSecondProfilingQuery()) - # Run secondary profiling queries - LOG.info("CurrentStep: Retrieving %s frequency results from project", len(lstQueries)) - lstUpdates, colProfileNames, intErrors = fetch_from_db_threaded( - lstQueries, use_target_db=True, max_threads=connection.max_threads, spinner=spinner - ) - if intErrors > 0: - has_errors = True - LOG.warning( - f"Errors were encountered executing frequency queries. ({intErrors} errors occurred.) Please check log." - ) - - if lstUpdates: - # Copy secondary results to DQ staging - LOG.info("CurrentStep: Writing frequency results to Staging") - write_to_app_db(lstUpdates, colProfileNames, "stg_secondary_profile_updates") - - LOG.info("CurrentStep: Generating profiling update queries") - - lstQueries = [] - lstAnomalyTypes = [] - - if lstUpdates: - # Run single update query, then delete from staging - lstQueries.extend([ - clsProfiling.GetSecondProfilingUpdateQuery(), - clsProfiling.GetSecondProfilingStageDeleteQuery(), - ]) - lstQueries.extend([ - clsProfiling.GetDataTypeSuggestionUpdateQuery(), - clsProfiling.GetFunctionalDataTypeUpdateQuery(), - clsProfiling.GetFunctionalTableTypeStageQuery(), - clsProfiling.GetFunctionalTableTypeUpdateQuery(), - clsProfiling.GetPIIFlagUpdateQuery(), - ]) - - lstAnomalyTypes = fetch_dict_from_db(*clsProfiling.GetAnomalyTestTypesQuery()) - lstQueries.extend([ - query for test_type in lstAnomalyTypes if (query := clsProfiling.GetAnomalyTestQuery(test_type)) - ]) - lstQueries.extend([ - clsProfiling.GetAnomalyScoringQuery(test_type) - for test_type in lstAnomalyTypes - if test_type["dq_score_prevalence_formula"] - ]) - lstQueries.append(clsProfiling.GetAnomalyStatsRefreshQuery()) - - # Always runs last - lstQueries.append(clsProfiling.GetDataCharsRefreshQuery()) - if clsProfiling.profile_flag_cdes: - lstQueries.append(clsProfiling.GetCDEFlaggerQuery()) - - LOG.info("CurrentStep: Running profiling update queries") - execute_db_queries(lstQueries) - - if params["profile_do_pair_rules"] == "Y": - LOG.info("CurrentStep: Compiling pairwise contingency rules") - RunPairwiseContingencyCheck(clsProfiling, params["profile_pair_rule_pct"]) - else: - LOG.info("No columns were selected to profile.") - except Exception as e: - has_errors = True - sqlsplit = e.args[0].split("[SQL", 1) - errorline = sqlsplit[0].replace("'", "''") if len(sqlsplit) > 0 else "unknown 
error" - clsProfiling.exception_message = f"{type(e).__name__}: {errorline}" - raise - finally: - LOG.info("Updating the profiling run record") - execute_db_queries([clsProfiling.GetProfileRunInfoRecordUpdateQuery()]) - end_time = datetime.now(UTC) - - execute_db_queries([ - clsProfiling.GetAnomalyScoringRollupRunQuery(), - clsProfiling.GetAnomalyScoringRollupTableGroupQuery(), - ]) - run_refresh_score_cards_results( - project_code=params["project_code"], - add_history_entry=True, - refresh_date=date_service.parse_now(clsProfiling.run_date), - ) - - MixpanelService().send_event( - "run-profiling", - source=settings.ANALYTICS_JOB_SOURCE, - username=username, - sql_flavor=clsProfiling.flavor, - sampling=clsProfiling.profile_use_sampling == "Y", - table_count=table_count, - column_count=column_count, - run_duration=(end_time - date_service.parse_now(clsProfiling.run_date)).total_seconds(), - scoring_duration=(datetime.now(UTC) - end_time).total_seconds(), - ) - - if needs_monitor_tests_generated: - _generate_monitor_tests(params["project_code"], table_group_id, params["monitor_test_suite_id"]) - - return f""" - Profiling completed {"with errors. Check log for details." if has_errors else "successfully."} - Run ID: {profiling_run_id} - """ - - -@with_database_session -def _generate_monitor_tests(project_code: str, table_group_id: str, test_suite_id: str) -> None: - try: - monitor_test_suite = TestSuite.get(test_suite_id) - if not monitor_test_suite: - LOG.info("Skipping test generation on missing monitor test suite") - else: - LOG.info("Generating monitor tests") - run_test_gen_queries(table_group_id, monitor_test_suite.test_suite, "Monitor") - run_execution_steps_in_background(project_code, monitor_test_suite.test_suite) - except Exception: - LOG.exception("Error generating monitor tests") diff --git a/testgen/commands/run_refresh_data_chars.py b/testgen/commands/run_refresh_data_chars.py index 2c81255..9da28de 100644 --- a/testgen/commands/run_refresh_data_chars.py +++ b/testgen/commands/run_refresh_data_chars.py @@ -1,83 +1,57 @@ import logging -from progress.spinner import Spinner - -from testgen.commands.queries.refresh_data_chars_query import CRefreshDataCharsSQL +from testgen.commands.queries.refresh_data_chars_query import ColumnChars, RefreshDataCharsSQL from testgen.common.database.database_service import ( execute_db_queries, fetch_dict_from_db, fetch_from_db_threaded, - get_flavor_service, write_to_app_db, ) -from testgen.common.get_pipeline_parms import TestExecutionParams +from testgen.common.models.connection import Connection +from testgen.common.models.table_group import TableGroup +from testgen.utils import get_exception_message LOG = logging.getLogger("testgen") -STAGING_TABLE = "stg_data_chars_updates" - -def run_refresh_data_chars_queries(params: TestExecutionParams, run_date: str, spinner: Spinner=None): - LOG.info("CurrentStep: Initializing Data Characteristics Refresh") - sql_generator = CRefreshDataCharsSQL(params, run_date, STAGING_TABLE) - flavor_service = get_flavor_service(params["sql_flavor"]) - quote = flavor_service.quote_character - LOG.info("CurrentStep: Getting DDF for table group") - ddf_results = fetch_dict_from_db(*sql_generator.GetDDFQuery(), use_target_db=True) +def run_data_chars_refresh(connection: Connection, table_group: TableGroup, run_date: str) -> list[ColumnChars]: + sql_generator = RefreshDataCharsSQL(connection, table_group) - distinct_tables = { - f"{quote}{item['table_schema']}{quote}.{quote}{item['table_name']}{quote}" - for item in 
ddf_results - } - if distinct_tables: - count_queries = sql_generator.GetRecordCountQueries(distinct_tables) + LOG.info("Getting DDF for table group") + try: + data_chars = fetch_dict_from_db(*sql_generator.get_schema_ddf(), use_target_db=True) + except Exception as e: + raise RuntimeError(f"Error refreshing columns for data catalog. {get_exception_message(e)}") from e + + data_chars = [ColumnChars(**column) for column in data_chars] + if data_chars: + distinct_tables = {column.table_name for column in data_chars} + count_queries = sql_generator.get_row_counts(distinct_tables) - LOG.info("CurrentStep: Getting record counts for table group") - count_results, _, error_count = fetch_from_db_threaded( - count_queries, use_target_db=True, max_threads=params["max_threads"], spinner=spinner + LOG.info("Getting row counts for table group") + count_results, _, error_data = fetch_from_db_threaded( + count_queries, use_target_db=True, max_threads=connection.max_threads, ) - if error_count: - LOG.warning(f"{error_count} errors were encountered while retrieving record counts.") + + count_map = dict(count_results) + for column in data_chars: + column.record_ct = count_map.get(column.table_name) + + write_data_chars(data_chars, sql_generator, run_date) + + if error_data: + raise RuntimeError(f"Error refreshing row counts for data catalog. {next(iter(error_data.values()))}") else: - count_results = [] - LOG.warning("No tables detected in table group. Skipping retrieval of record counts") + LOG.warning("No tables detected in table group") + + return data_chars + - count_map = dict(count_results) - staging_columns = [ - "project_code", - "table_groups_id", - "run_date", - "schema_name", - "table_name", - "column_name", - "position", - "general_type", - "column_type", - "db_data_type", - "record_ct", - ] - staging_records = [ - [ - item["project_code"], - params["table_groups_id"], - run_date, - item["table_schema"], - item["table_name"], - item["column_name"], - item["ordinal_position"], - item["general_type"], - item["column_type"], - item["db_data_type"], - count_map.get(f"{quote}{item['table_schema']}{quote}.{quote}{item['table_name']}{quote}", 0), - ] - for item in ddf_results - ] +def write_data_chars(data_chars: list[ColumnChars], sql_generator: RefreshDataCharsSQL, run_date: str) -> None: + staging_results = sql_generator.get_staging_data_chars(data_chars, run_date) - LOG.info("CurrentStep: Writing data characteristics to staging") - write_to_app_db(staging_records, staging_columns, STAGING_TABLE) + LOG.info("Writing data characteristics to staging") + write_to_app_db(staging_results, sql_generator.staging_columns, sql_generator.staging_table) - LOG.info("CurrentStep: Refreshing data characteristics and deleting staging") - execute_db_queries([ - sql_generator.GetDataCharsUpdateQuery(), - sql_generator.GetStagingDeleteQuery(), - ]) + LOG.info("Refreshing data characteristics and deleting staging") + execute_db_queries(sql_generator.update_data_chars(run_date)) diff --git a/testgen/commands/run_refresh_score_cards_results.py b/testgen/commands/run_refresh_score_cards_results.py index 7d56c6b..5475496 100644 --- a/testgen/commands/run_refresh_score_cards_results.py +++ b/testgen/commands/run_refresh_score_cards_results.py @@ -24,7 +24,6 @@ def run_refresh_score_cards_results( ): start_time = time.time() _refresh_date = refresh_date or datetime.datetime.now(datetime.UTC) - LOG.info("CurrentStep: Initializing scorecards results refresh") try: definitions = [] @@ -33,13 +32,13 @@ def 
run_refresh_score_cards_results( else: definitions.append(ScoreDefinition.get(str(definition_id))) except Exception: - LOG.exception("CurrentStep: Stopping scorecards results refresh after unexpected error") + LOG.exception("Stopping scorecards results refresh after unexpected error") return db_session = get_current_session() for definition in definitions: LOG.info( - "CurrentStep: Refreshing results for scorecard %s in project %s", + "Refreshing results for scorecard %s in project %s", definition.name, definition.project_code, ) @@ -53,8 +52,8 @@ def run_refresh_score_cards_results( definition.results = _score_card_to_results(fresh_score_card) definition.breakdown = _score_definition_to_results_breakdown(definition) if add_history_entry: - LOG.info( - "CurrentStep: Adding history entry for scorecard %s in project %s", + LOG.debug( + "Adding history entry for scorecard %s in project %s", definition.name, definition.project_code, ) @@ -71,14 +70,9 @@ def run_refresh_score_cards_results( definition.history.append(history_entry) history_entry.add_as_cutoff() definition.save() - LOG.info( - "CurrentStep: Done refreshing scorecard %s in project %s", - definition.name, - definition.project_code, - ) except Exception: LOG.exception( - "CurrentStep: Unexpected error refreshing scorecard %s in project %s", + "Error refreshing scorecard %s in project %s", definition.name, definition.project_code, ) @@ -90,7 +84,7 @@ def run_refresh_score_cards_results( scope = f"scorecard {definition_id}" end_time = time.time() - LOG.info("CurrentStep: Refreshing results for %s is over after %s seconds", scope, round(end_time - start_time, 2)) + LOG.info("Refreshing results for %s done after %s seconds", scope, round(end_time - start_time, 2)) def _score_card_to_results(score_card: ScoreCard) -> list[ScoreDefinitionResult]: diff --git a/testgen/commands/run_rollup_scores.py b/testgen/commands/run_rollup_scores.py index e835571..707e50f 100644 --- a/testgen/commands/run_rollup_scores.py +++ b/testgen/commands/run_rollup_scores.py @@ -1,6 +1,6 @@ import logging -from testgen.commands.queries.rollup_scores_query import CRollupScoresSQL +from testgen.commands.queries.rollup_scores_query import RollupScoresSQL from testgen.commands.run_refresh_score_cards_results import run_refresh_score_cards_results from testgen.common.database.database_service import execute_db_queries @@ -9,7 +9,7 @@ def run_profile_rollup_scoring_queries(project_code: str, run_id: str, table_group_id: str | None = None): LOG.info("CurrentStep: Initializing Profiling Scores Rollup") - sql_generator = CRollupScoresSQL(run_id, table_group_id) + sql_generator = RollupScoresSQL(run_id, table_group_id) queries = [sql_generator.GetRollupScoresProfileRunQuery()] if table_group_id: @@ -22,7 +22,7 @@ def run_profile_rollup_scoring_queries(project_code: str, run_id: str, table_gro def run_test_rollup_scoring_queries(project_code: str, run_id: str, table_group_id: str | None = None): LOG.info("CurrentStep: Initializing Testing Scores Rollup") - sql_generator = CRollupScoresSQL(run_id, table_group_id) + sql_generator = RollupScoresSQL(run_id, table_group_id) queries = [sql_generator.GetRollupScoresTestRunQuery()] if table_group_id: diff --git a/testgen/commands/run_upgrade_db_config.py b/testgen/commands/run_upgrade_db_config.py index e144f07..95ec4bc 100644 --- a/testgen/commands/run_upgrade_db_config.py +++ b/testgen/commands/run_upgrade_db_config.py @@ -96,7 +96,6 @@ def _refresh_static_metadata(params_mapping): 
user_override=params_mapping["TESTGEN_ADMIN_USER"], password_override=params_mapping["TESTGEN_ADMIN_PASSWORD"], user_type="schema_admin", - suppress_logs=True, ) import_metadata_records_from_yaml(params_mapping) @@ -107,7 +106,6 @@ def _refresh_static_metadata(params_mapping): user_override=params_mapping["TESTGEN_ADMIN_USER"], password_override=params_mapping["TESTGEN_ADMIN_PASSWORD"], user_type="schema_admin", - suppress_logs=True, ) diff --git a/testgen/common/database/database_service.py b/testgen/common/database/database_service.py index 8adbe7c..b46f4d8 100644 --- a/testgen/common/database/database_service.py +++ b/testgen/common/database/database_service.py @@ -2,15 +2,13 @@ import csv import importlib import logging -import queue as qu -import threading +from collections.abc import Callable, Iterable from contextlib import suppress from dataclasses import dataclass, field from typing import Any, Literal, TypedDict from urllib.parse import quote_plus import psycopg2.sql -from progress.spinner import Spinner from sqlalchemy import create_engine, text from sqlalchemy.engine import LegacyRow, RowMapping from sqlalchemy.engine.base import Connection, Engine @@ -29,6 +27,7 @@ from testgen.common.database import FilteredStringIO from testgen.common.database.flavor.flavor_service import ConnectionParams, FlavorService, SQLFlavor from testgen.common.read_file import get_template_files +from testgen.utils import get_exception_message LOG = logging.getLogger("testgen") @@ -95,7 +94,7 @@ def create_database( drop_existing: bool = False, drop_users_and_roles: bool = False, ) -> None: - LOG.info("DB operation: create_database on App database (User type = database_admin)") + LOG.debug("DB operation: create_database on App database (User type = database_admin)") connection = _init_db_connection( user_override=params["TESTGEN_ADMIN_USER"], @@ -134,19 +133,16 @@ def execute_db_queries( user_override: str | None = None, password_override: str | None = None, user_type: UserType = "normal", - suppress_logs: bool = False, ) -> tuple[list[Any], list[int]]: - LOG.info(f"DB operation: execute_db_queries ({len(queries)}) on {'Target' if use_target_db else 'App'} database (User type = {user_type})") + LOG.debug(f"DB operation: execute_db_queries ({len(queries)}) on {'Target' if use_target_db else 'App'} database (User type = {user_type})") with _init_db_connection(use_target_db, user_override, password_override, user_type) as connection: return_values: list[Any] = [] row_counts: list[int] = [] if not queries: - LOG.info("No queries to process") + LOG.debug("No queries to process") for index, (query, params) in enumerate(queries): - LOG.debug(f"Query: {query}") - if not suppress_logs: - LOG.info(f"Processing {index + 1} of {len(queries)} queries") + LOG.debug(f"Query {index + 1} of {len(queries)}: {query}") transaction = connection.begin() result = connection.execute(text(query), params) row_counts.append(result.rowcount) @@ -166,55 +162,73 @@ def execute_db_queries( return return_values, row_counts +class ThreadedProgress(TypedDict): + processed: int + errors: int + total: int + + def fetch_from_db_threaded( queries: list[tuple[str, dict | None]], use_target_db: bool = False, - max_threads: int | None = None, - spinner: Spinner | None = None, -) -> tuple[list[LegacyRow], list[str], int]: - LOG.info(f"DB operation: fetch_from_db_threaded on {'Target' if use_target_db else 'App'} database (User type = normal)") + max_threads: int = 4, + progress_callback: Callable[[ThreadedProgress], None] | None = None, +) -> 
tuple[list[LegacyRow], list[str], dict[int, str]]: + LOG.debug(f"DB operation: fetch_from_db_threaded ({len(queries)}) on {'Target' if use_target_db else 'App'} database (User type = normal)") - result_data = [] - result_columns: list[str] = [] - error_count = 0 + def fetch_data(query: str, params: dict | None, index: int) -> tuple[list[LegacyRow], list[str], int, str | None]: + LOG.debug(f"Query: {query}") + row_data: list[LegacyRow] = [] + column_names: list[str] = [] + error = None - if not max_threads or max_threads < 1 or max_threads > 10: - max_threads = 4 + try: + with _init_db_connection(use_target_db) as connection: + result = connection.execute(text(query), params) + LOG.debug(f"{result.rowcount} records retrieved") + row_data = result.fetchall() + column_names = list(result.keys()) + except Exception as e: + error = get_exception_message(e) + LOG.exception(f"Failed to execute threaded query: {query}") - queue = qu.Queue() - for item in queries: - queue.put(item) + return row_data, column_names, index, error + + result_data: list[LegacyRow] = [] + result_columns: list[str] = [] + error_data: dict[int, str] = {} - threaded_fetch = _ThreadedFetch(use_target_db, threading.Lock()) + query_count = len(queries) + processed_count = 0 + max_threads = max(1, min(10, max_threads)) with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor: - try: - futures = [] - while not queue.empty(): - query, params = queue.get() - futures.append(executor.submit(threaded_fetch, query, params)) - - for future in futures: - row_data, column_names, has_errors = future.result() - if spinner: - spinner.next() - error_count += 1 if has_errors else 0 - if row_data: - result_data.append(row_data) - result_columns = column_names - - except Exception: - LOG.exception("Failed to execute threaded queries") + futures = [ + executor.submit(fetch_data, query, params, index) + for index, (query, params) in enumerate(queries) + ] + for future in concurrent.futures.as_completed(futures): + row_data, column_names, index, error = future.result() + if row_data: + result_data.append(row_data) + result_columns = column_names + if error: + error_data[index] = error + + processed_count += 1 + if progress_callback: + progress_callback({"processed": processed_count, "errors": len(error_data), "total": query_count}) + LOG.debug(f"Processed {processed_count} of {query_count} threaded queries") # Flatten nested lists result_data = [element for sublist in result_data for element in sublist] - return result_data, result_columns, error_count + return result_data, result_columns, error_data def fetch_list_from_db( query: str, params: dict | None = None, use_target_db: bool = False ) -> tuple[list[LegacyRow], list[str]]: - LOG.info(f"DB operation: fetch_list_from_db on {'Target' if use_target_db else 'App'} database (User type = normal)") + LOG.debug(f"DB operation: fetch_list_from_db on {'Target' if use_target_db else 'App'} database (User type = normal)") with _init_db_connection(use_target_db) as connection: LOG.debug(f"Query: {query}") @@ -229,7 +243,7 @@ def fetch_list_from_db( def fetch_dict_from_db( query: str, params: dict | None = None, use_target_db: bool = False ) -> list[RowMapping]: - LOG.info(f"DB operation: fetch_dict_from_db on {'Target' if use_target_db else 'App'} database (User type = normal)") + LOG.debug(f"DB operation: fetch_dict_from_db on {'Target' if use_target_db else 'App'} database (User type = normal)") with _init_db_connection(use_target_db) as connection: LOG.debug(f"Query: {query}") 
@@ -239,8 +253,8 @@ def fetch_dict_from_db( return [row._mapping for row in result] -def write_to_app_db(data: list[LegacyRow], column_names: list[str], table_name: str) -> None: - LOG.info("DB operation: write_to_app_db on App database (User type = normal)") +def write_to_app_db(data: list[LegacyRow], column_names: Iterable[str], table_name: str) -> None: + LOG.debug("DB operation: write_to_app_db on App database (User type = normal)") # use_raw is required to make use of the copy_expert method for fast batch ingestion connection = _init_db_connection(use_raw=True) @@ -384,37 +398,3 @@ def _init_target_db_connection() -> Connection: ) return connection - - -class _ThreadedFetch: - def __init__(self, use_target_db: bool, count_lock: threading.Lock): - self.use_target_db = use_target_db - self.count_lock = count_lock - self.count = 0 - - def __call__(self, query: str, params: dict | None = None) -> tuple[list[LegacyRow], list[str], bool]: - LOG.debug(f"Query: {query}") - column_names: list[str] = [] - row_data: list = None - has_errors = False - - with self.count_lock: - self.count += 1 - i = self.count - - try: - with _init_db_connection(self.use_target_db) as connection: - try: - result = connection.execute(text(query), params) - LOG.debug(f"{result.rowcount} records retrieved") - row_data = result.fetchall() - if not column_names: - column_names = result.keys() - LOG.info(f"Processed threaded query {i} on thread {threading.current_thread().name}") - except Exception: - LOG.exception(f"Failed to execute threaded query: {query}") - has_errors = True - except Exception as e: - raise ValueError(f"Failed to execute threaded query: {e}") from e - else: - return row_data, list(column_names), has_errors diff --git a/testgen/common/get_pipeline_parms.py b/testgen/common/get_pipeline_parms.py index 3c37aac..e5ced2f 100644 --- a/testgen/common/get_pipeline_parms.py +++ b/testgen/common/get_pipeline_parms.py @@ -8,23 +8,6 @@ class BaseParams(TypedDict): project_code: str connection_id: str -class ProfilingParams(BaseParams): - table_groups_id: str - profiling_table_set: str - profiling_include_mask: str - profiling_exclude_mask: str - profile_id_column_mask: str - profile_sk_column_mask: str - profile_use_sampling: str - profile_flag_cdes: bool - profile_sample_percent: str - profile_sample_min_count: int - profile_do_pair_rules: str - profile_pair_rule_pct: int - monitor_test_suite_id: str | None - last_complete_profile_run_id: str | None - - class TestGenerationParams(BaseParams): export_to_observability: str test_suite_id: str @@ -43,17 +26,6 @@ class TestExecutionParams(BaseParams): max_query_chars: int - -def get_profiling_params(table_group_id: str) -> ProfilingParams: - results = fetch_dict_from_db( - read_template_sql_file("parms_profiling.sql", "parms"), - {"TABLE_GROUP_ID": table_group_id}, - ) - if not results: - raise ValueError("Connection parameters not found for profiling.") - return ProfilingParams(results[0]) - - def get_test_generation_params(table_group_id: str, test_suite: str) -> TestGenerationParams: results = fetch_dict_from_db( read_template_sql_file("parms_test_gen.sql", "parms"), diff --git a/testgen/common/models/profiling_run.py b/testgen/common/models/profiling_run.py index da848f7..713a06e 100644 --- a/testgen/common/models/profiling_run.py +++ b/testgen/common/models/profiling_run.py @@ -1,13 +1,14 @@ from collections.abc import Iterable from dataclasses import dataclass from datetime import UTC, datetime -from typing import Literal, NamedTuple -from uuid import UUID 
+from typing import Literal, NamedTuple, TypedDict +from uuid import UUID, uuid4 import streamlit as st from sqlalchemy import BigInteger, Column, Float, Integer, String, desc, func, select, text, update from sqlalchemy.dialects import postgresql from sqlalchemy.orm import InstrumentedAttribute +from sqlalchemy.orm.attributes import flag_modified from sqlalchemy.sql.expression import case from testgen.common.models import get_current_session @@ -16,7 +17,15 @@ from testgen.utils import is_uuid4 ProfilingRunStatus = Literal["Running", "Complete", "Error", "Cancelled"] +ProgressKey = Literal["data_chars", "col_profiling", "freq_analysis", "hygiene_issues"] +ProgressStatus = Literal["Pending", "Running", "Completed", "Warning"] +class ProgressStep(TypedDict): + key: ProgressKey + status: ProgressStatus + label: str + detail: str + error: str @dataclass class ProfilingRunMinimal(EntityMinimal): @@ -32,16 +41,19 @@ class ProfilingRunMinimal(EntityMinimal): @dataclass class ProfilingRunSummary(EntityMinimal): - profiling_run_id: UUID - start_time: datetime - end_time: datetime + id: UUID + profiling_starttime: datetime + profiling_endtime: datetime table_groups_name: str status: ProfilingRunStatus + progress: list[ProgressStep] process_id: int log_message: str - schema_name: str + table_group_schema: str table_ct: int column_ct: int + record_ct: int + data_point_ct: int anomaly_ct: int anomalies_definite_ct: int anomalies_likely_ct: int @@ -58,16 +70,19 @@ class LatestProfilingRun(NamedTuple): class ProfilingRun(Entity): __tablename__ = "profiling_runs" - id: UUID = Column(postgresql.UUID(as_uuid=True), primary_key=True) + id: UUID = Column(postgresql.UUID(as_uuid=True), primary_key=True, default=uuid4) project_code: str = Column(String, nullable=False) connection_id: str = Column(BigInteger, nullable=False) table_groups_id: UUID = Column(postgresql.UUID(as_uuid=True), nullable=False) profiling_starttime: datetime = Column(postgresql.TIMESTAMP) profiling_endtime: datetime = Column(postgresql.TIMESTAMP) status: ProfilingRunStatus = Column(String, default="Running") + progress: list[ProgressStep] = Column(postgresql.JSONB, default=[]) log_message: str = Column(String) table_ct: int = Column(BigInteger) column_ct: int = Column(BigInteger) + record_ct: int = Column(BigInteger) + data_point_ct: int = Column(BigInteger) anomaly_ct: int = Column(BigInteger) anomaly_table_ct: int = Column(BigInteger) anomaly_column_ct: int = Column(BigInteger) @@ -176,28 +191,32 @@ def select_summary( ) GROUP BY profile_anomaly_results.profile_run_id ) - SELECT v_profiling_runs.profiling_run_id, - v_profiling_runs.start_time, - v_profiling_runs.end_time, - v_profiling_runs.table_groups_name, - v_profiling_runs.status, - v_profiling_runs.process_id, - v_profiling_runs.log_message, - v_profiling_runs.schema_name, - v_profiling_runs.table_ct, - v_profiling_runs.column_ct, - v_profiling_runs.anomaly_ct, + SELECT profiling_runs.id, + profiling_runs.profiling_starttime, + profiling_runs.profiling_endtime, + table_groups.table_groups_name, + profiling_runs.status, + profiling_runs.progress, + profiling_runs.process_id, + profiling_runs.log_message, + table_groups.table_group_schema, + profiling_runs.table_ct, + profiling_runs.column_ct, + profiling_runs.record_ct, + profiling_runs.data_point_ct, + profiling_runs.anomaly_ct, profile_anomalies.definite_ct AS anomalies_definite_ct, profile_anomalies.likely_ct AS anomalies_likely_ct, profile_anomalies.possible_ct AS anomalies_possible_ct, profile_anomalies.dismissed_ct AS 
anomalies_dismissed_ct, - v_profiling_runs.dq_score_profiling - FROM v_profiling_runs - LEFT JOIN profile_anomalies ON (v_profiling_runs.profiling_run_id = profile_anomalies.profile_run_id) - WHERE project_code = :project_code - {"AND v_profiling_runs.table_groups_id = :table_group_id" if table_group_id else ""} - {"AND v_profiling_runs.profiling_run_id IN :profiling_run_ids" if profiling_run_ids else ""} - ORDER BY start_time DESC; + profiling_runs.dq_score_profiling + FROM profiling_runs + LEFT JOIN table_groups ON (profiling_runs.table_groups_id = table_groups.id) + LEFT JOIN profile_anomalies ON (profiling_runs.id = profile_anomalies.profile_run_id) + WHERE profiling_runs.project_code = :project_code + {"AND profiling_runs.table_groups_id = :table_group_id" if table_group_id else ""} + {"AND profiling_runs.id IN :profiling_run_ids" if profiling_run_ids else ""} + ORDER BY profiling_starttime DESC; """ params = { "project_code": project_code, @@ -256,5 +275,22 @@ def clear_cache(cls) -> bool: cls.select_minimal_where.clear() cls.select_summary.clear() - def save(self) -> None: - raise NotImplementedError + def init_progress(self) -> None: + self._progress = { + "data_chars": {"label": "Refreshing data catalog"}, + "col_profiling": {"label": "Profiling columns"}, + "freq_analysis": {"label": "Running frequency analysis"}, + "hygiene_issues": {"label": "Detecting hygiene issues"}, + } + for key in self._progress: + self._progress[key].update({"key": key, "status": "Pending"}) + + def set_progress(self, key: ProgressKey, status: ProgressStatus, detail: str | None = None, error: str | None = None) -> None: + self._progress[key]["status"] = status + if detail: + self._progress[key]["detail"] = detail + if error: + self._progress[key]["error"] = error + + self.progress = list(self._progress.values()) + flag_modified(self, "progress") diff --git a/testgen/common/models/table_group.py b/testgen/common/models/table_group.py index 46e3da5..2520c83 100644 --- a/testgen/common/models/table_group.py +++ b/testgen/common/models/table_group.py @@ -30,17 +30,33 @@ class TableGroupMinimal(EntityMinimal): profiling_delay_days: str +@dataclass +class TableGroupStats(EntityMinimal): + id: UUID + table_groups_name: str + table_group_schema: str + table_ct: int + column_ct: int + approx_record_ct: int + record_ct: int + approx_data_point_ct: int + data_point_ct: int + + @dataclass class TableGroupSummary(EntityMinimal): id: UUID table_groups_name: str + table_ct: int + column_ct: int + approx_record_ct: int + record_ct: int + approx_data_point_ct: int + data_point_ct: int dq_score_profiling: float dq_score_testing: float latest_profile_id: UUID latest_profile_start: datetime - latest_profile_table_ct: int - latest_profile_column_ct: int - latest_profile_data_point_ct: int latest_anomalies_ct: int latest_anomalies_definite_ct: int latest_anomalies_likely_ct: int @@ -113,18 +129,61 @@ def select_minimal_where( ) -> Iterable[TableGroupMinimal]: results = cls._select_columns_where(cls._minimal_columns, *clauses, order_by=order_by) return [TableGroupMinimal(**row) for row in results] + + @classmethod + @st.cache_data(show_spinner=False) + def select_stats(cls, project_code: str, table_group_id: str | UUID | None = None) -> Iterable[TableGroupStats]: + query = f""" + WITH stats AS ( + SELECT table_groups_id, + COUNT(*) AS table_ct, + SUM(column_ct) AS column_ct, + SUM(approx_record_ct) AS approx_record_ct, + SUM(record_ct) AS record_ct, + SUM(column_ct * approx_record_ct) AS approx_data_point_ct, + SUM(column_ct * 
record_ct) AS data_point_ct + FROM data_table_chars + GROUP BY table_groups_id + ) + SELECT groups.id, + groups.table_groups_name, + groups.table_group_schema, + stats.table_ct, + stats.column_ct, + stats.approx_record_ct, + stats.record_ct, + stats.approx_data_point_ct, + stats.data_point_ct + FROM table_groups AS groups + LEFT JOIN stats ON (groups.id = stats.table_groups_id) + WHERE groups.project_code = :project_code + {"AND groups.id = :table_group_id" if table_group_id else ""} + ORDER BY LOWER(groups.table_groups_name); + """ + params = {"project_code": project_code, "table_group_id": table_group_id} + db_session = get_current_session() + results = db_session.execute(text(query), params).mappings().all() + return [TableGroupStats(**row) for row in results] @classmethod @st.cache_data(show_spinner=False) def select_summary(cls, project_code: str, for_dashboard: bool = False) -> Iterable[TableGroupSummary]: query = f""" - WITH latest_profile AS ( + WITH stats AS ( + SELECT table_groups_id, + COUNT(*) AS table_ct, + SUM(column_ct) AS column_ct, + SUM(approx_record_ct) AS approx_record_ct, + SUM(record_ct) AS record_ct, + SUM(column_ct * approx_record_ct) AS approx_data_point_ct, + SUM(column_ct * record_ct) AS data_point_ct + FROM data_table_chars + GROUP BY table_groups_id + ), + latest_profile AS ( SELECT latest_run.table_groups_id, latest_run.id, latest_run.profiling_starttime, - latest_run.table_ct, - latest_run.column_ct, - latest_run.dq_total_data_points, latest_run.anomaly_ct, SUM( CASE @@ -167,19 +226,23 @@ def select_summary(cls, project_code: str, for_dashboard: bool = False) -> Itera ) SELECT groups.id, groups.table_groups_name, + stats.table_ct, + stats.column_ct, + stats.approx_record_ct, + stats.record_ct, + stats.approx_data_point_ct, + stats.data_point_ct, groups.dq_score_profiling, groups.dq_score_testing, latest_profile.id AS latest_profile_id, latest_profile.profiling_starttime AS latest_profile_start, - latest_profile.table_ct AS latest_profile_table_ct, - latest_profile.column_ct AS latest_profile_column_ct, - latest_profile.dq_total_data_points AS latest_profile_data_point_ct, latest_profile.anomaly_ct AS latest_anomalies_ct, latest_profile.definite_ct AS latest_anomalies_definite_ct, latest_profile.likely_ct AS latest_anomalies_likely_ct, latest_profile.possible_ct AS latest_anomalies_possible_ct, latest_profile.dismissed_ct AS latest_anomalies_dismissed_ct FROM table_groups AS groups + LEFT JOIN stats ON (groups.id = stats.table_groups_id) LEFT JOIN latest_profile ON (groups.id = latest_profile.table_groups_id) WHERE groups.project_code = :project_code {"AND groups.include_in_dashboard IS TRUE" if for_dashboard else ""}; diff --git a/testgen/common/read_yaml_metadata_records.py b/testgen/common/read_yaml_metadata_records.py index 6361b2b..28f8cf5 100644 --- a/testgen/common/read_yaml_metadata_records.py +++ b/testgen/common/read_yaml_metadata_records.py @@ -164,7 +164,6 @@ def _process_yaml_for_import(params_mapping: dict, data:dict, parent_table:str, user_override=params_mapping["TESTGEN_ADMIN_USER"], password_override=params_mapping["TESTGEN_ADMIN_PASSWORD"], user_type="schema_admin", - suppress_logs=True, ) return diff --git a/testgen/template/profiling/contingency_columns.sql b/testgen/template/contingency/contingency_columns.sql similarity index 100% rename from testgen/template/profiling/contingency_columns.sql rename to testgen/template/contingency/contingency_columns.sql diff --git a/testgen/template/flavors/generic/profiling/contingency_counts.sql 
b/testgen/template/contingency/contingency_counts.sql similarity index 100% rename from testgen/template/flavors/generic/profiling/contingency_counts.sql rename to testgen/template/contingency/contingency_counts.sql diff --git a/testgen/template/data_chars/data_chars_staging_delete.sql b/testgen/template/data_chars/data_chars_staging_delete.sql index 292d722..82418a9 100644 --- a/testgen/template/data_chars/data_chars_staging_delete.sql +++ b/testgen/template/data_chars/data_chars_staging_delete.sql @@ -1,4 +1,3 @@ DELETE FROM stg_data_chars_updates -WHERE project_code = :PROJECT_CODE - AND table_groups_id = :TABLE_GROUPS_ID +WHERE table_groups_id = :TABLE_GROUPS_ID AND run_date = :RUN_DATE; diff --git a/testgen/template/data_chars/data_chars_update.sql b/testgen/template/data_chars/data_chars_update.sql index ec16d4e..448d07c 100644 --- a/testgen/template/data_chars/data_chars_update.sql +++ b/testgen/template/data_chars/data_chars_update.sql @@ -7,20 +7,19 @@ WITH new_chars AS ( SELECT table_groups_id, schema_name, table_name, - functional_table_type, run_date, + MAX(approx_record_ct) AS approx_record_ct, MAX(record_ct) AS record_ct, COUNT(*) AS column_ct - FROM {SOURCE_TABLE} + FROM stg_data_chars_updates WHERE table_groups_id = :TABLE_GROUPS_ID GROUP BY table_groups_id, schema_name, table_name, - functional_table_type, run_date ) UPDATE data_table_chars -SET functional_table_type = COALESCE(n.functional_table_type, d.functional_table_type), +SET approx_record_ct = n.approx_record_ct, record_ct = n.record_ct, column_ct = n.column_ct, last_refresh_date = n.run_date, @@ -38,34 +37,33 @@ WITH new_chars AS ( SELECT table_groups_id, schema_name, table_name, - functional_table_type, run_date, + MAX(approx_record_ct) AS approx_record_ct, MAX(record_ct) AS record_ct, COUNT(*) AS column_ct - FROM {SOURCE_TABLE} + FROM stg_data_chars_updates WHERE table_groups_id = :TABLE_GROUPS_ID GROUP BY table_groups_id, schema_name, table_name, - functional_table_type, run_date ) INSERT INTO data_table_chars ( table_groups_id, schema_name, table_name, - functional_table_type, add_date, last_refresh_date, + approx_record_ct, record_ct, column_ct ) SELECT n.table_groups_id, n.schema_name, n.table_name, - n.functional_table_type, n.run_date, n.run_date, + n.approx_record_ct, n.record_ct, n.column_ct FROM new_chars n @@ -81,7 +79,7 @@ WITH new_chars AS ( SELECT table_groups_id, schema_name, table_name - FROM {SOURCE_TABLE} + FROM stg_data_chars_updates WHERE table_groups_id = :TABLE_GROUPS_ID GROUP BY table_groups_id, schema_name, @@ -90,7 +88,7 @@ WITH new_chars AS ( last_run AS ( SELECT table_groups_id, MAX(run_date) as last_run_date - FROM {SOURCE_TABLE} + FROM stg_data_chars_updates WHERE table_groups_id = :TABLE_GROUPS_ID GROUP BY table_groups_id ) @@ -118,21 +116,17 @@ WITH new_chars AS ( table_name, column_name, position, - general_type, column_type, db_data_type, - functional_data_type, run_date - FROM {SOURCE_TABLE} + FROM stg_data_chars_updates WHERE table_groups_id = :TABLE_GROUPS_ID ), update_chars AS ( UPDATE data_column_chars SET ordinal_position = n.position, - general_type = n.general_type, column_type = n.column_type, db_data_type = n.db_data_type, - functional_data_type = COALESCE(n.functional_data_type, d.functional_data_type), last_mod_date = CASE WHEN n.db_data_type <> d.db_data_type THEN n.run_date ELSE d.last_mod_date END, drop_date = NULL FROM new_chars n @@ -172,9 +166,8 @@ WITH new_chars AS ( general_type, column_type, db_data_type, - functional_data_type, run_date - FROM {SOURCE_TABLE} 
+ FROM stg_data_chars_updates WHERE table_groups_id = :TABLE_GROUPS_ID ), inserted_records AS ( @@ -188,7 +181,6 @@ inserted_records AS ( general_type, column_type, db_data_type, - functional_data_type, add_date, last_mod_date ) @@ -201,7 +193,6 @@ inserted_records AS ( n.general_type, n.column_type, n.db_data_type, - n.functional_data_type, n.run_date, n.run_date FROM new_chars n @@ -237,13 +228,13 @@ WITH new_chars AS ( schema_name, table_name, column_name - FROM {SOURCE_TABLE} + FROM stg_data_chars_updates WHERE table_groups_id = :TABLE_GROUPS_ID ), last_run AS ( SELECT table_groups_id, MAX(run_date) as last_run_date - FROM {SOURCE_TABLE} + FROM stg_data_chars_updates WHERE table_groups_id = :TABLE_GROUPS_ID GROUP BY table_groups_id ), diff --git a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql index ad76f02..8892e51 100644 --- a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql +++ b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql @@ -30,18 +30,16 @@ CREATE TABLE stg_functional_table_updates ( ); CREATE TABLE stg_data_chars_updates ( - project_code VARCHAR(30), table_groups_id UUID, run_date TIMESTAMP, schema_name VARCHAR(120), table_name VARCHAR(120), - functional_table_type VARCHAR(50), column_name VARCHAR(120), position INTEGER, general_type VARCHAR(1), column_type VARCHAR(50), db_data_type VARCHAR(50), - functional_data_type VARCHAR(50), + approx_record_ct BIGINT, record_ct BIGINT ); @@ -134,9 +132,12 @@ CREATE TABLE profiling_runs ( profiling_starttime TIMESTAMP, profiling_endtime TIMESTAMP, status VARCHAR(100) DEFAULT 'Running', + progress JSONB, log_message VARCHAR, table_ct BIGINT, column_ct BIGINT, + record_ct BIGINT, + data_point_ct BIGINT, anomaly_ct BIGINT, anomaly_table_ct BIGINT, anomaly_column_ct BIGINT, @@ -235,7 +236,6 @@ CREATE TABLE profile_results ( dk_id BIGINT GENERATED ALWAYS AS IDENTITY, -- CONSTRAINT profile_results_dk_id_pk -- PRIMARY KEY, - column_id UUID, project_code VARCHAR(30), connection_id BIGINT CONSTRAINT profile_results_connections_connection_id_fk @@ -307,7 +307,8 @@ CREATE TABLE profile_results ( pii_flag VARCHAR(50), functional_data_type VARCHAR(50), functional_table_type VARCHAR(50), - sample_ratio FLOAT + sample_ratio FLOAT, + query_error VARCHAR(2000) ); ALTER SEQUENCE profile_results_dk_id_seq OWNED BY profile_results.dk_id; @@ -400,9 +401,9 @@ CREATE TABLE data_table_chars ( add_date TIMESTAMP, drop_date TIMESTAMP, last_refresh_date TIMESTAMP, + approx_record_ct BIGINT, record_ct BIGINT, column_ct BIGINT, - data_point_ct BIGINT GENERATED ALWAYS AS (record_ct * column_ct) STORED, last_complete_profile_run_id UUID, last_profile_record_ct BIGINT, dq_score_profiling FLOAT, diff --git a/testgen/template/dbsetup/060_create_standard_views.sql b/testgen/template/dbsetup/060_create_standard_views.sql index 0eea385..d5aac62 100644 --- a/testgen/template/dbsetup/060_create_standard_views.sql +++ b/testgen/template/dbsetup/060_create_standard_views.sql @@ -22,30 +22,6 @@ INNER JOIN profile_results r ON p.id = r.profile_run_id; -DROP VIEW IF EXISTS v_latest_profile_anomalies; - -CREATE VIEW v_latest_profile_anomalies - AS -WITH last_profile_date - AS (SELECT table_groups_id, MAX(profiling_starttime) as last_profile_run_date - FROM profiling_runs - GROUP BY table_groups_id) -SELECT r.id, r.project_code, r.table_groups_id, - r.profile_run_id, pr.profiling_starttime as profile_run_date, - r.schema_name, r.table_name, r.column_name, 
r.column_type, - t.anomaly_name, t.anomaly_description, t.issue_likelihood, - r.detail, - t.suggested_action, r.disposition - FROM profile_anomaly_results r -INNER JOIN profile_anomaly_types t - ON r.anomaly_id = t.id -INNER JOIN profiling_runs pr - ON (r.profile_run_id = pr.id) -INNER JOIN last_profile_date l - ON (pr.table_groups_id = l.table_groups_id - AND pr.profiling_starttime = l.last_profile_run_date); - - DROP VIEW IF EXISTS v_inactive_anomalies; CREATE VIEW v_inactive_anomalies @@ -55,59 +31,6 @@ SELECT DISTINCT anomaly_id, table_groups_id, schema_name, table_name, column_nam WHERE disposition = 'Inactive'; -DROP VIEW IF EXISTS v_profiling_runs; - -CREATE VIEW v_profiling_runs - AS -SELECT r.id as profiling_run_id, - r.project_code, cc.connection_name, r.connection_id, r.table_groups_id, - tg.table_groups_name, - tg.table_group_schema as schema_name, - r.profiling_starttime as start_time, - r.profiling_endtime as end_time, - r.status, - r.log_message, - r.table_ct, - r.column_ct, - r.anomaly_ct, r.anomaly_table_ct, r.anomaly_column_ct, - process_id, r.dq_score_profiling - FROM profiling_runs r -INNER JOIN table_groups tg - ON r.table_groups_id = tg.id -INNER JOIN connections cc - ON r.connection_id = cc.connection_id -GROUP BY r.id, r.project_code, cc.connection_name, r.connection_id, - r.table_groups_id, tg.table_groups_name, tg.table_group_schema, - r.profiling_starttime, r.profiling_endtime, r.status; - - -DROP VIEW IF EXISTS v_test_runs; - -CREATE VIEW v_test_runs - AS -SELECT r.id as test_run_id, - p.project_code, - p.project_name, - ts.test_suite, - r.test_starttime, - TO_CHAR(r.test_endtime - r.test_starttime, 'HH24:MI:SS') as duration, - r.status, r.log_message, - COUNT(*) as test_ct, - SUM(result_code) as passed_ct, - COALESCE(SUM(CASE WHEN tr.result_status = 'Failed' THEN 1 END), 0) as failed_ct, - COALESCE(SUM(CASE WHEN tr.result_status = 'Warning' THEN 1 END), 0) as warning_ct, - r.process_id - FROM test_runs r -INNER JOIN test_suites ts - ON (r.test_suite_id = ts.id) -INNER JOIN projects p - ON (ts.project_code = p.project_code) -INNER JOIN test_results tr - ON (r.id = tr.test_run_id) -GROUP BY r.id, p.project_code, ts.test_suite, r.test_starttime, r.test_endtime, - r.process_id, r.status, r.log_message, p.project_name; - - DROP VIEW IF EXISTS v_test_results; CREATE VIEW v_test_results diff --git a/testgen/template/dbupgrade/0157_incremental_upgrade.sql b/testgen/template/dbupgrade/0157_incremental_upgrade.sql new file mode 100644 index 0000000..99f792b --- /dev/null +++ b/testgen/template/dbupgrade/0157_incremental_upgrade.sql @@ -0,0 +1,25 @@ +SET SEARCH_PATH TO {SCHEMA_NAME}; + +DROP VIEW IF EXISTS v_latest_profile_results CASCADE; +DROP VIEW IF EXISTS v_latest_profile_anomalies; +DROP VIEW IF EXISTS v_profiling_runs; +DROP VIEW IF EXISTS v_test_runs; + +ALTER TABLE stg_data_chars_updates + DROP COLUMN project_code, + DROP COLUMN functional_table_type, + DROP COLUMN functional_data_type, + ADD COLUMN approx_record_ct BIGINT; + +ALTER TABLE data_table_chars + ADD COLUMN approx_record_ct BIGINT, + DROP COLUMN data_point_ct; + +ALTER TABLE profiling_runs + ADD COLUMN progress JSONB, + ADD COLUMN record_ct BIGINT, + ADD COLUMN data_point_ct BIGINT; + +ALTER TABLE profile_results + DROP COLUMN column_id, + ADD COLUMN query_error VARCHAR(2000); diff --git a/testgen/template/flavors/bigquery/data_chars/schema_ddf_query_bigquery.sql b/testgen/template/flavors/bigquery/data_chars/get_schema_ddf.sql similarity index 85% rename from 
testgen/template/flavors/bigquery/data_chars/schema_ddf_query_bigquery.sql rename to testgen/template/flavors/bigquery/data_chars/get_schema_ddf.sql index 1e3c93a..ee2165d 100644 --- a/testgen/template/flavors/bigquery/data_chars/schema_ddf_query_bigquery.sql +++ b/testgen/template/flavors/bigquery/data_chars/get_schema_ddf.sql @@ -1,6 +1,5 @@ -SELECT '{PROJECT_CODE}' AS project_code, - CURRENT_TIMESTAMP() AS refresh_timestamp, - c.table_schema, +SELECT + c.table_schema AS schema_name, c.table_name, c.column_name, CASE @@ -10,7 +9,6 @@ SELECT '{PROJECT_CODE}' AS project_code, ELSE LOWER(c.data_type) END AS column_type, c.data_type AS db_data_type, - NULL AS character_maximum_length, c.ordinal_position, CASE WHEN LOWER(c.data_type) = 'string' THEN 'A' @@ -21,7 +19,9 @@ SELECT '{PROJECT_CODE}' AS project_code, WHEN REGEXP_CONTAINS(LOWER(c.data_type), r'(decimal|numeric|bignumeric)') THEN 'N' ELSE 'X' END AS general_type, - REGEXP_CONTAINS(LOWER(c.data_type), r'(decimal|numeric|bignumeric)') AS is_decimal + REGEXP_CONTAINS(LOWER(c.data_type), r'(decimal|numeric|bignumeric)') AS is_decimal, + t.row_count AS approx_record_ct FROM `{DATA_SCHEMA}.INFORMATION_SCHEMA.COLUMNS` c + LEFT JOIN `{DATA_SCHEMA}.__TABLES__` t ON c.table_name = t.table_id WHERE c.table_schema = '{DATA_SCHEMA}' {TABLE_CRITERIA} ORDER BY c.table_schema, c.table_name, c.ordinal_position; diff --git a/testgen/template/flavors/bigquery/profiling/project_get_table_sample_count_bigquery.sql b/testgen/template/flavors/bigquery/profiling/project_get_table_sample_count_bigquery.sql deleted file mode 100644 index 4fdfcc6..0000000 --- a/testgen/template/flavors/bigquery/profiling/project_get_table_sample_count_bigquery.sql +++ /dev/null @@ -1,30 +0,0 @@ -WITH stats AS ( - SELECT - COUNT(*) * 1.0 AS record_ct, - ROUND(CAST({PROFILE_SAMPLE_PERCENT} AS FLOAT64) * COUNT(*) * 1.0 / 100.0) AS calc_sample_ct, - CAST({PROFILE_SAMPLE_MIN_COUNT} AS FLOAT64) AS min_sample_ct, - CAST(999000 AS FLOAT64) AS max_sample_ct - FROM `{SAMPLING_TABLE}` -) -SELECT '{SAMPLING_TABLE}' AS schema_table, - CASE - WHEN record_ct <= min_sample_ct THEN -1 - WHEN calc_sample_ct > max_sample_ct THEN max_sample_ct - WHEN calc_sample_ct > min_sample_ct THEN calc_sample_ct - ELSE {PROFILE_SAMPLE_MIN_COUNT} - END AS sample_count, - CASE - WHEN record_ct <= min_sample_ct THEN 1 - WHEN calc_sample_ct > max_sample_ct THEN record_ct / max_sample_ct - WHEN calc_sample_ct > min_sample_ct THEN record_ct / calc_sample_ct - ELSE record_ct / min_sample_ct - END AS sample_ratio, - ROUND( - CASE - WHEN record_ct <= min_sample_ct THEN 100 - WHEN calc_sample_ct > max_sample_ct THEN 100.0 * max_sample_ct / record_ct - WHEN calc_sample_ct > min_sample_ct THEN 100.0 * calc_sample_ct / record_ct - ELSE 100.0 * min_sample_ct / record_ct - END, - 4) AS sample_percent_calc -FROM stats; diff --git a/testgen/template/flavors/bigquery/profiling/project_profiling_query_bigquery.yaml b/testgen/template/flavors/bigquery/profiling/project_profiling_query.yaml similarity index 82% rename from testgen/template/flavors/bigquery/profiling/project_profiling_query_bigquery.yaml rename to testgen/template/flavors/bigquery/profiling/project_profiling_query.yaml index 5d0456a..03c7a4a 100644 --- a/testgen/template/flavors/bigquery/profiling/project_profiling_query_bigquery.yaml +++ b/testgen/template/flavors/bigquery/profiling/project_profiling_query.yaml @@ -1,15 +1,15 @@ --- -strTemplate01_sampling: | +01_sampling: | WITH target_table AS ( SELECT * FROM `{DATA_SCHEMA}.{DATA_TABLE}` WHERE RAND() * 100 < 
{SAMPLE_PERCENT_CALC} ) SELECT -strTemplate01_else: | +01_else: | WITH target_table AS ( SELECT * FROM `{DATA_SCHEMA}.{DATA_TABLE}` ) SELECT -strTemplate01_5: | +01_all: | {CONNECTION_ID} as connection_id, '{PROJECT_CODE}' as project_code, '{TABLE_GROUPS_ID}' as table_groups_id, @@ -22,29 +22,33 @@ strTemplate01_5: | '{DB_DATA_TYPE}' AS db_data_type, '{COL_GEN_TYPE}' AS general_type, COUNT(*) AS record_ct, -strTemplate02_X: | + +02_X: | COUNT(`{COL_NAME}`) AS value_ct, COUNT(DISTINCT `{COL_NAME}`) AS distinct_value_ct, SUM(IF(`{COL_NAME}` IS NULL, 1, 0)) AS null_value_ct, -strTemplate02_else: | +02_else: | COUNT(`{COL_NAME}`) AS value_ct, COUNT(DISTINCT `{COL_NAME}`) AS distinct_value_ct, SUM(IF(`{COL_NAME}` IS NULL, 1, 0)) AS null_value_ct, -strTemplate03_ADN: MIN(LENGTH(CAST(`{COL_NAME}` AS STRING))) AS min_length, + +03_ADN: MIN(LENGTH(CAST(`{COL_NAME}` AS STRING))) AS min_length, MAX(LENGTH(CAST(`{COL_NAME}` AS STRING))) AS max_length, AVG(NULLIF(LENGTH(CAST(`{COL_NAME}` AS STRING)), 0)) AS avg_length, -strTemplate03_else: NULL as min_length, +03_else: NULL as min_length, NULL as max_length, NULL as avg_length, -strTemplate04_A: SUM( + +04_A: SUM( CASE WHEN REGEXP_CONTAINS(TRIM(CAST(`{COL_NAME}` AS STRING)), r'^0(\.0*)?$') THEN 1 ELSE 0 END ) AS zero_value_ct, -strTemplate04_N: CAST(SUM(1 - ABS(SIGN(CAST(`{COL_NAME}` AS NUMERIC)))) AS INT64) AS zero_value_ct, -strTemplate04_else: NULL as zero_value_ct, -strTemplate05_A: | +04_N: CAST(SUM(1 - ABS(SIGN(CAST(`{COL_NAME}` AS NUMERIC)))) AS INT64) AS zero_value_ct, +04_else: NULL as zero_value_ct, + +05_A: | COUNT( DISTINCT UPPER( REGEXP_REPLACE(CAST(`{COL_NAME}` AS STRING), r"[ '\.,-]", "") @@ -115,7 +119,7 @@ strTemplate05_A: | AND SUBSTR(`{COL_NAME}`, 1, 3) <> '666' THEN 1 END), COUNT(`{COL_NAME}`)) > 0.9 THEN 'SSN' END AS std_pattern_match, -strTemplate05_else: NULL as distinct_std_value_ct, +05_else: NULL as distinct_std_value_ct, NULL as zero_length_ct, NULL as lead_space_ct, NULL as quoted_value_ct, @@ -130,7 +134,8 @@ strTemplate05_else: NULL as distinct_std_value_ct, NULL as numeric_ct, NULL as date_ct, NULL as std_pattern_match, -strTemplate06_A_patterns: | + +06_A: | ( SELECT LEFT(STRING_AGG(val, ' | ' ORDER BY ct DESC), 1000) AS top_patterns FROM ( @@ -150,7 +155,7 @@ strTemplate06_A_patterns: | AND ( SELECT MAX(LENGTH(CAST({COL_NAME} AS STRING))) FROM `target_table` - ) BETWEEN 3 AND {PARM_MAX_PATTERN_LENGTH} + ) BETWEEN 3 AND {MAX_PATTERN_LENGTH} ) p GROUP BY pattern HAVING pattern > ' ' @@ -159,27 +164,9 @@ strTemplate06_A_patterns: | ) ) ps ) as top_patterns, -strTemplate06_else: NULL as top_patterns, -strTemplate07_A_freq: | - ( - SELECT LEFT(STRING_AGG(val, ' | ' ORDER BY ct DESC), 1000) AS top_freq_values - FROM ( - SELECT CONCAT(CAST(ct AS STRING), ' | ', CAST({COL_NAME} AS STRING)) AS val, - ct - FROM ( - SELECT {COL_NAME}, - COUNT(*) AS ct - FROM `target_table` - WHERE {COL_NAME} > ' ' - GROUP BY {COL_NAME} - HAVING {COL_NAME} > ' ' - ORDER BY ct DESC, {COL_NAME} DESC - LIMIT 10 - ) - ) ps - ) as top_freq_values, -strTemplate07_else: NULL as top_freq_values, -strTemplate08_N: MIN(`{COL_NAME}`) AS min_value, +06_else: NULL as top_patterns, + +08_N: MIN(`{COL_NAME}`) AS min_value, MIN(CASE WHEN `{COL_NAME}` > 0 THEN `{COL_NAME}` ELSE NULL END) AS min_value_over_0, MAX(`{COL_NAME}`) AS max_value, AVG(CAST(`{COL_NAME}` AS FLOAT64)) AS avg_value, @@ -187,7 +174,7 @@ strTemplate08_N: MIN(`{COL_NAME}`) AS min_value, MIN(pct_25) AS percentile_25, MIN(pct_50) AS percentile_50, MIN(pct_75) AS percentile_75, -strTemplate08_else: 
NULL as min_value, +08_else: NULL as min_value, NULL as min_value_over_0, NULL as max_value, NULL as avg_value, @@ -195,9 +182,11 @@ strTemplate08_else: NULL as min_value, NULL as percentile_25, NULL as percentile_50, NULL as percentile_75, -strTemplate10_N_dec: SUM(COALESCE(ROUND(ABS(MOD(`{COL_NAME}`, 1)), 5), 0)) as fractional_sum, -strTemplate10_else: NULL as fractional_sum, -strTemplate11_D: | + +10_N_dec: SUM(COALESCE(ROUND(ABS(MOD(`{COL_NAME}`, 1)), 5), 0)) as fractional_sum, +10_else: NULL as fractional_sum, + +11_D: | MIN(`{COL_NAME}`) AS min_date, -- Other flavors have a minimum threshold of 0001-01-01, but BigQuery doesn't make it easy to to the same MAX(`{COL_NAME}`) as max_date, COUNT(CASE WHEN DATE_DIFF(SAFE_CAST(DATE('{RUN_DATE}') AS DATE), SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), MONTH) > 12 THEN 1 END) AS before_1yr_date_ct, @@ -211,8 +200,7 @@ strTemplate11_D: | COUNT(DISTINCT DATE_DIFF(SAFE_CAST(DATE('{RUN_DATE}') AS DATE), SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), DAY)) AS date_days_present, COUNT(DISTINCT DATE_DIFF(SAFE_CAST(DATE('{RUN_DATE}') AS DATE), SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), WEEK)) AS date_weeks_present, COUNT(DISTINCT DATE_DIFF(SAFE_CAST(DATE('{RUN_DATE}') AS DATE), SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), MONTH)) AS date_months_present, - -strTemplate11_else: NULL as min_date, +11_else: NULL as min_date, NULL as max_date, NULL as before_1yr_date_ct, NULL as before_5yr_date_ct, @@ -225,10 +213,11 @@ strTemplate11_else: NULL as min_date, NULL as date_days_present, NULL as date_weeks_present, NULL as date_months_present, -strTemplate12_B: SUM(CAST(`{COL_NAME}` AS INT64)) AS boolean_true_ct, -strTemplate12_else: NULL as boolean_true_ct, -strTemplate13_ALL: NULL AS datatype_suggestion, -strTemplate14_A_do_patterns: | + +12_B: SUM(CAST(`{COL_NAME}` AS INT64)) AS boolean_true_ct, +12_else: NULL as boolean_true_ct, + +14_A: | ( SELECT COUNT(DISTINCT REGEXP_REPLACE( @@ -243,31 +232,29 @@ strTemplate14_A_do_patterns: | ) as distinct_pattern_ct, SUM(CAST(SIGN(LENGTH(TRIM(`{COL_NAME}`)) - LENGTH(REPLACE(TRIM(`{COL_NAME}`), ' ', ''))) AS INT64)) AS embedded_space_ct, AVG(CAST(LENGTH(TRIM(`{COL_NAME}`)) - LENGTH(REPLACE(TRIM(`{COL_NAME}`), ' ', '')) AS FLOAT64)) AS avg_embedded_spaces, -strTemplate14_A_no_patterns: NULL as distinct_pattern_ct, - SUM(CAST(SIGN(LENGTH(TRIM(`{COL_NAME}`)) - LENGTH(REPLACE(TRIM(`{COL_NAME}`), ' ', ''))) AS INT64)) AS embedded_space_ct, - AVG(CAST(LENGTH(TRIM(`{COL_NAME}`)) - LENGTH(REPLACE(TRIM(`{COL_NAME}`), ' ', '')) AS FLOAT64)) AS avg_embedded_spaces, -strTemplate14_else: NULL as distinct_pattern_ct, +14_else: NULL as distinct_pattern_ct, NULL as embedded_space_ct, NULL as avg_embedded_spaces, -strTemplate15_ALL: NULL as functional_data_type, - NULL as functional_table_type, -strTemplate16_ALL: " '{PROFILE_RUN_ID}' as profile_run_id " -strTemplate98_sampling: ' FROM target_table' -strTemplate98_else: ' FROM target_table' -strTemplate99_N: | +16_all: " '{PROFILE_RUN_ID}' as profile_run_id " + +98_sampling: ' FROM target_table' +98_else: ' FROM target_table' + +99_N: | , (SELECT PERCENTILE_CONT(`{COL_NAME}`, 0.25) OVER() AS pct_25, PERCENTILE_CONT(`{COL_NAME}`, 0.50) OVER() AS pct_50, PERCENTILE_CONT(`{COL_NAME}`, 0.75) OVER() AS pct_75 FROM `{DATA_SCHEMA}.{DATA_TABLE}` LIMIT 1) pctile -strTemplate99_N_sampling: | +99_N_sampling: | , (SELECT APPROX_QUANTILES(`{COL_NAME}`, 100)[OFFSET(25)] AS pct_25, APPROX_QUANTILES(`{COL_NAME}`, 100)[OFFSET(50)] AS pct_50, APPROX_QUANTILES(`{COL_NAME}`, 100)[OFFSET(75)] AS pct_75 FROM 
`{DATA_SCHEMA}.{DATA_TABLE}` LIMIT 1) pctile -strTemplate99_else: ; -strTemplate100_sampling: ' ' +99_else: ; + +100_sampling: ' ' diff --git a/testgen/template/flavors/bigquery/profiling/project_secondary_profiling_query_bigquery.sql b/testgen/template/flavors/bigquery/profiling/project_secondary_profiling_query.sql similarity index 100% rename from testgen/template/flavors/bigquery/profiling/project_secondary_profiling_query_bigquery.sql rename to testgen/template/flavors/bigquery/profiling/project_secondary_profiling_query.sql diff --git a/testgen/template/flavors/databricks/data_chars/schema_ddf_query_databricks.sql b/testgen/template/flavors/databricks/data_chars/get_schema_ddf.sql similarity index 86% rename from testgen/template/flavors/databricks/data_chars/schema_ddf_query_databricks.sql rename to testgen/template/flavors/databricks/data_chars/get_schema_ddf.sql index 0cfb56f..6ae63a9 100644 --- a/testgen/template/flavors/databricks/data_chars/schema_ddf_query_databricks.sql +++ b/testgen/template/flavors/databricks/data_chars/get_schema_ddf.sql @@ -1,6 +1,5 @@ -SELECT '{PROJECT_CODE}' AS project_code, - CURRENT_TIMESTAMP AS refresh_timestamp, - c.table_schema, +SELECT + c.table_schema AS schema_name, c.table_name, c.column_name, CASE @@ -11,7 +10,6 @@ SELECT '{PROJECT_CODE}' AS project_code, ELSE lower(c.full_data_type) END AS column_type, c.full_data_type AS db_data_type, - c.character_maximum_length, c.ordinal_position, CASE WHEN c.data_type IN ('STRING', 'CHAR') THEN 'A' @@ -23,7 +21,8 @@ SELECT '{PROJECT_CODE}' AS project_code, CASE WHEN c.numeric_scale > 0 THEN 1 ELSE 0 - END AS is_decimal + END AS is_decimal, + NULL AS approx_record_ct -- table statistics unavailable FROM information_schema.columns c WHERE c.table_schema = '{DATA_SCHEMA}' {TABLE_CRITERIA} ORDER BY c.table_schema, c.table_name, c.ordinal_position; diff --git a/testgen/template/flavors/databricks/profiling/project_get_table_sample_count_databricks.sql b/testgen/template/flavors/databricks/profiling/project_get_table_sample_count_databricks.sql deleted file mode 100644 index 9a62c3d..0000000 --- a/testgen/template/flavors/databricks/profiling/project_get_table_sample_count_databricks.sql +++ /dev/null @@ -1,23 +0,0 @@ -WITH stats - AS (SELECT COUNT(*)::FLOAT as record_ct, - ROUND(CAST({PROFILE_SAMPLE_PERCENT} as FLOAT) * CAST(COUNT(*) as FLOAT) / 100.0) as calc_sample_ct, - CAST({PROFILE_SAMPLE_MIN_COUNT} as FLOAT) as min_sample_ct, - CAST(999000 as FLOAT) as max_sample_ct - FROM {SAMPLING_TABLE} ) -SELECT '{SAMPLING_TABLE}' as schema_table, - CASE WHEN record_ct <= min_sample_ct THEN -1 - WHEN calc_sample_ct > max_sample_ct THEN max_sample_ct - WHEN calc_sample_ct > min_sample_ct THEN calc_sample_ct - ELSE {PROFILE_SAMPLE_MIN_COUNT} - END as sample_count, - CASE WHEN record_ct <= min_sample_ct THEN 1 - WHEN calc_sample_ct > max_sample_ct THEN record_ct / max_sample_ct - WHEN calc_sample_ct > min_sample_ct THEN record_ct / calc_sample_ct - ELSE record_ct / min_sample_ct - END as sample_ratio, - ROUND(CASE WHEN record_ct <= min_sample_ct THEN 100 - WHEN calc_sample_ct > max_sample_ct THEN 100.0 * max_sample_ct / record_ct - WHEN calc_sample_ct > min_sample_ct THEN 100.0 * calc_sample_ct / record_ct - ELSE 100.0 * min_sample_ct / record_ct - END, 4) as sample_percent_calc - FROM stats; diff --git a/testgen/template/flavors/databricks/profiling/project_profiling_query_databricks.yaml b/testgen/template/flavors/databricks/profiling/project_profiling_query.yaml similarity index 83% rename from 
testgen/template/flavors/databricks/profiling/project_profiling_query_databricks.yaml rename to testgen/template/flavors/databricks/profiling/project_profiling_query.yaml index 4c2cbaa..32c41ab 100644 --- a/testgen/template/flavors/databricks/profiling/project_profiling_query_databricks.yaml +++ b/testgen/template/flavors/databricks/profiling/project_profiling_query.yaml @@ -1,7 +1,7 @@ --- -strTemplate01_sampling: "SELECT " -strTemplate01_else: "SELECT " -strTemplate01_5: | +01_sampling: "SELECT " +01_else: "SELECT " +01_all: | {CONNECTION_ID} as connection_id, '{PROJECT_CODE}' as project_code, '{TABLE_GROUPS_ID}' as table_groups_id, @@ -14,26 +14,30 @@ strTemplate01_5: | '{DB_DATA_TYPE}' AS db_data_type, '{COL_GEN_TYPE}' AS general_type, COUNT(*) AS record_ct, -strTemplate02_X: | + +02_X: | COUNT(`{COL_NAME}`) AS value_ct, COUNT(DISTINCT `{COL_NAME}`) AS distinct_value_ct, SUM(CASE WHEN `{COL_NAME}` IS NULL THEN 1 ELSE 0 END) AS null_value_ct, -strTemplate02_else: | +02_else: | COUNT(`{COL_NAME}`) AS value_ct, COUNT(DISTINCT `{COL_NAME}`) AS distinct_value_ct, SUM(CASE WHEN `{COL_NAME}` IS NULL THEN 1 ELSE 0 END) AS null_value_ct, -strTemplate03_ADN: MIN(LEN(`{COL_NAME}`)) AS min_length, + +03_ADN: MIN(LEN(`{COL_NAME}`)) AS min_length, MAX(LEN(`{COL_NAME}`)) AS max_length, AVG(CAST(NULLIF(LEN(`{COL_NAME}`), 0) AS FLOAT)) AS avg_length, -strTemplate03_else: NULL as min_length, +03_else: NULL as min_length, NULL as max_length, NULL as avg_length, -strTemplate04_A: SUM(CASE + +04_A: SUM(CASE WHEN LTRIM(RTRIM(`{COL_NAME}`)) RLIKE '0([.]0*)' THEN 1 ELSE 0 END) AS zero_value_ct, -strTemplate04_N: CAST(SUM( 1 - ABS(SIGN(`{COL_NAME}`)))AS BIGINT ) AS zero_value_ct, -strTemplate04_else: NULL as zero_value_ct, -strTemplate05_A: COUNT(DISTINCT UPPER(REPLACE(TRANSLATE(`{COL_NAME}`,' '''',.-',REPEAT(' ', LEN(' '''',.-'))),' ',''))) as distinct_std_value_ct, +04_N: CAST(SUM( 1 - ABS(SIGN(`{COL_NAME}`)))AS BIGINT ) AS zero_value_ct, +04_else: NULL as zero_value_ct, + +05_A: COUNT(DISTINCT UPPER(REPLACE(TRANSLATE(`{COL_NAME}`,' '''',.-',REPEAT(' ', LEN(' '''',.-'))),' ',''))) as distinct_std_value_ct, SUM(CASE WHEN `{COL_NAME}` = '' THEN 1 ELSE 0 @@ -118,7 +122,7 @@ strTemplate05_A: COUNT(DISTINCT UPPER(REPLACE(TRANSLATE(`{COL_NAME}`,' '''',.-', AND LEFT(`{COL_NAME}`, 3) NOT BETWEEN '734' AND '749' AND LEFT(`{COL_NAME}`, 3) <> '666' THEN 1 END) AS FLOAT)/CAST(COUNT(`{COL_NAME}`) AS FLOAT) > 0.9 THEN 'SSN' END as std_pattern_match, -strTemplate05_else: NULL as distinct_std_value_ct, +05_else: NULL as distinct_std_value_ct, NULL as zero_length_ct, NULL as lead_space_ct, NULL as quoted_value_ct, @@ -133,7 +137,8 @@ strTemplate05_else: NULL as distinct_std_value_ct, NULL as numeric_ct, NULL as date_ct, NULL as std_pattern_match, -strTemplate06_A_patterns: (SELECT CONCAT_WS(' | ', collect_list(ct_pattern)) + +06_A: (SELECT CONCAT_WS(' | ', collect_list(ct_pattern)) FROM ( SELECT TRANSLATE( @@ -152,21 +157,9 @@ strTemplate06_A_patterns: (SELECT CONCAT_WS(' | ', collect_list(ct_pattern)) ORDER BY ct DESC LIMIT 5 )) AS top_patterns, -strTemplate06_else: NULL as top_patterns, -strTemplate07_A_freq: ( SELECT LEFT(CONCAT_WS(' | ', collect_list(val)), 1000) as concat_vals - FROM ( - SELECT CAST(COUNT(*) as VARCHAR(10)) || ' | ' || `{COL_NAME}` as val, - COUNT(*) as ct - FROM `{DATA_SCHEMA}`.`{DATA_TABLE}` - WHERE `{COL_NAME}` > ' ' - GROUP BY `{COL_NAME}` - HAVING `{COL_NAME}` > ' ' - ORDER BY COUNT(*) DESC, val ASC - LIMIT 10 - ) ps - ) AS top_freq_values, -strTemplate07_else: NULL as top_freq_values, 
-strTemplate08_N: MIN(`{COL_NAME}`) AS min_value, +06_else: NULL as top_patterns, + +08_N: MIN(`{COL_NAME}`) AS min_value, MIN(CASE WHEN `{COL_NAME}` > 0 THEN `{COL_NAME}` ELSE NULL END) AS min_value_over_0, MAX(`{COL_NAME}`) AS max_value, AVG(CAST(`{COL_NAME}` AS FLOAT)) AS avg_value, @@ -174,7 +167,7 @@ strTemplate08_N: MIN(`{COL_NAME}`) AS min_value, MIN(pct_25) as percentile_25, MIN(pct_50) as percentile_50, MIN(pct_75) as percentile_75, -strTemplate08_else: NULL as min_value, +08_else: NULL as min_value, NULL as min_value_over_0, NULL as max_value, NULL as avg_value, @@ -182,11 +175,11 @@ strTemplate08_else: NULL as min_value, NULL as percentile_25, NULL as percentile_50, NULL as percentile_75, -strTemplate10_N_dec: SUM(ROUND(ABS(MOD(`{COL_NAME}`, 1)), 5)) as fractional_sum, -strTemplate10_else: NULL as fractional_sum, +10_N_dec: SUM(ROUND(ABS(MOD(`{COL_NAME}`, 1)), 5)) as fractional_sum, +10_else: NULL as fractional_sum, -strTemplate11_D: CASE +11_D: CASE WHEN MIN(`{COL_NAME}`) IS NULL THEN NULL ELSE CASE WHEN MIN(`{COL_NAME}`) >= CAST('0001-01-01' as date) THEN MIN(`{COL_NAME}`) ELSE CAST('0001-01-01' as date) END END as min_date, @@ -225,8 +218,7 @@ strTemplate11_D: CASE COUNT(DISTINCT <%DATEDIFF_DAY; `{COL_NAME}`; '{RUN_DATE}'::DATE%>) as date_days_present, COUNT(DISTINCT <%DATEDIFF_WEEK; `{COL_NAME}`; '{RUN_DATE}'::DATE%>) as date_weeks_present, COUNT(DISTINCT <%DATEDIFF_MONTH; `{COL_NAME}`; '{RUN_DATE}'::DATE%>) as date_months_present, - -strTemplate11_else: NULL as min_date, +11_else: NULL as min_date, NULL as max_date, NULL as before_1yr_date_ct, NULL as before_5yr_date_ct, @@ -240,12 +232,10 @@ strTemplate11_else: NULL as min_date, NULL as date_weeks_present, NULL as date_months_present, -strTemplate12_B: SUM(CAST(`{COL_NAME}` AS INTEGER)) AS boolean_true_ct, +12_B: SUM(CAST(`{COL_NAME}` AS INTEGER)) AS boolean_true_ct, +12_else: NULL as boolean_true_ct, -strTemplate12_else: NULL as boolean_true_ct, - -strTemplate13_ALL: NULL AS datatype_suggestion, -strTemplate14_A_do_patterns: ( SELECT COUNT(DISTINCT TRANSLATE(`{COL_NAME}`, +14_A: ( SELECT COUNT(DISTINCT TRANSLATE(`{COL_NAME}`, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN' ) @@ -254,38 +244,27 @@ strTemplate14_A_do_patterns: ( SELECT COUNT(DISTINCT TRANSLATE(`{COL_NAME}`, WHERE `{COL_NAME}` > ' ' ) AS distinct_pattern_ct, SUM(CAST(SIGN(LEN(TRIM(`{COL_NAME}`)) - LEN(REPLACE(TRIM(`{COL_NAME}`),' ',''))) AS BIGINT)) AS embedded_space_ct, AVG(CAST(LEN(TRIM(`{COL_NAME}`)) - LEN(REPLACE(TRIM(`{COL_NAME}`),' ','')) AS FLOAT)) AS avg_embedded_spaces, - -strTemplate14_A_no_patterns: NULL as distinct_pattern_ct, - SUM(CAST(SIGN(LEN(RTRIM(LTRIM(`{COL_NAME}`))) - LEN(REPLACE(RTRIM(LTRIM(`{COL_NAME}`)),' ',''))) AS BIGINT)) AS embedded_space_ct, - AVG(CAST(LEN(RTRIM(LTRIM(`{COL_NAME}`))) - LEN(REPLACE(RTRIM(LTRIM(`{COL_NAME}`)),' ','')) AS FLOAT)) AS avg_embedded_spaces, - -strTemplate14_else: NULL as distinct_pattern_ct, +14_else: NULL as distinct_pattern_ct, NULL as embedded_space_ct, NULL as avg_embedded_spaces, -strTemplate15_ALL: NULL as functional_data_type, - NULL as functional_table_type, - -strTemplate16_ALL: " '{PROFILE_RUN_ID}' as profile_run_id" +16_all: " '{PROFILE_RUN_ID}' as profile_run_id" -strTemplate98_sampling: ' FROM `{DATA_SCHEMA}`.`{DATA_TABLE}` TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT)' +98_sampling: ' FROM `{DATA_SCHEMA}`.`{DATA_TABLE}` TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT)' +98_else: ' FROM 
`{DATA_SCHEMA}`.`{DATA_TABLE}`' -strTemplate98_else: ' FROM `{DATA_SCHEMA}`.`{DATA_TABLE}`' - -strTemplate99_N: | +99_N: | , (SELECT PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY `{COL_NAME}`) OVER () AS pct_25, PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY `{COL_NAME}`) OVER () AS pct_50, PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY `{COL_NAME}`) OVER () AS pct_75 FROM `{DATA_SCHEMA}`.`{DATA_TABLE}` LIMIT 1) pctile - -strTemplate99_N_sampling: | +99_N_sampling: | , (SELECT PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY `{COL_NAME}`) OVER () AS pct_25, PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY `{COL_NAME}`) OVER () AS pct_50, PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY `{COL_NAME}`) OVER () AS pct_75 FROM `{DATA_SCHEMA}`.`{DATA_TABLE}` TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) LIMIT 1 ) pctile +99_else: ' ' -strTemplate99_else: ' ' - -strTemplate100_sampling: ' ' +100_sampling: ' ' diff --git a/testgen/template/flavors/databricks/profiling/project_secondary_profiling_query_databricks.sql b/testgen/template/flavors/databricks/profiling/project_secondary_profiling_query.sql similarity index 100% rename from testgen/template/flavors/databricks/profiling/project_secondary_profiling_query_databricks.sql rename to testgen/template/flavors/databricks/profiling/project_secondary_profiling_query.sql diff --git a/testgen/template/flavors/mssql/data_chars/schema_ddf_query_mssql.sql b/testgen/template/flavors/mssql/data_chars/get_schema_ddf.sql similarity index 78% rename from testgen/template/flavors/mssql/data_chars/schema_ddf_query_mssql.sql rename to testgen/template/flavors/mssql/data_chars/get_schema_ddf.sql index 6c44d4c..8a3ea74 100644 --- a/testgen/template/flavors/mssql/data_chars/schema_ddf_query_mssql.sql +++ b/testgen/template/flavors/mssql/data_chars/get_schema_ddf.sql @@ -1,6 +1,15 @@ -SELECT '{PROJECT_CODE}' as project_code, - CURRENT_TIMESTAMP as refresh_timestamp, - c.table_schema, +WITH approx_cts AS ( + SELECT SCHEMA_NAME(o.schema_id) AS schema_name, + o.name AS table_name, + SUM(p.rows) AS approx_record_ct + FROM sys.objects o + LEFT JOIN sys.partitions p ON p.object_id = o.object_id + WHERE p.index_id IN (0, 1) -- 0 = heap, 1 = clustered index + OR p.index_id IS NULL + GROUP BY o.schema_id, o.name +) +SELECT + c.table_schema AS schema_name, c.table_name, c.column_name, CASE @@ -21,7 +30,6 @@ SELECT '{PROJECT_CODE}' as project_code, THEN c.data_type + COALESCE('(' + CAST(c.numeric_precision AS VARCHAR) + ',' + CAST(c.numeric_scale AS VARCHAR) + ')', '') ELSE c.data_type END AS db_data_type, - c.character_maximum_length, c.ordinal_position, CASE WHEN LOWER(c.data_type) LIKE '%char%' @@ -40,7 +48,9 @@ SELECT '{PROJECT_CODE}' as project_code, ELSE 'X' END AS general_type, - CASE WHEN c.numeric_scale > 0 THEN 1 ELSE 0 END AS is_decimal + CASE WHEN c.numeric_scale > 0 THEN 1 ELSE 0 END AS is_decimal, + a.approx_record_ct AS approx_record_ct FROM information_schema.columns c + LEFT JOIN approx_cts a ON c.table_schema = a.schema_name AND c.table_name = a.table_name WHERE c.table_schema = '{DATA_SCHEMA}' {TABLE_CRITERIA} ORDER BY c.table_schema, c.table_name, c.ordinal_position; diff --git a/testgen/template/flavors/mssql/profiling/project_get_table_sample_count_mssql.sql b/testgen/template/flavors/mssql/profiling/project_get_table_sample_count_mssql.sql deleted file mode 100644 index b7ccafa..0000000 --- a/testgen/template/flavors/mssql/profiling/project_get_table_sample_count_mssql.sql +++ /dev/null @@ -1,23 +0,0 @@ -WITH stats - AS (SELECT CAST(COUNT(*) as FLOAT) as record_ct, - 
ROUND(CAST({PROFILE_SAMPLE_PERCENT} as FLOAT) * CAST(COUNT(*) as FLOAT) / 100.0, 0) as calc_sample_ct, - CAST({PROFILE_SAMPLE_MIN_COUNT} as FLOAT) as min_sample_ct, - CAST(999000 as FLOAT) as max_sample_ct - FROM {SAMPLING_TABLE} ) -SELECT '{SAMPLING_TABLE}' as schema_table, - CASE WHEN record_ct <= min_sample_ct THEN -1 - WHEN calc_sample_ct > max_sample_ct THEN max_sample_ct - WHEN calc_sample_ct > min_sample_ct THEN calc_sample_ct - ELSE {PROFILE_SAMPLE_MIN_COUNT} - END as sample_count, - CASE WHEN record_ct <= min_sample_ct THEN 1 - WHEN calc_sample_ct > max_sample_ct THEN record_ct / max_sample_ct - WHEN calc_sample_ct > min_sample_ct THEN record_ct / calc_sample_ct - ELSE record_ct / min_sample_ct - END as sample_ratio, - ROUND(CASE WHEN record_ct <= min_sample_ct THEN 100 - WHEN calc_sample_ct > max_sample_ct THEN 100.0 * max_sample_ct / record_ct - WHEN calc_sample_ct > min_sample_ct THEN 100.0 * calc_sample_ct / record_ct - ELSE 100.0 * min_sample_ct / record_ct - END, 4) as sample_percent_calc - FROM stats; diff --git a/testgen/template/flavors/mssql/profiling/project_profiling_query_mssql.yaml b/testgen/template/flavors/mssql/profiling/project_profiling_query.yaml similarity index 82% rename from testgen/template/flavors/mssql/profiling/project_profiling_query_mssql.yaml rename to testgen/template/flavors/mssql/profiling/project_profiling_query.yaml index 75ed459..58e04ca 100644 --- a/testgen/template/flavors/mssql/profiling/project_profiling_query_mssql.yaml +++ b/testgen/template/flavors/mssql/profiling/project_profiling_query.yaml @@ -1,7 +1,7 @@ --- -strTemplate01_sampling: "SELECT " -strTemplate01_else: "SELECT " -strTemplate01_5: | +01_sampling: "SELECT " +01_else: "SELECT " +01_all: | {CONNECTION_ID} as connection_id, '{PROJECT_CODE}' as project_code, '{TABLE_GROUPS_ID}' as table_groups_id, @@ -14,26 +14,30 @@ strTemplate01_5: | '{DB_DATA_TYPE}' AS db_data_type, '{COL_GEN_TYPE}' AS general_type, COUNT(*) AS record_ct, -strTemplate02_X: | + +02_X: | COUNT(CASE WHEN "{COL_NAME}" IS NOT NULL THEN 1 END) AS value_ct, NULL AS distinct_value_ct, SUM(CASE WHEN "{COL_NAME}" IS NULL THEN 1 ELSE 0 END) AS null_value_ct, -strTemplate02_else: | +02_else: | COUNT("{COL_NAME}") AS value_ct, COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, SUM(CASE WHEN "{COL_NAME}" IS NULL THEN 1 ELSE 0 END) AS null_value_ct, -strTemplate03_ADN: MIN(LEN("{COL_NAME}")) AS min_length, + +03_ADN: MIN(LEN("{COL_NAME}")) AS min_length, MAX(LEN("{COL_NAME}")) AS max_length, AVG(CAST(NULLIF(LEN("{COL_NAME}"), 0) AS FLOAT)) AS avg_length, -strTemplate03_else: NULL as min_length, +03_else: NULL as min_length, NULL as max_length, NULL as avg_length, -strTemplate04_A: SUM(CASE + +04_A: SUM(CASE WHEN LTRIM(RTRIM("{COL_NAME}")) LIKE '0([.]0*)' THEN 1 ELSE 0 END) AS zero_value_ct, -strTemplate04_N: CAST(SUM( 1 - ABS(SIGN("{COL_NAME}")))AS BIGINT ) AS zero_value_ct, -strTemplate04_else: NULL as zero_value_ct, -strTemplate05_A: COUNT(DISTINCT UPPER(REPLACE(TRANSLATE("{COL_NAME}",' '''',.-',REPLICATE(' ', LEN(' '''',.-'))),' ',''))) as distinct_std_value_ct, +04_N: CAST(SUM( 1 - ABS(SIGN("{COL_NAME}")))AS BIGINT ) AS zero_value_ct, +04_else: NULL as zero_value_ct, + +05_A: COUNT(DISTINCT UPPER(REPLACE(TRANSLATE("{COL_NAME}",' '''',.-',REPLICATE(' ', LEN(' '''',.-'))),' ',''))) as distinct_std_value_ct, SUM(CASE WHEN "{COL_NAME}" = '' THEN 1 ELSE 0 @@ -120,7 +124,7 @@ strTemplate05_A: COUNT(DISTINCT UPPER(REPLACE(TRANSLATE("{COL_NAME}",' '''',.-', AND LEFT("{COL_NAME}", 3) NOT BETWEEN '734' AND '749' AND 
LEFT("{COL_NAME}", 3) <> '666' THEN 1 END) AS FLOAT)/CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.9 THEN 'SSN' END as std_pattern_match, -strTemplate05_else: NULL as distinct_std_value_ct, +05_else: NULL as distinct_std_value_ct, NULL as zero_length_ct, NULL as lead_space_ct, NULL as quoted_value_ct, @@ -135,7 +139,8 @@ strTemplate05_else: NULL as distinct_std_value_ct, NULL as numeric_ct, NULL as date_ct, NULL as std_pattern_match, -strTemplate06_A_patterns: ( SELECT LEFT(STRING_AGG(pattern, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1000) AS concat_pats + +06_A: ( SELECT LEFT(STRING_AGG(pattern, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1000) AS concat_pats FROM ( SELECT TOP 5 CAST(COUNT(*) AS VARCHAR(40)) + ' | ' + pattern AS pattern, COUNT(*) AS ct @@ -145,25 +150,14 @@ strTemplate06_A_patterns: ( SELECT LEFT(STRING_AGG(pattern, ' | ') WITHIN GROUP AS pattern FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WITH (NOLOCK) WHERE "{COL_NAME}" > ' ' AND ((SELECT MAX(LEN("{COL_NAME}")) - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WITH (NOLOCK)) BETWEEN 3 and {PARM_MAX_PATTERN_LENGTH})) p + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WITH (NOLOCK)) BETWEEN 3 and {MAX_PATTERN_LENGTH})) p GROUP BY pattern HAVING pattern > ' ' ORDER BY COUNT(*) DESC ) ps) AS top_patterns, -strTemplate06_else: NULL as top_patterns, -strTemplate07_A_freq: ( SELECT LEFT(STRING_AGG(val, ' | ') WITHIN GROUP (ORDER BY ct DESC, val ASC), 1000) as concat_vals - FROM ( - SELECT TOP 10 CAST(COUNT(*) as VARCHAR(10)) + ' | ' + "{COL_NAME}" as val, - COUNT(*) as ct - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WITH (NOLOCK) - WHERE "{COL_NAME}" > ' ' - GROUP BY "{COL_NAME}" - HAVING "{COL_NAME}" > ' ' - ORDER BY COUNT(*) DESC - ) ps - ) AS top_freq_values, -strTemplate07_else: NULL as top_freq_values, -strTemplate08_N: MIN("{COL_NAME}") AS min_value, +06_else: NULL as top_patterns, + +08_N: MIN("{COL_NAME}") AS min_value, MIN(CASE WHEN "{COL_NAME}" > 0 THEN "{COL_NAME}" ELSE NULL END) AS min_value_over_0, MAX("{COL_NAME}") AS max_value, AVG(CAST("{COL_NAME}" AS FLOAT)) AS avg_value, @@ -171,7 +165,7 @@ strTemplate08_N: MIN("{COL_NAME}") AS min_value, MIN(pct_25) as percentile_25, MIN(pct_50) as percentile_50, MIN(pct_75) as percentile_75, -strTemplate08_else: NULL as min_value, +08_else: NULL as min_value, NULL as min_value_over_0, NULL as max_value, NULL as avg_value, @@ -179,11 +173,11 @@ strTemplate08_else: NULL as min_value, NULL as percentile_25, NULL as percentile_50, NULL as percentile_75, -strTemplate10_N_dec: SUM(ROUND(ABS(("{COL_NAME}" % 1)), 5)) as fractional_sum, -strTemplate10_else: NULL as fractional_sum, +10_N_dec: SUM(ROUND(ABS(("{COL_NAME}" % 1)), 5)) as fractional_sum, +10_else: NULL as fractional_sum, -strTemplate11_D: CASE +11_D: CASE WHEN MIN("{COL_NAME}") IS NULL THEN NULL ELSE CASE WHEN MIN("{COL_NAME}") >= CAST('0001-01-01' as date) THEN MIN("{COL_NAME}") ELSE CAST('0001-01-01' as date) END END as min_date, @@ -222,8 +216,7 @@ strTemplate11_D: CASE COUNT(DISTINCT DATEDIFF(day, "{COL_NAME}", '{RUN_DATE}' ) ) as date_days_present, COUNT(DISTINCT DATEDIFF(week, "{COL_NAME}", '{RUN_DATE}' ) ) as date_weeks_present, COUNT(DISTINCT DATEDIFF(month, "{COL_NAME}", '{RUN_DATE}' ) ) as date_months_present, - -strTemplate11_else: NULL as min_date, +11_else: NULL as min_date, NULL as max_date, NULL as before_1yr_date_ct, NULL as before_5yr_date_ct, @@ -237,12 +230,10 @@ strTemplate11_else: NULL as min_date, NULL as date_weeks_present, NULL as date_months_present, -strTemplate12_B: SUM(CAST("{COL_NAME}" AS INTEGER)) AS boolean_true_ct, - -strTemplate12_else: 
NULL as boolean_true_ct, +12_B: SUM(CAST("{COL_NAME}" AS INTEGER)) AS boolean_true_ct, +12_else: NULL as boolean_true_ct, -strTemplate13_ALL: NULL AS datatype_suggestion, -strTemplate14_A_do_patterns: ( SELECT COUNT(DISTINCT TRANSLATE("{COL_NAME}" COLLATE Latin1_General_BIN, +14_A: ( SELECT COUNT(DISTINCT TRANSLATE("{COL_NAME}" COLLATE Latin1_General_BIN, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN' ) @@ -251,38 +242,27 @@ strTemplate14_A_do_patterns: ( SELECT COUNT(DISTINCT TRANSLATE("{COL_NAME}" CO WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct, SUM(CAST(SIGN(LEN(RTRIM(LTRIM("{COL_NAME}"))) - LEN(REPLACE(RTRIM(LTRIM("{COL_NAME}")),' ',''))) AS BIGINT)) AS embedded_space_ct, AVG(CAST(LEN(RTRIM(LTRIM("{COL_NAME}"))) - LEN(REPLACE(RTRIM(LTRIM("{COL_NAME}")),' ','')) AS FLOAT)) AS avg_embedded_spaces, - -strTemplate14_A_no_patterns: NULL as distinct_pattern_ct, - SUM(CAST(SIGN(LEN(RTRIM(LTRIM("{COL_NAME}"))) - LEN(REPLACE(RTRIM(LTRIM("{COL_NAME}")),' ',''))) AS BIGINT)) AS embedded_space_ct, - AVG(CAST(LEN(RTRIM(LTRIM("{COL_NAME}"))) - LEN(REPLACE(RTRIM(LTRIM("{COL_NAME}")),' ','')) AS FLOAT)) AS avg_embedded_spaces, - -strTemplate14_else: NULL as distinct_pattern_ct, +14_else: NULL as distinct_pattern_ct, NULL as embedded_space_ct, NULL as avg_embedded_spaces, -strTemplate15_ALL: NULL as functional_data_type, - NULL as functional_table_type, - -strTemplate16_ALL: " '{PROFILE_RUN_ID}' as profile_run_id" +16_all: " '{PROFILE_RUN_ID}' as profile_run_id" -strTemplate98_sampling: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) WITH (NOLOCK)' +98_sampling: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) WITH (NOLOCK)' +98_else: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WITH (NOLOCK)' -strTemplate98_else: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WITH (NOLOCK)' - -strTemplate99_N: | +99_N: | , (SELECT TOP 1 PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25, PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50, PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75 FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WITH (NOLOCK)) pctile - -strTemplate99_N_sampling: | +99_N_sampling: | , (SELECT TOP 1 PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25, PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50, PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75 FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) WITH (NOLOCK)) pctile +99_else: ' ' -strTemplate99_else: ' ' - -strTemplate100_sampling: ' ' +100_sampling: ' ' diff --git a/testgen/template/flavors/mssql/profiling/project_secondary_profiling_query_mssql.sql b/testgen/template/flavors/mssql/profiling/project_secondary_profiling_query.sql similarity index 100% rename from testgen/template/flavors/mssql/profiling/project_secondary_profiling_query_mssql.sql rename to testgen/template/flavors/mssql/profiling/project_secondary_profiling_query.sql diff --git a/testgen/template/flavors/postgresql/data_chars/schema_ddf_query_postgresql.sql b/testgen/template/flavors/postgresql/data_chars/get_schema_ddf.sql similarity index 87% rename from testgen/template/flavors/postgresql/data_chars/schema_ddf_query_postgresql.sql rename to testgen/template/flavors/postgresql/data_chars/get_schema_ddf.sql index 24d5077..b5fcc32 100644 --- 
a/testgen/template/flavors/postgresql/data_chars/schema_ddf_query_postgresql.sql +++ b/testgen/template/flavors/postgresql/data_chars/get_schema_ddf.sql @@ -1,6 +1,5 @@ -SELECT '{PROJECT_CODE}' as project_code, - CURRENT_TIMESTAMP AT TIME ZONE 'UTC' as refresh_timestamp, - c.table_schema, +SELECT + c.table_schema AS schema_name, c.table_name, c.column_name, CASE @@ -25,8 +24,6 @@ SELECT '{PROJECT_CODE}' as project_code, THEN c.data_type || COALESCE('(' || CAST(c.datetime_precision AS VARCHAR) || ')', '') ELSE c.data_type END AS db_data_type, - COALESCE(c.character_maximum_length, CASE WHEN c.data_type IN ('text', 'character varying') THEN 65535 END) - as character_maximum_length, c.ordinal_position, CASE WHEN c.data_type ILIKE '%char%' or c.data_type = 'text' @@ -46,7 +43,10 @@ SELECT '{PROJECT_CODE}' as project_code, CASE WHEN c.data_type = 'numeric' THEN COALESCE(numeric_scale, 1) > 0 ELSE numeric_scale > 0 - END as is_decimal + END as is_decimal, + NULLIF(p.reltuples::BIGINT, -1) AS approx_record_ct FROM information_schema.columns c + LEFT JOIN pg_namespace n ON c.table_schema = n.nspname + LEFT JOIN pg_class p ON n.oid = p.relnamespace AND c.table_name = p.relname WHERE c.table_schema = '{DATA_SCHEMA}' {TABLE_CRITERIA} ORDER BY c.table_schema, c.table_name, c.ordinal_position diff --git a/testgen/template/flavors/postgresql/profiling/project_get_table_sample_count_postgresql.sql b/testgen/template/flavors/postgresql/profiling/project_get_table_sample_count_postgresql.sql deleted file mode 100644 index 6939bae..0000000 --- a/testgen/template/flavors/postgresql/profiling/project_get_table_sample_count_postgresql.sql +++ /dev/null @@ -1,23 +0,0 @@ -WITH stats - AS (SELECT COUNT(*)::FLOAT as record_ct, - ROUND(CAST({PROFILE_SAMPLE_PERCENT} as FLOAT) * CAST(COUNT(*) as FLOAT) / 100.0) as calc_sample_ct, - CAST({PROFILE_SAMPLE_MIN_COUNT} as FLOAT) as min_sample_ct, - CAST(999000 as FLOAT) as max_sample_ct - FROM {SAMPLING_TABLE} ) -SELECT '{SAMPLING_TABLE}' as schema_table, - CASE WHEN record_ct <= min_sample_ct THEN -1 - WHEN calc_sample_ct > max_sample_ct THEN max_sample_ct - WHEN calc_sample_ct > min_sample_ct THEN calc_sample_ct - ELSE {PROFILE_SAMPLE_MIN_COUNT} - END as sample_count, - CASE WHEN record_ct <= min_sample_ct THEN 1 - WHEN calc_sample_ct > max_sample_ct THEN record_ct / max_sample_ct - WHEN calc_sample_ct > min_sample_ct THEN record_ct / calc_sample_ct - ELSE record_ct / min_sample_ct - END as sample_ratio, - ROUND(CASE WHEN record_ct <= min_sample_ct THEN 100 - WHEN calc_sample_ct > max_sample_ct THEN 100.0 * max_sample_ct / record_ct - WHEN calc_sample_ct > min_sample_ct THEN 100.0 * calc_sample_ct / record_ct - ELSE 100.0 * min_sample_ct / record_ct - END::NUMERIC, 4) as sample_percent_calc - FROM stats; diff --git a/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml b/testgen/template/flavors/postgresql/profiling/project_profiling_query.yaml similarity index 80% rename from testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml rename to testgen/template/flavors/postgresql/profiling/project_profiling_query.yaml index 6bf6631..a9e65d0 100644 --- a/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml +++ b/testgen/template/flavors/postgresql/profiling/project_profiling_query.yaml @@ -1,7 +1,7 @@ --- -strTemplate01_sampling: "SELECT " -strTemplate01_else: "SELECT " -strTemplate01_5: | +01_sampling: "SELECT " +01_else: "SELECT " +01_all: | {CONNECTION_ID} as connection_id, 
'{PROJECT_CODE}' as project_code, '{TABLE_GROUPS_ID}' as table_groups_id, @@ -14,26 +14,30 @@ strTemplate01_5: | '{DB_DATA_TYPE}' AS db_data_type, '{COL_GEN_TYPE}' AS general_type, COUNT(*) AS record_ct, -strTemplate02_X: | + +02_X: | COUNT("{COL_NAME}") AS value_ct, COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, SUM(CASE WHEN "{COL_NAME}" IS NULL THEN 1 ELSE 0 END) AS null_value_ct, -strTemplate02_else: | +02_else: | COUNT("{COL_NAME}") AS value_ct, COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, SUM(CASE WHEN "{COL_NAME}" IS NULL THEN 1 ELSE 0 END) AS null_value_ct, -strTemplate03_ADN: MIN(LENGTH(CAST("{COL_NAME}" AS TEXT))) AS min_length, + +03_ADN: MIN(LENGTH(CAST("{COL_NAME}" AS TEXT))) AS min_length, MAX(LENGTH(CAST("{COL_NAME}" AS TEXT))) AS max_length, AVG(NULLIF(LENGTH(CAST("{COL_NAME}" AS TEXT)), 0)::FLOAT) AS avg_length, -strTemplate03_else: NULL as min_length, +03_else: NULL as min_length, NULL as max_length, NULL as avg_length, -strTemplate04_A: SUM(CASE + +04_A: SUM(CASE WHEN TRIM("{COL_NAME}") ~ '^0(\.0*)?$' THEN 1 ELSE 0 END) AS zero_value_ct, -strTemplate04_N: SUM( 1 - ABS(SIGN("{COL_NAME}"::NUMERIC)) )::BIGINT AS zero_value_ct, -strTemplate04_else: NULL as zero_value_ct, -strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) as distinct_std_value_ct, +04_N: SUM( 1 - ABS(SIGN("{COL_NAME}"::NUMERIC)) )::BIGINT AS zero_value_ct, +04_else: NULL as zero_value_ct, + +05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) as distinct_std_value_ct, SUM(CASE WHEN "{COL_NAME}" = '' THEN 1 ELSE 0 @@ -96,7 +100,7 @@ strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) a AND LEFT("{COL_NAME}", 3) NOT BETWEEN '734' AND '749' AND LEFT("{COL_NAME}", 3) <> '666' THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'SSN' END as std_pattern_match, -strTemplate05_else: NULL as distinct_std_value_ct, +05_else: NULL as distinct_std_value_ct, NULL as zero_length_ct, NULL as lead_space_ct, NULL as quoted_value_ct, @@ -111,7 +115,8 @@ strTemplate05_else: NULL as distinct_std_value_ct, NULL as numeric_ct, NULL as date_ct, NULL as std_pattern_match, -strTemplate06_A_patterns: ( SELECT LEFT(STRING_AGG(pattern, ' | ' ORDER BY ct DESC) , 1000) AS concat_pats + +06_A: ( SELECT LEFT(STRING_AGG(pattern, ' | ' ORDER BY ct DESC) , 1000) AS concat_pats FROM ( SELECT CAST(COUNT(*) AS VARCHAR(40)) || ' | ' || pattern AS pattern, COUNT(*) AS ct @@ -121,26 +126,15 @@ strTemplate06_A_patterns: ( SELECT LEFT(STRING_AGG(pattern, ' | ' ORDER BY ct DE '[0-9]', 'N', 'g') AS pattern FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WHERE "{COL_NAME}" > ' ' AND (SELECT MAX(LENGTH("{COL_NAME}")) - FROM "{DATA_SCHEMA}"."{DATA_TABLE}") BETWEEN 3 and {PARM_MAX_PATTERN_LENGTH}) p + FROM "{DATA_SCHEMA}"."{DATA_TABLE}") BETWEEN 3 and {MAX_PATTERN_LENGTH}) p GROUP BY pattern HAVING pattern > ' ' ORDER BY COUNT(*) DESC LIMIT 5 ) ps) AS top_patterns, -strTemplate06_else: NULL as top_patterns, -strTemplate07_A_freq: ( SELECT LEFT(STRING_AGG(val, ' | ' ORDER BY ct DESC), 1000) as concat_vals - FROM ( - SELECT TOP 10 CAST(COUNT(*) as VARCHAR(10)) || ' | ' || "{COL_NAME}" as val, - COUNT(*) as ct - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" - WHERE "{COL_NAME}" > ' ' - GROUP BY "{COL_NAME}" - HAVING "{COL_NAME}" > ' ' - ORDER BY COUNT(*), "{COL_NAME}" DESC - ) ps - ) AS top_freq_values, -strTemplate07_else: NULL as top_freq_values, -strTemplate08_N: MIN("{COL_NAME}") AS min_value, +06_else: NULL as top_patterns, + +08_N: MIN("{COL_NAME}") AS min_value, MIN(CASE WHEN 
"{COL_NAME}"::NUMERIC > 0 THEN "{COL_NAME}" ELSE NULL END) AS min_value_over_0, MAX("{COL_NAME}") AS max_value, AVG(CAST("{COL_NAME}"::NUMERIC AS FLOAT)) AS avg_value, @@ -148,7 +142,7 @@ strTemplate08_N: MIN("{COL_NAME}") AS min_value, MIN(pct_25) as percentile_25, MIN(pct_50) as percentile_50, MIN(pct_75) as percentile_75, -strTemplate08_else: NULL as min_value, +08_else: NULL as min_value, NULL as min_value_over_0, NULL as max_value, NULL as avg_value, @@ -156,11 +150,11 @@ strTemplate08_else: NULL as min_value, NULL as percentile_25, NULL as percentile_50, NULL as percentile_75, -strTemplate10_N_dec: SUM(ROUND(ABS(MOD("{COL_NAME}", 1)), 5)) as fractional_sum, -strTemplate10_else: NULL as fractional_sum, +10_N_dec: SUM(ROUND(ABS(MOD("{COL_NAME}", 1)), 5)) as fractional_sum, +10_else: NULL as fractional_sum, -strTemplate11_D: CASE +11_D: CASE WHEN MIN("{COL_NAME}") IS NULL THEN NULL ELSE GREATEST(MIN("{COL_NAME}"), '0001-01-01') END as min_date, @@ -199,9 +193,7 @@ strTemplate11_D: CASE COUNT(DISTINCT <%DATEDIFF_DAY;"{COL_NAME}";'{RUN_DATE}'%>) as date_days_present, COUNT(DISTINCT <%DATEDIFF_WEEK;"{COL_NAME}";'{RUN_DATE}'%>) as date_weeks_present, COUNT(DISTINCT <%DATEDIFF_MONTH;"{COL_NAME}";'{RUN_DATE}'%>) as date_months_present, - - -strTemplate11_else: NULL as min_date, +11_else: NULL as min_date, NULL as max_date, NULL as before_1yr_date_ct, NULL as before_5yr_date_ct, @@ -215,12 +207,10 @@ strTemplate11_else: NULL as min_date, NULL as date_weeks_present, NULL as date_months_present, -strTemplate12_B: SUM(CAST("{COL_NAME}" AS INTEGER)) AS boolean_true_ct, - -strTemplate12_else: NULL as boolean_true_ct, +12_B: SUM(CAST("{COL_NAME}" AS INTEGER)) AS boolean_true_ct, +12_else: NULL as boolean_true_ct, -strTemplate13_ALL: NULL AS datatype_suggestion, -strTemplate14_A_do_patterns: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPLACE( REGEXP_REPLACE( +14_A: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPLACE( REGEXP_REPLACE( "{COL_NAME}", '[a-z]', 'a', 'g'), '[A-Z]', 'A', 'g'), '[0-9]', 'N', 'g') @@ -229,38 +219,27 @@ strTemplate14_A_do_patterns: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPL WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct, SUM(SIGN(LENGTH(TRIM("{COL_NAME}")) - LENGTH(REGEXP_REPLACE(TRIM("{COL_NAME}"), ' ', '', 'g')))::BIGINT) AS embedded_space_ct, AVG(LENGTH(TRIM("{COL_NAME}")) - LENGTH(REGEXP_REPLACE(TRIM("{COL_NAME}"), ' ', '', 'g'))::FLOAT) AS avg_embedded_spaces, - -strTemplate14_A_no_patterns: NULL as distinct_pattern_ct, - SUM(SIGN(LENGTH(TRIM("{COL_NAME}")) - LENGTH(REGEXP_REPLACE(TRIM("{COL_NAME}"), ' ', '', 'g')))::BIGINT) AS embedded_space_ct, - AVG(LENGTH(TRIM("{COL_NAME}")) - LENGTH(REGEXP_REPLACE(TRIM("{COL_NAME}"), ' ', '', 'g'))::FLOAT) AS avg_embedded_spaces, - -strTemplate14_else: NULL as distinct_pattern_ct, +14_else: NULL as distinct_pattern_ct, NULL as embedded_space_ct, NULL as avg_embedded_spaces, -strTemplate15_ALL: NULL as functional_data_type, - NULL as functional_table_type, +16_all: " '{PROFILE_RUN_ID}' as profile_run_id" -strTemplate16_ALL: " '{PROFILE_RUN_ID}' as profile_run_id" +98_sampling: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE BERNOULLI ({SAMPLE_PERCENT_CALC}) REPEATABLE (64)' +98_else: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" ' -strTemplate98_sampling: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE BERNOULLI ({SAMPLE_PERCENT_CALC}) REPEATABLE (64)' - -strTemplate98_else: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" ' - -strTemplate99_N: | +99_N: | , (SELECT PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}"::NUMERIC) 
AS pct_25, PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}"::NUMERIC) AS pct_50, PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}"::NUMERIC) AS pct_75 FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile - -strTemplate99_N_sampling: | +99_N_sampling: | , (SELECT PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}"::NUMERIC) AS pct_25, PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}"::NUMERIC) AS pct_50, PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}"::NUMERIC) AS pct_75 FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE BERNOULLI ({SAMPLE_PERCENT_CALC}) REPEATABLE (64) LIMIT 1) pctile +99_else: ' ' -strTemplate99_else: ' ' - -strTemplate100_sampling: ' ' +100_sampling: ' ' diff --git a/testgen/template/flavors/postgresql/profiling/project_secondary_profiling_query_postgresql.sql b/testgen/template/flavors/postgresql/profiling/project_secondary_profiling_query.sql similarity index 100% rename from testgen/template/flavors/postgresql/profiling/project_secondary_profiling_query_postgresql.sql rename to testgen/template/flavors/postgresql/profiling/project_secondary_profiling_query.sql diff --git a/testgen/template/flavors/redshift/data_chars/schema_ddf_query_redshift.sql b/testgen/template/flavors/redshift/data_chars/get_schema_ddf.sql similarity index 85% rename from testgen/template/flavors/redshift/data_chars/schema_ddf_query_redshift.sql rename to testgen/template/flavors/redshift/data_chars/get_schema_ddf.sql index d54ba38..6bda34e 100644 --- a/testgen/template/flavors/redshift/data_chars/schema_ddf_query_redshift.sql +++ b/testgen/template/flavors/redshift/data_chars/get_schema_ddf.sql @@ -1,6 +1,5 @@ -SELECT '{PROJECT_CODE}' as project_code, - CURRENT_TIMESTAMP AT TIME ZONE 'UTC' as refresh_timestamp, - c.table_schema, +SELECT + c.table_schema AS schema_name, c.table_name, c.column_name, CASE @@ -20,7 +19,6 @@ SELECT '{PROJECT_CODE}' as project_code, || CAST(c.numeric_scale AS VARCHAR) || ')', '') ELSE c.data_type END AS db_data_type, - c.character_maximum_length, c.ordinal_position, CASE WHEN c.data_type ILIKE 'char%' @@ -40,7 +38,13 @@ SELECT '{PROJECT_CODE}' as project_code, CASE WHEN c.data_type = 'numeric' THEN COALESCE(numeric_scale, 1) > 0 ELSE numeric_scale > 0 - END AS is_decimal + END AS is_decimal, + CASE + WHEN reltuples > 0 AND reltuples < 1 THEN NULL + ELSE reltuples::BIGINT + END AS approx_record_ct FROM information_schema.columns c + LEFT JOIN pg_namespace n ON c.table_schema = n.nspname + LEFT JOIN pg_class p ON n.oid = p.relnamespace AND c.table_name = p.relname WHERE c.table_schema = '{DATA_SCHEMA}' {TABLE_CRITERIA} ORDER BY c.table_schema, c.table_name, c.ordinal_position diff --git a/testgen/template/flavors/redshift/profiling/project_get_table_sample_count_redshift.sql b/testgen/template/flavors/redshift/profiling/project_get_table_sample_count_redshift.sql deleted file mode 100644 index 9a62c3d..0000000 --- a/testgen/template/flavors/redshift/profiling/project_get_table_sample_count_redshift.sql +++ /dev/null @@ -1,23 +0,0 @@ -WITH stats - AS (SELECT COUNT(*)::FLOAT as record_ct, - ROUND(CAST({PROFILE_SAMPLE_PERCENT} as FLOAT) * CAST(COUNT(*) as FLOAT) / 100.0) as calc_sample_ct, - CAST({PROFILE_SAMPLE_MIN_COUNT} as FLOAT) as min_sample_ct, - CAST(999000 as FLOAT) as max_sample_ct - FROM {SAMPLING_TABLE} ) -SELECT '{SAMPLING_TABLE}' as schema_table, - CASE WHEN record_ct <= min_sample_ct THEN -1 - WHEN calc_sample_ct > max_sample_ct THEN max_sample_ct - WHEN calc_sample_ct > min_sample_ct THEN calc_sample_ct - ELSE 
{PROFILE_SAMPLE_MIN_COUNT} - END as sample_count, - CASE WHEN record_ct <= min_sample_ct THEN 1 - WHEN calc_sample_ct > max_sample_ct THEN record_ct / max_sample_ct - WHEN calc_sample_ct > min_sample_ct THEN record_ct / calc_sample_ct - ELSE record_ct / min_sample_ct - END as sample_ratio, - ROUND(CASE WHEN record_ct <= min_sample_ct THEN 100 - WHEN calc_sample_ct > max_sample_ct THEN 100.0 * max_sample_ct / record_ct - WHEN calc_sample_ct > min_sample_ct THEN 100.0 * calc_sample_ct / record_ct - ELSE 100.0 * min_sample_ct / record_ct - END, 4) as sample_percent_calc - FROM stats; diff --git a/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml b/testgen/template/flavors/redshift/profiling/project_profiling_query.yaml similarity index 78% rename from testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml rename to testgen/template/flavors/redshift/profiling/project_profiling_query.yaml index 8ee6eed..7cbbfd4 100644 --- a/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml +++ b/testgen/template/flavors/redshift/profiling/project_profiling_query.yaml @@ -1,7 +1,7 @@ --- -strTemplate01_sampling: "SELECT " -strTemplate01_else: "SELECT " -strTemplate01_5: | +01_sampling: "SELECT " +01_else: "SELECT " +01_all: | {CONNECTION_ID} as connection_id, '{PROJECT_CODE}' as project_code, '{TABLE_GROUPS_ID}' as table_groups_id, @@ -14,24 +14,28 @@ strTemplate01_5: | '{DB_DATA_TYPE}' AS db_data_type, '{COL_GEN_TYPE}' AS general_type, COUNT(*) AS record_ct, -strTemplate02_X: | + +02_X: | COUNT("{COL_NAME}") AS value_ct, COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct, -strTemplate02_else: | +02_else: | COUNT("{COL_NAME}") AS value_ct, COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct, -strTemplate03_ADN: MIN(LEN("{COL_NAME}")) AS min_length, + +03_ADN: MIN(LEN("{COL_NAME}")) AS min_length, MAX(LEN("{COL_NAME}")) AS max_length, AVG(NULLIF(LEN("{COL_NAME}"), 0)::FLOAT) AS avg_length, -strTemplate03_else: NULL as min_length, +03_else: NULL as min_length, NULL as max_length, NULL as avg_length, -strTemplate04_A: COUNT( CASE WHEN TRIM("{COL_NAME}") ~ '^0(\.0*)?$' THEN 1 END) AS zero_value_ct, -strTemplate04_N: SUM( 1 - ABS(SIGN("{COL_NAME}")) )::BIGINT AS zero_value_ct, -strTemplate04_else: NULL as zero_value_ct, -strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) as distinct_std_value_ct, + +04_A: COUNT( CASE WHEN TRIM("{COL_NAME}") ~ '^0(\.0*)?$' THEN 1 END) AS zero_value_ct, +04_N: SUM( 1 - ABS(SIGN("{COL_NAME}")) )::BIGINT AS zero_value_ct, +04_else: NULL as zero_value_ct, + +05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) as distinct_std_value_ct, COUNT( CASE WHEN "{COL_NAME}" = '' THEN 1 END) AS zero_length_ct, COUNT( CASE WHEN "{COL_NAME}" BETWEEN ' !' AND '!' 
THEN 1 END ) AS lead_space_ct, COUNT( CASE WHEN "{COL_NAME}" ILIKE '"%"' OR "{COL_NAME}" ILIKE '''%''' THEN 1 END ) as quoted_value_ct, @@ -76,7 +80,7 @@ strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) a AND LEFT("{COL_NAME}", 3) NOT BETWEEN '734' AND '749' AND LEFT("{COL_NAME}", 3) <> '666' THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'SSN' END as std_pattern_match, -strTemplate05_else: NULL as distinct_std_value_ct, +05_else: NULL as distinct_std_value_ct, NULL as zero_length_ct, NULL as lead_space_ct, NULL as quoted_value_ct, @@ -91,7 +95,8 @@ strTemplate05_else: NULL as distinct_std_value_ct, NULL as numeric_ct, NULL as date_ct, NULL as std_pattern_match, -strTemplate06_A_patterns: (SELECT LEFT(LISTAGG(pattern, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1000) AS concat_pats + +06_A: (SELECT LEFT(LISTAGG(pattern, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1000) AS concat_pats FROM ( SELECT TOP 5 CAST(COUNT(*) AS VARCHAR(40)) || ' | ' || pattern AS pattern, COUNT(*) AS ct FROM ( SELECT REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( @@ -100,24 +105,13 @@ strTemplate06_A_patterns: (SELECT LEFT(LISTAGG(pattern, ' | ') WITHIN GROUP (ORD '[0-9]', 'N') AS pattern FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WHERE "{COL_NAME}" > ' ' AND (SELECT MAX(LEN("{COL_NAME}")) - FROM "{DATA_SCHEMA}"."{DATA_TABLE}") BETWEEN 3 and {PARM_MAX_PATTERN_LENGTH}) p + FROM "{DATA_SCHEMA}"."{DATA_TABLE}") BETWEEN 3 and {MAX_PATTERN_LENGTH}) p GROUP BY pattern HAVING pattern > ' ' ORDER BY COUNT(*) DESC) as ps) AS top_patterns, -strTemplate06_else: NULL as top_patterns, -strTemplate07_A_freq: ( SELECT LEFT(LISTAGG(val, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1000) as concat_vals - FROM ( - SELECT TOP 10 CAST(COUNT(*) as VARCHAR(10)) || ' | ' || "{COL_NAME}" as val, - COUNT(*) as ct - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" - WHERE "{COL_NAME}" > ' ' - GROUP BY "{COL_NAME}" - HAVING "{COL_NAME}" > ' ' - ORDER BY COUNT(*), "{COL_NAME}" DESC - ) ps - ) AS top_freq_values, -strTemplate07_else: NULL as top_freq_values, -strTemplate08_N: MIN("{COL_NAME}") AS min_value, +06_else: NULL as top_patterns, + +08_N: MIN("{COL_NAME}") AS min_value, MIN(CASE WHEN "{COL_NAME}" > 0 THEN "{COL_NAME}" ELSE NULL END) AS min_value_over_0, MAX("{COL_NAME}") AS max_value, AVG(CAST("{COL_NAME}" AS FLOAT)) AS avg_value, @@ -125,7 +119,7 @@ strTemplate08_N: MIN("{COL_NAME}") AS min_value, MIN(pct_25) as percentile_25, MIN(pct_50) as percentile_50, MIN(pct_75) as percentile_75, -strTemplate08_else: NULL as min_value, +08_else: NULL as min_value, NULL as min_value_over_0, NULL as max_value, NULL as avg_value, @@ -133,11 +127,11 @@ strTemplate08_else: NULL as min_value, NULL as percentile_25, NULL as percentile_50, NULL as percentile_75, -strTemplate10_N_dec: SUM(ROUND(ABS(MOD("{COL_NAME}", 1)), 5)) as fractional_sum, -strTemplate10_else: NULL as fractional_sum, +10_N_dec: SUM(ROUND(ABS(MOD("{COL_NAME}", 1)), 5)) as fractional_sum, +10_else: NULL as fractional_sum, -strTemplate11_D: CASE +11_D: CASE WHEN MIN("{COL_NAME}") IS NULL THEN NULL ELSE GREATEST(MIN("{COL_NAME}"), '0001-01-01') END as min_date, @@ -153,8 +147,7 @@ strTemplate11_D: CASE COUNT(DISTINCT DATEDIFF(day, "{COL_NAME}"::DATE, '{RUN_DATE}' ) ) as date_days_present, COUNT(DISTINCT DATEDIFF(week, "{COL_NAME}"::DATE, '{RUN_DATE}' ) ) as date_weeks_present, COUNT(DISTINCT DATEDIFF(month, "{COL_NAME}"::DATE, '{RUN_DATE}' ) ) as date_months_present, - -strTemplate11_else: NULL as min_date, +11_else: NULL as min_date, NULL as max_date, NULL as before_1yr_date_ct, 
NULL as before_5yr_date_ct, @@ -168,12 +161,10 @@ strTemplate11_else: NULL as min_date, NULL as date_weeks_present, NULL as date_months_present, -strTemplate12_B: SUM(CAST("{COL_NAME}" AS INTEGER)) AS boolean_true_ct, - -strTemplate12_else: NULL as boolean_true_ct, +12_B: SUM(CAST("{COL_NAME}" AS INTEGER)) AS boolean_true_ct, +12_else: NULL as boolean_true_ct, -strTemplate13_ALL: NULL AS datatype_suggestion, -strTemplate14_A_do_patterns: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPLACE( REGEXP_REPLACE( +14_A: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPLACE( REGEXP_REPLACE( "{COL_NAME}", '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') @@ -182,38 +173,27 @@ strTemplate14_A_do_patterns: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPL WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct, SUM(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"), ' '))::BIGINT) AS embedded_space_ct, AVG(REGEXP_COUNT(TRIM("{COL_NAME}"), ' ')::FLOAT) AS avg_embedded_spaces, - -strTemplate14_A_no_patterns: NULL as distinct_pattern_ct, - SUM(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"), ' '))::BIGINT) AS embedded_space_ct, - AVG(REGEXP_COUNT(TRIM("{COL_NAME}"), ' ')::FLOAT) AS avg_embedded_spaces, - -strTemplate14_else: NULL as distinct_pattern_ct, +14_else: NULL as distinct_pattern_ct, NULL as embedded_space_ct, NULL as avg_embedded_spaces, -strTemplate15_ALL: NULL as functional_data_type, - NULL as functional_table_type, +16_all: " '{PROFILE_RUN_ID}' as profile_run_id" -strTemplate16_ALL: " '{PROFILE_RUN_ID}' as profile_run_id" +98_sampling: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" ' +98_else: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}"' -strTemplate98_sampling: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" ' - -strTemplate98_else: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}"' - -strTemplate99_N: | +99_N: | , (SELECT PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25, PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50, PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75 FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile - -strTemplate99_N_sampling: | +99_N_sampling: | , (SELECT PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25, PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50, PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75 FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile +99_else: ' ' -strTemplate99_else: ' ' - -strTemplate100_sampling: 'WHERE RAND() <= 1.0 / {PROFILE_SAMPLE_RATIO}' +100_sampling: 'WHERE RAND() <= 1.0 / {PROFILE_SAMPLE_RATIO}' diff --git a/testgen/template/flavors/redshift/profiling/project_secondary_profiling_query_redshift.sql b/testgen/template/flavors/redshift/profiling/project_secondary_profiling_query.sql similarity index 100% rename from testgen/template/flavors/redshift/profiling/project_secondary_profiling_query_redshift.sql rename to testgen/template/flavors/redshift/profiling/project_secondary_profiling_query.sql diff --git a/testgen/template/flavors/redshift_spectrum/data_chars/schema_ddf_query_redshift_spectrum.sql b/testgen/template/flavors/redshift_spectrum/data_chars/get_schema_ddf.sql similarity index 76% rename from testgen/template/flavors/redshift_spectrum/data_chars/schema_ddf_query_redshift_spectrum.sql rename to testgen/template/flavors/redshift_spectrum/data_chars/get_schema_ddf.sql index 76ded62..3a6669f 100644 --- a/testgen/template/flavors/redshift_spectrum/data_chars/schema_ddf_query_redshift_spectrum.sql +++ 
b/testgen/template/flavors/redshift_spectrum/data_chars/get_schema_ddf.sql @@ -1,14 +1,9 @@ -SELECT '{PROJECT_CODE}' AS project_code, - CURRENT_TIMESTAMP AT TIME ZONE 'UTC' AS refresh_timestamp, - c.schemaname AS table_schema, +SELECT + c.schemaname AS schema_name, c.tablename AS table_name, c.columnname AS column_name, c.external_type AS column_type, c.external_type AS db_data_type, - NULLIF( - REGEXP_SUBSTR(c.external_type, 'char\\(([0-9]+)\\)', 1, 1, 'e'), - '' - ) AS character_maximum_length, c.columnnum AS ordinal_position, CASE WHEN c.external_type = 'string' @@ -29,7 +24,8 @@ SELECT '{PROJECT_CODE}' AS project_code, WHEN REGEXP_SUBSTR(c.external_type, 'decimal\\([0-9]+,([0-9]+)\\)', 1, 1, 'e') > 0 THEN 1 ELSE 0 - END AS is_decimal + END AS is_decimal, + NULL AS approx_record_ct -- Table statistics unavailable FROM svv_external_columns c WHERE c.schemaname = '{DATA_SCHEMA}' {TABLE_CRITERIA} diff --git a/testgen/template/flavors/redshift_spectrum/profiling/project_get_table_sample_count_redshift_spectrum.sql b/testgen/template/flavors/redshift_spectrum/profiling/project_get_table_sample_count_redshift_spectrum.sql deleted file mode 100644 index 9a62c3d..0000000 --- a/testgen/template/flavors/redshift_spectrum/profiling/project_get_table_sample_count_redshift_spectrum.sql +++ /dev/null @@ -1,23 +0,0 @@ -WITH stats - AS (SELECT COUNT(*)::FLOAT as record_ct, - ROUND(CAST({PROFILE_SAMPLE_PERCENT} as FLOAT) * CAST(COUNT(*) as FLOAT) / 100.0) as calc_sample_ct, - CAST({PROFILE_SAMPLE_MIN_COUNT} as FLOAT) as min_sample_ct, - CAST(999000 as FLOAT) as max_sample_ct - FROM {SAMPLING_TABLE} ) -SELECT '{SAMPLING_TABLE}' as schema_table, - CASE WHEN record_ct <= min_sample_ct THEN -1 - WHEN calc_sample_ct > max_sample_ct THEN max_sample_ct - WHEN calc_sample_ct > min_sample_ct THEN calc_sample_ct - ELSE {PROFILE_SAMPLE_MIN_COUNT} - END as sample_count, - CASE WHEN record_ct <= min_sample_ct THEN 1 - WHEN calc_sample_ct > max_sample_ct THEN record_ct / max_sample_ct - WHEN calc_sample_ct > min_sample_ct THEN record_ct / calc_sample_ct - ELSE record_ct / min_sample_ct - END as sample_ratio, - ROUND(CASE WHEN record_ct <= min_sample_ct THEN 100 - WHEN calc_sample_ct > max_sample_ct THEN 100.0 * max_sample_ct / record_ct - WHEN calc_sample_ct > min_sample_ct THEN 100.0 * calc_sample_ct / record_ct - ELSE 100.0 * min_sample_ct / record_ct - END, 4) as sample_percent_calc - FROM stats; diff --git a/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query_redshift_spectrum.yaml b/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query.yaml similarity index 78% rename from testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query_redshift_spectrum.yaml rename to testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query.yaml index 80b7a58..cfdbb33 100644 --- a/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query_redshift_spectrum.yaml +++ b/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query.yaml @@ -1,7 +1,7 @@ --- -strTemplate01_sampling: "SELECT " -strTemplate01_else: "SELECT " -strTemplate01_5: | +01_sampling: "SELECT " +01_else: "SELECT " +01_all: | {CONNECTION_ID} as connection_id, '{PROJECT_CODE}' as project_code, '{TABLE_GROUPS_ID}' as table_groups_id, @@ -14,24 +14,28 @@ strTemplate01_5: | '{DB_DATA_TYPE}' AS db_data_type, '{COL_GEN_TYPE}' AS general_type, COUNT(*) AS record_ct, -strTemplate02_X: | + +02_X: | COUNT("{COL_NAME}") AS value_ct, COUNT(DISTINCT "{COL_NAME}") 
AS distinct_value_ct, SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct, -strTemplate02_else: | +02_else: | COUNT("{COL_NAME}") AS value_ct, COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct, -strTemplate03_ADN: MIN(LEN("{COL_NAME}")) AS min_length, + +03_ADN: MIN(LEN("{COL_NAME}")) AS min_length, MAX(LEN("{COL_NAME}")) AS max_length, AVG(NULLIF(LEN("{COL_NAME}"), 0)::FLOAT) AS avg_length, -strTemplate03_else: NULL as min_length, +03_else: NULL as min_length, NULL as max_length, NULL as avg_length, -strTemplate04_A: COUNT( CASE WHEN TRIM("{COL_NAME}") ~ '^0(\.0*)?$' THEN 1 END) AS zero_value_ct, -strTemplate04_N: SUM( 1 - ABS(SIGN("{COL_NAME}")) )::BIGINT AS zero_value_ct, -strTemplate04_else: NULL as zero_value_ct, -strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) as distinct_std_value_ct, + +04_A: COUNT( CASE WHEN TRIM("{COL_NAME}") ~ '^0(\.0*)?$' THEN 1 END) AS zero_value_ct, +04_N: SUM( 1 - ABS(SIGN("{COL_NAME}")) )::BIGINT AS zero_value_ct, +04_else: NULL as zero_value_ct, + +05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) as distinct_std_value_ct, COUNT( CASE WHEN "{COL_NAME}" = '' THEN 1 END) AS zero_length_ct, COUNT( CASE WHEN "{COL_NAME}" BETWEEN ' !' AND '!' THEN 1 END ) AS lead_space_ct, COUNT( CASE WHEN "{COL_NAME}" ILIKE '"%"' OR "{COL_NAME}" ILIKE '''%''' THEN 1 END ) as quoted_value_ct, @@ -76,7 +80,7 @@ strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) a AND LEFT("{COL_NAME}", 3) NOT BETWEEN '734' AND '749' AND LEFT("{COL_NAME}", 3) <> '666' THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'SSN' END as std_pattern_match, -strTemplate05_else: NULL as distinct_std_value_ct, +05_else: NULL as distinct_std_value_ct, NULL as zero_length_ct, NULL as lead_space_ct, NULL as quoted_value_ct, @@ -91,7 +95,8 @@ strTemplate05_else: NULL as distinct_std_value_ct, NULL as numeric_ct, NULL as date_ct, NULL as std_pattern_match, -strTemplate06_A_patterns: (SELECT LEFT(LISTAGG(pattern, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1000) AS concat_pats + +06_A: (SELECT LEFT(LISTAGG(pattern, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1000) AS concat_pats FROM ( SELECT TOP 5 CAST(COUNT(*) AS VARCHAR(40)) || ' | ' || pattern AS pattern, COUNT(*) AS ct FROM ( SELECT REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( @@ -100,24 +105,13 @@ strTemplate06_A_patterns: (SELECT LEFT(LISTAGG(pattern, ' | ') WITHIN GROUP (ORD '[0-9]', 'N') AS pattern FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WHERE "{COL_NAME}" > ' ' AND (SELECT MAX(LEN("{COL_NAME}")) - FROM "{DATA_SCHEMA}"."{DATA_TABLE}") BETWEEN 3 and {PARM_MAX_PATTERN_LENGTH}) p + FROM "{DATA_SCHEMA}"."{DATA_TABLE}") BETWEEN 3 and {MAX_PATTERN_LENGTH}) p GROUP BY pattern HAVING pattern > ' ' ORDER BY COUNT(*) DESC) as ps) AS top_patterns, -strTemplate06_else: NULL as top_patterns, -strTemplate07_A_freq: ( SELECT LEFT(LISTAGG(val, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1000) as concat_vals - FROM ( - SELECT TOP 10 CAST(COUNT(*) as VARCHAR(10)) || ' | ' || "{COL_NAME}" as val, - COUNT(*) as ct - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" - WHERE "{COL_NAME}" > ' ' - GROUP BY "{COL_NAME}" - HAVING "{COL_NAME}" > ' ' - ORDER BY COUNT(*), "{COL_NAME}" DESC - ) ps - ) AS top_freq_values, -strTemplate07_else: NULL as top_freq_values, -strTemplate08_N: MIN("{COL_NAME}") AS min_value, +06_else: NULL as top_patterns, + +08_N: MIN("{COL_NAME}") AS min_value, MIN(CASE WHEN "{COL_NAME}" > 0 THEN "{COL_NAME}" ELSE NULL END) AS min_value_over_0, MAX("{COL_NAME}") 
AS max_value, AVG(CAST("{COL_NAME}" AS FLOAT)) AS avg_value, @@ -125,7 +119,7 @@ strTemplate08_N: MIN("{COL_NAME}") AS min_value, MIN(pct_25) as percentile_25, MIN(pct_50) as percentile_50, MIN(pct_75) as percentile_75, -strTemplate08_else: NULL as min_value, +08_else: NULL as min_value, NULL as min_value_over_0, NULL as max_value, NULL as avg_value, @@ -133,11 +127,11 @@ strTemplate08_else: NULL as min_value, NULL as percentile_25, NULL as percentile_50, NULL as percentile_75, -strTemplate10_N_dec: SUM(ROUND(ABS(MOD("{COL_NAME}", 1)), 5)) as fractional_sum, -strTemplate10_else: NULL as fractional_sum, +10_N_dec: SUM(ROUND(ABS(MOD("{COL_NAME}", 1)), 5)) as fractional_sum, +10_else: NULL as fractional_sum, -strTemplate11_D: CASE +11_D: CASE WHEN MIN("{COL_NAME}") IS NULL THEN NULL ELSE GREATEST(MIN("{COL_NAME}"), '0001-01-01') END as min_date, @@ -153,8 +147,7 @@ strTemplate11_D: CASE COUNT(DISTINCT DATEDIFF(day, "{COL_NAME}", '{RUN_DATE}' ) ) as date_days_present, COUNT(DISTINCT DATEDIFF(week, "{COL_NAME}", '{RUN_DATE}' ) ) as date_weeks_present, COUNT(DISTINCT DATEDIFF(month, "{COL_NAME}", '{RUN_DATE}' ) ) as date_months_present, - -strTemplate11_else: NULL as min_date, +11_else: NULL as min_date, NULL as max_date, NULL as before_1yr_date_ct, NULL as before_5yr_date_ct, @@ -168,12 +161,10 @@ strTemplate11_else: NULL as min_date, NULL as date_weeks_present, NULL as date_months_present, -strTemplate12_B: SUM(CAST("{COL_NAME}" AS INTEGER)) AS boolean_true_ct, - -strTemplate12_else: NULL as boolean_true_ct, +12_B: SUM(CAST("{COL_NAME}" AS INTEGER)) AS boolean_true_ct, +12_else: NULL as boolean_true_ct, -strTemplate13_ALL: NULL AS datatype_suggestion, -strTemplate14_A_do_patterns: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPLACE( REGEXP_REPLACE( +14_A: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPLACE( REGEXP_REPLACE( "{COL_NAME}", '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') @@ -182,38 +173,27 @@ strTemplate14_A_do_patterns: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPL WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct, SUM(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"), ' '))::BIGINT) AS embedded_space_ct, AVG(REGEXP_COUNT(TRIM("{COL_NAME}"), ' ')::FLOAT) AS avg_embedded_spaces, - -strTemplate14_A_no_patterns: NULL as distinct_pattern_ct, - SUM(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"), ' '))::BIGINT) AS embedded_space_ct, - AVG(REGEXP_COUNT(TRIM("{COL_NAME}"), ' ')::FLOAT) AS avg_embedded_spaces, - -strTemplate14_else: NULL as distinct_pattern_ct, +14_else: NULL as distinct_pattern_ct, NULL as embedded_space_ct, NULL as avg_embedded_spaces, -strTemplate15_ALL: NULL as functional_data_type, - NULL as functional_table_type, +16_all: " '{PROFILE_RUN_ID}' as profile_run_id" -strTemplate16_ALL: " '{PROFILE_RUN_ID}' as profile_run_id" +98_sampling: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" ' +98_else: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}"' -strTemplate98_sampling: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" ' - -strTemplate98_else: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}"' - -strTemplate99_N: | +99_N: | , (SELECT PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25, PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50, PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75 FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile - -strTemplate99_N_sampling: | +99_N_sampling: | , (SELECT PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25, PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50, 
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75 FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile +99_else: ' ' -strTemplate99_else: ' ' - -strTemplate100_sampling: 'WHERE RAND() <= 1.0 / {PROFILE_SAMPLE_RATIO}' +100_sampling: 'WHERE RAND() <= 1.0 / {PROFILE_SAMPLE_RATIO}' diff --git a/testgen/template/flavors/redshift_spectrum/profiling/project_secondary_profiling_query_redshift_spectrum.sql b/testgen/template/flavors/redshift_spectrum/profiling/project_secondary_profiling_query.sql similarity index 100% rename from testgen/template/flavors/redshift_spectrum/profiling/project_secondary_profiling_query_redshift_spectrum.sql rename to testgen/template/flavors/redshift_spectrum/profiling/project_secondary_profiling_query.sql diff --git a/testgen/template/flavors/snowflake/data_chars/schema_ddf_query_snowflake.sql b/testgen/template/flavors/snowflake/data_chars/get_schema_ddf.sql similarity index 89% rename from testgen/template/flavors/snowflake/data_chars/schema_ddf_query_snowflake.sql rename to testgen/template/flavors/snowflake/data_chars/get_schema_ddf.sql index 49e6c1e..54940da 100644 --- a/testgen/template/flavors/snowflake/data_chars/schema_ddf_query_snowflake.sql +++ b/testgen/template/flavors/snowflake/data_chars/get_schema_ddf.sql @@ -1,6 +1,5 @@ -SELECT '{PROJECT_CODE}' as project_code, - CURRENT_TIMESTAMP as refresh_timestamp, - c.table_schema, +SELECT + c.table_schema AS schema_name, c.table_name, c.column_name, CASE @@ -25,7 +24,6 @@ SELECT '{PROJECT_CODE}' as project_code, THEN c.data_type || COALESCE('(' || CAST(c.datetime_precision AS VARCHAR) || ')', '') ELSE c.data_type END AS db_data_type, - c.character_maximum_length, c.ordinal_position, CASE WHEN c.data_type = 'TEXT' @@ -43,7 +41,9 @@ SELECT '{PROJECT_CODE}' as project_code, ELSE 'X' END AS general_type, - numeric_scale > 0 AS is_decimal + numeric_scale > 0 AS is_decimal, + t.row_count AS approx_record_ct FROM information_schema.columns c + LEFT JOIN information_schema.tables t ON c.table_schema = t.table_schema AND c.table_name = t.table_name WHERE c.table_schema = '{DATA_SCHEMA}' {TABLE_CRITERIA} ORDER BY c.table_schema, c.table_name, c.ordinal_position; diff --git a/testgen/template/flavors/snowflake/profiling/project_get_table_sample_count_snowflake.sql b/testgen/template/flavors/snowflake/profiling/project_get_table_sample_count_snowflake.sql deleted file mode 100644 index 9a62c3d..0000000 --- a/testgen/template/flavors/snowflake/profiling/project_get_table_sample_count_snowflake.sql +++ /dev/null @@ -1,23 +0,0 @@ -WITH stats - AS (SELECT COUNT(*)::FLOAT as record_ct, - ROUND(CAST({PROFILE_SAMPLE_PERCENT} as FLOAT) * CAST(COUNT(*) as FLOAT) / 100.0) as calc_sample_ct, - CAST({PROFILE_SAMPLE_MIN_COUNT} as FLOAT) as min_sample_ct, - CAST(999000 as FLOAT) as max_sample_ct - FROM {SAMPLING_TABLE} ) -SELECT '{SAMPLING_TABLE}' as schema_table, - CASE WHEN record_ct <= min_sample_ct THEN -1 - WHEN calc_sample_ct > max_sample_ct THEN max_sample_ct - WHEN calc_sample_ct > min_sample_ct THEN calc_sample_ct - ELSE {PROFILE_SAMPLE_MIN_COUNT} - END as sample_count, - CASE WHEN record_ct <= min_sample_ct THEN 1 - WHEN calc_sample_ct > max_sample_ct THEN record_ct / max_sample_ct - WHEN calc_sample_ct > min_sample_ct THEN record_ct / calc_sample_ct - ELSE record_ct / min_sample_ct - END as sample_ratio, - ROUND(CASE WHEN record_ct <= min_sample_ct THEN 100 - WHEN calc_sample_ct > max_sample_ct THEN 100.0 * max_sample_ct / record_ct - WHEN calc_sample_ct > min_sample_ct THEN 100.0 * calc_sample_ct / 
record_ct - ELSE 100.0 * min_sample_ct / record_ct - END, 4) as sample_percent_calc - FROM stats; diff --git a/testgen/template/flavors/snowflake/profiling/project_profiling_query_snowflake.yaml b/testgen/template/flavors/snowflake/profiling/project_profiling_query.yaml similarity index 79% rename from testgen/template/flavors/snowflake/profiling/project_profiling_query_snowflake.yaml rename to testgen/template/flavors/snowflake/profiling/project_profiling_query.yaml index a42e3e2..3788d28 100644 --- a/testgen/template/flavors/snowflake/profiling/project_profiling_query_snowflake.yaml +++ b/testgen/template/flavors/snowflake/profiling/project_profiling_query.yaml @@ -1,7 +1,7 @@ --- -strTemplate01_sampling: "SELECT " -strTemplate01_else: "SELECT " -strTemplate01_5: | +01_sampling: "SELECT " +01_else: "SELECT " +01_all: | {CONNECTION_ID} as connection_id, '{PROJECT_CODE}' as project_code, '{TABLE_GROUPS_ID}' as table_groups_id, @@ -14,26 +14,30 @@ strTemplate01_5: | '{DB_DATA_TYPE}' AS db_data_type, '{COL_GEN_TYPE}' AS general_type, COUNT(*) AS record_ct, -strTemplate02_X: | + +02_X: | COUNT("{COL_NAME}") AS value_ct, COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct, -strTemplate02_else: | +02_else: | COUNT("{COL_NAME}") AS value_ct, COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct, -strTemplate03_ADN: MIN(LEN("{COL_NAME}")) AS min_length, + +03_ADN: MIN(LEN("{COL_NAME}")) AS min_length, MAX(LEN("{COL_NAME}")) AS max_length, AVG(NULLIF(LEN("{COL_NAME}"), 0)::FLOAT) AS avg_length, -strTemplate03_else: NULL as min_length, +03_else: NULL as min_length, NULL as max_length, NULL as avg_length, -strTemplate04_A: COUNT(CASE + +04_A: COUNT(CASE WHEN REGEXP_LIKE(TRIM("{COL_NAME}"::VARCHAR), '^0(\.0*)?$') THEN 1 END) AS zero_value_ct, -strTemplate04_N: SUM( 1 - ABS(SIGN("{COL_NAME}")) )::BIGINT AS zero_value_ct, -strTemplate04_else: NULL as zero_value_ct, -strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) as distinct_std_value_ct, +04_N: SUM( 1 - ABS(SIGN("{COL_NAME}")) )::BIGINT AS zero_value_ct, +04_else: NULL as zero_value_ct, + +05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) as distinct_std_value_ct, COUNT(CASE WHEN "{COL_NAME}" = '' THEN 1 END) AS zero_length_ct, @@ -83,7 +87,7 @@ strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) a AND LEFT("{COL_NAME}", 3) NOT BETWEEN '734' AND '749' AND LEFT("{COL_NAME}", 3) <> '666' THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'SSN' END as std_pattern_match, -strTemplate05_else: NULL as distinct_std_value_ct, +05_else: NULL as distinct_std_value_ct, NULL as zero_length_ct, NULL as lead_space_ct, NULL as quoted_value_ct, @@ -98,7 +102,8 @@ strTemplate05_else: NULL as distinct_std_value_ct, NULL as numeric_ct, NULL as date_ct, NULL as std_pattern_match, -strTemplate06_A_patterns: ( SELECT LEFT(LISTAGG(pattern, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1000) AS concat_pats + +06_A: ( SELECT LEFT(LISTAGG(pattern, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1000) AS concat_pats FROM ( SELECT TOP 5 CAST(COUNT(*) AS VARCHAR(40)) || ' | ' || pattern AS pattern, COUNT(*) AS ct @@ -108,24 +113,13 @@ strTemplate06_A_patterns: ( SELECT LEFT(LISTAGG(pattern, ' | ') WITHIN GROUP (OR '[0-9]', 'N') AS pattern FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WHERE "{COL_NAME}" > ' ' AND (SELECT MAX(LEN("{COL_NAME}")) - FROM "{DATA_SCHEMA}"."{DATA_TABLE}") BETWEEN 3 and {PARM_MAX_PATTERN_LENGTH}) p + FROM 
"{DATA_SCHEMA}"."{DATA_TABLE}") BETWEEN 3 and {MAX_PATTERN_LENGTH}) p GROUP BY pattern HAVING pattern > ' ' ORDER BY COUNT(*) DESC) as ps) AS top_patterns, -strTemplate06_else: NULL as top_patterns, -strTemplate07_A_freq: ( SELECT LEFT(LISTAGG(val, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1000) as concat_vals - FROM ( - SELECT TOP 10 CAST(COUNT(*) as VARCHAR(10)) || ' | ' || "{COL_NAME}" as val, - COUNT(*) as ct - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" - WHERE "{COL_NAME}" > ' ' - GROUP BY "{COL_NAME}" - HAVING "{COL_NAME}" > ' ' - ORDER BY COUNT(*), "{COL_NAME}" DESC - ) ps - ) AS top_freq_values, -strTemplate07_else: NULL as top_freq_values, -strTemplate08_N: MIN("{COL_NAME}") AS min_value, +06_else: NULL as top_patterns, + +08_N: MIN("{COL_NAME}") AS min_value, MIN(CASE WHEN "{COL_NAME}" > 0 THEN "{COL_NAME}" ELSE NULL END) AS min_value_over_0, MAX("{COL_NAME}") AS max_value, AVG(CAST("{COL_NAME}" AS FLOAT)) AS avg_value, @@ -133,7 +127,7 @@ strTemplate08_N: MIN("{COL_NAME}") AS min_value, MIN(pct_25) as percentile_25, MIN(pct_50) as percentile_50, MIN(pct_75) as percentile_75, -strTemplate08_else: NULL as min_value, +08_else: NULL as min_value, NULL as min_value_over_0, NULL as max_value, NULL as avg_value, @@ -141,11 +135,11 @@ strTemplate08_else: NULL as min_value, NULL as percentile_25, NULL as percentile_50, NULL as percentile_75, -strTemplate10_N_dec: SUM(ROUND(ABS(MOD("{COL_NAME}", 1)), 5)) as fractional_sum, -strTemplate10_else: NULL as fractional_sum, +10_N_dec: SUM(ROUND(ABS(MOD("{COL_NAME}", 1)), 5)) as fractional_sum, +10_else: NULL as fractional_sum, -strTemplate11_D: GREATEST(MIN("{COL_NAME}"), '0001-01-01') as min_date, +11_D: GREATEST(MIN("{COL_NAME}"), '0001-01-01') as min_date, MAX("{COL_NAME}") as max_date, COUNT( CASE WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 12 THEN 1 END) AS before_1yr_date_ct, COUNT( CASE WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 60 THEN 1 END) AS before_5yr_date_ct, @@ -158,8 +152,7 @@ strTemplate11_D: GREATEST(MIN("{COL_NAME}"), '0001-01-01') as min_date, COUNT(DISTINCT DATEDIFF(day, "{COL_NAME}", '{RUN_DATE}' ) ) as date_days_present, COUNT(DISTINCT DATEDIFF(week, "{COL_NAME}", '{RUN_DATE}' ) ) as date_weeks_present, COUNT(DISTINCT DATEDIFF(month, "{COL_NAME}", '{RUN_DATE}' ) ) as date_months_present, - -strTemplate11_else: NULL as min_date, +11_else: NULL as min_date, NULL as max_date, NULL as before_1yr_date_ct, NULL as before_5yr_date_ct, @@ -173,12 +166,10 @@ strTemplate11_else: NULL as min_date, NULL as date_weeks_present, NULL as date_months_present, -strTemplate12_B: SUM(CAST("{COL_NAME}" AS INTEGER)) AS boolean_true_ct, - -strTemplate12_else: NULL as boolean_true_ct, +12_B: SUM(CAST("{COL_NAME}" AS INTEGER)) AS boolean_true_ct, +12_else: NULL as boolean_true_ct, -strTemplate13_ALL: NULL AS datatype_suggestion, -strTemplate14_A_do_patterns: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPLACE( REGEXP_REPLACE( +14_A: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPLACE( REGEXP_REPLACE( "{COL_NAME}"::VARCHAR, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') @@ -187,40 +178,29 @@ strTemplate14_A_do_patterns: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REP WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct, SUM(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"::VARCHAR), ' '))::BIGINT) AS embedded_space_ct, AVG(REGEXP_COUNT(TRIM("{COL_NAME}"::VARCHAR), ' ')::FLOAT) AS avg_embedded_spaces, - -strTemplate14_A_no_patterns: NULL as distinct_pattern_ct, - SUM(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"::VARCHAR), ' '))::BIGINT) AS embedded_space_ct, - 
AVG(REGEXP_COUNT(TRIM("{COL_NAME}"::VARCHAR), ' ')::FLOAT) AS avg_embedded_spaces, - -strTemplate14_else: NULL as distinct_pattern_ct, +14_else: NULL as distinct_pattern_ct, NULL as embedded_space_ct, NULL as avg_embedded_spaces, -strTemplate15_ALL: NULL as functional_data_type, - NULL as functional_table_type, +16_all: " '{PROFILE_RUN_ID}' as profile_run_id " -strTemplate16_ALL: " '{PROFILE_RUN_ID}' as profile_run_id " +98_sampling: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" SAMPLE ({SAMPLE_SIZE} rows)' +98_else: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}"' -strTemplate98_sampling: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" SAMPLE ({SAMPLE_SIZE} rows)' - -strTemplate98_else: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}"' - -strTemplate99_N: | +99_N: | , (SELECT PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25, PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50, PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75 FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile - -strTemplate99_N_sampling: | +99_N_sampling: | , (SELECT PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25, PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50, PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75 FROM "{DATA_SCHEMA}"."{DATA_TABLE}" SAMPLE ({SAMPLE_SIZE} rows) LIMIT 1 ) pctile +99_else: ; -strTemplate99_else: ; - -strTemplate100_sampling: ' ' +100_sampling: ' ' diff --git a/testgen/template/flavors/snowflake/profiling/project_secondary_profiling_query_snowflake.sql b/testgen/template/flavors/snowflake/profiling/project_secondary_profiling_query.sql similarity index 100% rename from testgen/template/flavors/snowflake/profiling/project_secondary_profiling_query_snowflake.sql rename to testgen/template/flavors/snowflake/profiling/project_secondary_profiling_query.sql diff --git a/testgen/template/flavors/trino/profiling/project_get_table_sample_count_trino.sql b/testgen/template/flavors/trino/profiling/project_get_table_sample_count_trino.sql deleted file mode 100644 index 23f5a4b..0000000 --- a/testgen/template/flavors/trino/profiling/project_get_table_sample_count_trino.sql +++ /dev/null @@ -1,23 +0,0 @@ -WITH stats - AS (SELECT COUNT(*)::REAL as record_ct, - ROUND(CAST({PROFILE_SAMPLE_PERCENT} as REAL) * CAST(COUNT(*) as REAL) / 100.0) as calc_sample_ct, - CAST({PROFILE_SAMPLE_MIN_COUNT} as REAL) as min_sample_ct, - CAST(999000 as REAL) as max_sample_ct - FROM {SAMPLING_TABLE} ) -SELECT '{SAMPLING_TABLE}' as schema_table, - CASE WHEN record_ct <= min_sample_ct THEN -1 - WHEN calc_sample_ct > max_sample_ct THEN max_sample_ct - WHEN calc_sample_ct > min_sample_ct THEN calc_sample_ct - ELSE {PROFILE_SAMPLE_MIN_COUNT} - END as sample_count, - CASE WHEN record_ct <= min_sample_ct THEN 1 - WHEN calc_sample_ct > max_sample_ct THEN record_ct / max_sample_ct - WHEN calc_sample_ct > min_sample_ct THEN record_ct / calc_sample_ct - ELSE record_ct / min_sample_ct - END as sample_ratio, - ROUND(CASE WHEN record_ct <= min_sample_ct THEN 100 - WHEN calc_sample_ct > max_sample_ct THEN 100.0 * max_sample_ct / record_ct - WHEN calc_sample_ct > min_sample_ct THEN 100.0 * calc_sample_ct / record_ct - ELSE 100.0 * min_sample_ct / record_ct - END, 4) as sample_percent_calc - FROM stats; diff --git a/testgen/template/flavors/trino/profiling/project_profiling_query_trino.yaml b/testgen/template/flavors/trino/profiling/project_profiling_query.yaml similarity index 81% rename from 
testgen/template/flavors/trino/profiling/project_profiling_query_trino.yaml rename to testgen/template/flavors/trino/profiling/project_profiling_query.yaml index 313f79b..126a5cb 100644 --- a/testgen/template/flavors/trino/profiling/project_profiling_query_trino.yaml +++ b/testgen/template/flavors/trino/profiling/project_profiling_query.yaml @@ -1,7 +1,7 @@ --- -strTemplate01_sampling: "SELECT " -strTemplate01_else: "SELECT " -strTemplate01_5: | +01_sampling: "SELECT " +01_else: "SELECT " +01_all: | {CONNECTION_ID} as connection_id, '{PROJECT_CODE}' as project_code, '{TABLE_GROUPS_ID}' as table_groups_id, @@ -14,26 +14,29 @@ strTemplate01_5: | '{DB_DATA_TYPE}' AS db_data_type, '{COL_GEN_TYPE}' AS general_type, COUNT(*) AS record_ct, -strTemplate02_X: | + +02_X: | COUNT("{COL_NAME}") AS value_ct, COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct, -strTemplate02_else: | +02_else: | COUNT("{COL_NAME}") AS value_ct, COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct, -strTemplate03_ADN: MIN(LENGTH("{COL_NAME}")) AS min_length, + +03_ADN: MIN(LENGTH("{COL_NAME}")) AS min_length, MAX(LENGTH("{COL_NAME}")) AS max_length, AVG(CAST(NULLIF(LENGTH("{COL_NAME}"), 0) AS REAL)) AS avg_length, -strTemplate03_else: NULL as min_length, +03_else: NULL as min_length, NULL as max_length, NULL as avg_length, -strTemplate04_A: SUM(CASE +04_A: SUM(CASE WHEN REGEXP_LIKE(TRIM("{COL_NAME}") , '^0(\.0*)?$') = TRUE THEN 1 ELSE 0 END) AS zero_value_ct, -strTemplate04_N: CAST(SUM( 1 - ABS(SIGN("{COL_NAME}")) ) AS BIGINT) AS zero_value_ct, -strTemplate04_else: NULL as zero_value_ct, -strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) as distinct_std_value_ct, +04_N: CAST(SUM( 1 - ABS(SIGN("{COL_NAME}")) ) AS BIGINT) AS zero_value_ct, +04_else: NULL as zero_value_ct, + +05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) as distinct_std_value_ct, SUM(CASE WHEN "{COL_NAME}" = '' THEN 1 ELSE 0 @@ -97,7 +100,7 @@ strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) a AND SUBSTRING("{COL_NAME}", 1, 3) NOT BETWEEN '734' AND '749' AND SUBSTRING("{COL_NAME}", 1, 3) <> '666' THEN 1 END) AS REAL)/CAST(COUNT("{COL_NAME}") AS REAL) > 0.9 THEN 'SSN' END as std_pattern_match, -strTemplate05_else: NULL as distinct_std_value_ct, +05_else: NULL as distinct_std_value_ct, NULL as zero_length_ct, NULL as lead_space_ct, NULL as quoted_value_ct, @@ -112,7 +115,8 @@ strTemplate05_else: NULL as distinct_std_value_ct, NULL as numeric_ct, NULL as date_ct, NULL as std_pattern_match, -strTemplate06_A_patterns: (SELECT SUBSTRING(LISTAGG(pattern, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1, 1000) AS concat_pats + +06_A: (SELECT SUBSTRING(LISTAGG(pattern, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1, 1000) AS concat_pats FROM ( SELECT CAST(COUNT(*) AS VARCHAR(40)) || ' | ' || pattern AS pattern, COUNT(*) AS ct FROM ( SELECT REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( @@ -121,24 +125,13 @@ strTemplate06_A_patterns: (SELECT SUBSTRING(LISTAGG(pattern, ' | ') WITHIN GROUP '[0-9]', 'N') AS pattern FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WHERE "{COL_NAME}" > ' ' AND (SELECT MAX(LENGTH("{COL_NAME}")) - FROM "{DATA_SCHEMA}"."{DATA_TABLE}") BETWEEN 3 and {PARM_MAX_PATTERN_LENGTH}) p + FROM "{DATA_SCHEMA}"."{DATA_TABLE}") BETWEEN 3 and {MAX_PATTERN_LENGTH}) p GROUP BY pattern HAVING pattern > ' ' ORDER BY COUNT(*) DESC LIMIT 5) as ps) AS top_patterns, -strTemplate06_else: NULL as top_patterns, 
-strTemplate07_A_freq: ( SELECT SUBSTRING(LISTAGG(val, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1, 1000) as concat_vals - FROM ( - SELECT CAST(COUNT(*) as VARCHAR(10)) || ' | ' || "{COL_NAME}" as val, COUNT(*) as ct - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" - WHERE "{COL_NAME}" > ' ' - GROUP BY "{COL_NAME}" - HAVING "{COL_NAME}" > ' ' - ORDER BY COUNT(*), "{COL_NAME}" DESC - LIMIT 10 - ) ps - ) AS top_freq_values, -strTemplate07_else: NULL as top_freq_values, -strTemplate08_N: MIN("{COL_NAME}") AS min_value, +06_else: NULL as top_patterns, + +08_N: MIN("{COL_NAME}") AS min_value, MIN(CASE WHEN "{COL_NAME}" > 0 THEN "{COL_NAME}" ELSE NULL END) AS min_value_over_0, MAX("{COL_NAME}") AS max_value, AVG(CAST("{COL_NAME}" AS REAL)) AS avg_value, @@ -146,7 +139,7 @@ strTemplate08_N: MIN("{COL_NAME}") AS min_value, MIN(pct_25) as percentile_25, MIN(pct_50) as percentile_50, MIN(pct_75) as percentile_75, -strTemplate08_else: NULL as min_value, +08_else: NULL as min_value, NULL as min_value_over_0, NULL as max_value, NULL as avg_value, @@ -154,10 +147,11 @@ strTemplate08_else: NULL as min_value, NULL as percentile_25, NULL as percentile_50, NULL as percentile_75, -strTemplate10_N_dec: SUM(ROUND(ABS(MOD("{COL_NAME}", 1)), 5)) as fractional_sum, -strTemplate10_else: NULL as fractional_sum, -strTemplate11_D: CASE +10_N_dec: SUM(ROUND(ABS(MOD("{COL_NAME}", 1)), 5)) as fractional_sum, +10_else: NULL as fractional_sum, + +11_D: CASE WHEN MIN("{COL_NAME}") IS NULL THEN NULL ELSE GREATEST(MIN("{COL_NAME}"), '0001-01-01') END as min_date, @@ -196,8 +190,7 @@ strTemplate11_D: CASE COUNT(DISTINCT DATE_DIFF('day', TIMESTAMP "{COL_NAME}", TIMESTAMP '{RUN_DATE}' ) ) as date_days_present, COUNT(DISTINCT DATE_DIFF('week', TIMESTAMP "{COL_NAME}", TIMESTAMP '{RUN_DATE}' ) ) as date_weeks_present, COUNT(DISTINCT DATE_DIFF('month', TIMESTAMP "{COL_NAME}", TIMESTAMP '{RUN_DATE}' ) ) as date_months_present, - -strTemplate11_else: NULL as min_date, +11_else: NULL as min_date, NULL as max_date, NULL as before_1yr_date_ct, NULL as before_5yr_date_ct, @@ -211,12 +204,10 @@ strTemplate11_else: NULL as min_date, NULL as date_weeks_present, NULL as date_months_present, -strTemplate12_B: SUM(CAST("{COL_NAME}" AS INTEGER)) AS boolean_true_ct, - -strTemplate12_else: NULL as boolean_true_ct, +12_B: SUM(CAST("{COL_NAME}" AS INTEGER)) AS boolean_true_ct, +12_else: NULL as boolean_true_ct, -strTemplate13_ALL: NULL AS datatype_suggestion, -strTemplate14_A_do_patterns: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPLACE( REGEXP_REPLACE( +14_A: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPLACE( REGEXP_REPLACE( "{COL_NAME}", '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') @@ -225,38 +216,27 @@ strTemplate14_A_do_patterns: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPL WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct, SUM(CAST(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"), ' ')) AS BIGINT)) AS embedded_space_ct, AVG(CAST(REGEXP_COUNT(TRIM("{COL_NAME}"), ' ') AS REAL)) AS avg_embedded_spaces, - -strTemplate14_A_no_patterns: NULL as distinct_pattern_ct, - SUM(CAST(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"), ' ')) AS BIGINT)) AS embedded_space_ct, - AVG(CAST(REGEXP_COUNT(TRIM("{COL_NAME}"), ' ') AS REAL)) AS avg_embedded_spaces, - -strTemplate14_else: NULL as distinct_pattern_ct, +14_else: NULL as distinct_pattern_ct, NULL as embedded_space_ct, NULL as avg_embedded_spaces, -strTemplate15_ALL: NULL as functional_data_type, - NULL as functional_table_type, +16_all: " '{PROFILE_RUN_ID}' as profile_run_id" -strTemplate16_ALL: " '{PROFILE_RUN_ID}' as 
profile_run_id" +98_sampling: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE SYSTEM ({SAMPLE_PERCENT_CALC})' +98_else: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}"' -strTemplate98_sampling: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE SYSTEM ({SAMPLE_PERCENT_CALC})' - -strTemplate98_else: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}"' - -strTemplate99_N: | +99_N: | , (SELECT APPROX_PERCENTILE("{COL_NAME}", 0.25) AS pct_25, APPROX_PERCENTILE("{COL_NAME}", 0.50) AS pct_50, APPROX_PERCENTILE("{COL_NAME}", 0.75) AS pct_75 FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile - -strTemplate99_N_sampling: | +99_N_sampling: | , (SELECT APPROX_PERCENTILE("{COL_NAME}", 0.25) AS pct_25, APPROX_PERCENTILE("{COL_NAME}", 0.50) AS pct_50, APPROX_PERCENTILE("{COL_NAME}", 0.75) AS pct_75 FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE SYSTEM ({SAMPLE_PERCENT_CALC}) ) pctile +99_else: ' ' -strTemplate99_else: ' ' - -strTemplate100_sampling: ' ' +100_sampling: ' ' diff --git a/testgen/template/parms/parms_profiling.sql b/testgen/template/parms/parms_profiling.sql deleted file mode 100644 index 7b98d41..0000000 --- a/testgen/template/parms/parms_profiling.sql +++ /dev/null @@ -1,28 +0,0 @@ -SELECT tg.project_code, - tg.id::VARCHAR(50) as table_groups_id, - tg.table_group_schema, - tg.table_group_schema, - CASE - WHEN tg.profiling_table_set ILIKE '''%''' THEN tg.profiling_table_set - ELSE fn_format_csv_quotes(tg.profiling_table_set) - END as profiling_table_set, - tg.profiling_include_mask, - tg.profiling_exclude_mask, - tg.profile_id_column_mask, - tg.profile_sk_column_mask, - tg.profile_use_sampling, - tg.profile_flag_cdes, - tg.profile_sample_percent, - tg.profile_sample_min_count, - tg.profile_do_pair_rules, - tg.profile_pair_rule_pct, - CASE - WHEN tg.monitor_test_suite_id IS NULL THEN NULL - ELSE tg.monitor_test_suite_id::VARCHAR(50) - END as monitor_test_suite_id, - CASE - WHEN tg.last_complete_profile_run_id is NULL THEN NULL - ELSE tg.last_complete_profile_run_id::VARCHAR(50) - END as last_complete_profile_run_id - FROM table_groups tg - WHERE tg.id = :TABLE_GROUP_ID; diff --git a/testgen/template/profiling/functional_datatype.sql b/testgen/template/profiling/functional_datatype.sql index af610db..e3c6659 100644 --- a/testgen/template/profiling/functional_datatype.sql +++ b/testgen/template/profiling/functional_datatype.sql @@ -590,4 +590,30 @@ WHERE profile_run_id = :PROFILE_RUN_ID AND (TRIM(SPLIT_PART(top_patterns, '|', 4)) ~ '^N{1,3}(\.N+)?%$' OR distinct_pattern_ct < 2) AND (TRIM(SPLIT_PART(top_patterns, '|', 6)) ~ '^N{1,3}(\.N+)?%$' OR distinct_pattern_ct < 3); +--- Update column characteristics --- + +WITH new_chars AS ( + SELECT table_groups_id, + schema_name, + table_name, + column_name, + general_type, + functional_data_type + FROM profile_results + WHERE table_groups_id = :TABLE_GROUPS_ID + AND profile_run_id = :PROFILE_RUN_ID +) +UPDATE data_column_chars +SET general_type = n.general_type, + functional_data_type = COALESCE(n.functional_data_type, d.functional_data_type) +FROM new_chars n + INNER JOIN data_column_chars d ON ( + n.table_groups_id = d.table_groups_id + AND n.schema_name = d.schema_name + AND n.table_name = d.table_name + AND n.column_name = d.column_name + ) +WHERE data_column_chars.table_id = d.table_id + AND data_column_chars.column_name = d.column_name; + --- END OF QUERY --- diff --git a/testgen/template/profiling/functional_tabletype_update.sql b/testgen/template/profiling/functional_tabletype_update.sql index 3ae8c59..407bffb 100644 --- 
a/testgen/template/profiling/functional_tabletype_update.sql +++ b/testgen/template/profiling/functional_tabletype_update.sql @@ -6,3 +6,27 @@ WHERE s.project_code = profile_results.project_code AND s.table_name = profile_results.table_name AND s.run_date = profile_results.run_date AND s.run_date = :RUN_DATE; + +--- Update table characteristics --- + +WITH new_chars AS ( + SELECT table_groups_id, + schema_name, + table_name, + functional_table_type + FROM profile_results + WHERE table_groups_id = :TABLE_GROUPS_ID + GROUP BY table_groups_id, + schema_name, + table_name, + functional_table_type +) +UPDATE data_table_chars +SET functional_table_type = COALESCE(n.functional_table_type, d.functional_table_type) +FROM new_chars n + INNER JOIN data_table_chars d ON ( + n.table_groups_id = d.table_groups_id + AND n.schema_name = d.schema_name + AND n.table_name = d.table_name + ) +WHERE data_table_chars.table_id = d.table_id; diff --git a/testgen/template/profiling/project_profile_run_record_insert.sql b/testgen/template/profiling/project_profile_run_record_insert.sql deleted file mode 100644 index e1c379f..0000000 --- a/testgen/template/profiling/project_profile_run_record_insert.sql +++ /dev/null @@ -1,8 +0,0 @@ -INSERT INTO profiling_runs (id, project_code, connection_id, table_groups_id, profiling_starttime, process_id) -(SELECT :PROFILE_RUN_ID as id, - :PROJECT_CODE as project_code, - :CONNECTION_ID as connection_id, - :TABLE_GROUPS_ID as table_groups_id, - :RUN_DATE as profiling_starttime, - :PROCESS_ID as process_id - ); diff --git a/testgen/template/profiling/project_profile_run_record_update.sql b/testgen/template/profiling/project_profile_run_record_update.sql deleted file mode 100644 index e6c7b0d..0000000 --- a/testgen/template/profiling/project_profile_run_record_update.sql +++ /dev/null @@ -1,5 +0,0 @@ -UPDATE profiling_runs -SET status = CASE WHEN length(:EXCEPTION_MESSAGE) = 0 then 'Complete' else 'Error' end, - profiling_endtime = :NOW_TIMESTAMP, - log_message = :EXCEPTION_MESSAGE -where id = :PROFILE_RUN_ID; diff --git a/testgen/template/profiling/project_update_profile_results_to_estimates.sql b/testgen/template/profiling/project_update_profile_results_to_estimates.sql index e5a8741..7e4d353 100644 --- a/testgen/template/profiling/project_update_profile_results_to_estimates.sql +++ b/testgen/template/profiling/project_update_profile_results_to_estimates.sql @@ -24,8 +24,8 @@ set sample_ratio = :PROFILE_SAMPLE_RATIO, future_date_ct = ROUND(future_date_ct * :PROFILE_SAMPLE_RATIO, 0), boolean_true_ct = ROUND(boolean_true_ct * :PROFILE_SAMPLE_RATIO, 0) where profile_run_id = :PROFILE_RUN_ID -and schema_name = TRIM(SPLIT_PART(:SAMPLING_TABLE, '.', 1), :QUOTE) -and table_name = TRIM(SPLIT_PART(:SAMPLING_TABLE, '.', 2), :QUOTE) +and schema_name = :DATA_SCHEMA +and table_name = :SAMPLING_TABLE and sample_ratio IS NULL; diff --git a/testgen/template/profiling/refresh_anomalies.sql b/testgen/template/profiling/refresh_anomalies.sql index 9159fbf..b97f9ce 100644 --- a/testgen/template/profiling/refresh_anomalies.sql +++ b/testgen/template/profiling/refresh_anomalies.sql @@ -1,32 +1,15 @@ -WITH anomalies +WITH stats AS ( SELECT profile_run_id, COUNT(*) as anomaly_ct, COUNT(DISTINCT schema_name || '.' || table_name) as anomaly_table_ct, COUNT(DISTINCT schema_name || '.' || table_name || '.' 
|| column_name) as anomaly_column_ct FROM profile_anomaly_results WHERE profile_run_id = :PROFILE_RUN_ID - GROUP BY profile_run_id ), -profiles - AS ( SELECT r.id as profile_run_id, - COUNT(DISTINCT p.schema_name || '.' || p.table_name) as table_ct, - COUNT(*) as column_ct - FROM profiling_runs r - INNER JOIN profile_results p - ON r.id = p.profile_run_id - WHERE r.id = :PROFILE_RUN_ID - GROUP BY r.id ), -stats - AS ( SELECT p.profile_run_id, table_ct, column_ct, - a.anomaly_ct, a.anomaly_table_ct, a.anomaly_column_ct - FROM profiles p - LEFT JOIN anomalies a - ON (p.profile_run_id = a.profile_run_id) ) + GROUP BY profile_run_id ) UPDATE profiling_runs - SET table_ct = stats.table_ct, - column_ct = stats.column_ct, - anomaly_ct = COALESCE(stats.anomaly_ct, 0), + SET anomaly_ct = COALESCE(stats.anomaly_ct, 0), anomaly_table_ct = COALESCE(stats.anomaly_table_ct, 0), anomaly_column_ct = COALESCE(stats.anomaly_column_ct, 0) FROM stats diff --git a/testgen/ui/components/frontend/css/shared.css b/testgen/ui/components/frontend/css/shared.css index 335858a..2de2aba 100644 --- a/testgen/ui/components/frontend/css/shared.css +++ b/testgen/ui/components/frontend/css/shared.css @@ -157,7 +157,7 @@ body { } .table-row { - padding: 12px 0; + padding: 8px 0; } .table.hoverable .table-row:hover { @@ -168,10 +168,6 @@ body { border-bottom: var(--button-stroked-border); } -.table-row:last-child { - padding-bottom: 0; -} - .table-header { border-bottom: var(--button-stroked-border); padding: 0 0 8px 0; @@ -216,6 +212,12 @@ body { .text-capitalize { text-transform: capitalize; } + +.text-code { + font-family:'Courier New', Courier, monospace; + line-height: 1.5; + white-space: pre; +} /* */ /* Flex utilities */ @@ -638,6 +640,10 @@ code > .tg-icon:hover { border-radius: 4px; } +.border-radius-2 { + border-radius: 8px; +} + input::-ms-reveal, input::-ms-clear { display: none; diff --git a/testgen/ui/components/frontend/js/components/score_issues.js b/testgen/ui/components/frontend/js/components/score_issues.js index 4612770..659f802 100644 --- a/testgen/ui/components/frontend/js/components/score_issues.js +++ b/testgen/ui/components/frontend/js/components/score_issues.js @@ -76,7 +76,7 @@ const IssuesTable = ( const selectedIssues = van.state([]); return div( - { class: 'table', 'data-testid': 'score-issues' }, + { class: 'table pb-0', 'data-testid': 'score-issues' }, div( { class: 'flex-row fx-justify-space-between fx-align-flex-start'}, div( diff --git a/testgen/ui/components/frontend/js/components/select.js b/testgen/ui/components/frontend/js/components/select.js index 72bb11c..e735042 100644 --- a/testgen/ui/components/frontend/js/components/select.js +++ b/testgen/ui/components/frontend/js/components/select.js @@ -3,7 +3,7 @@ * @type {object} * @property {string} label * @property {string} value - * @property {boolean} selected + * @property {boolean?} selected * @property {string?} icon * * @typedef Properties diff --git a/testgen/ui/components/frontend/js/components/table_group_stats.js b/testgen/ui/components/frontend/js/components/table_group_stats.js new file mode 100644 index 0000000..361118c --- /dev/null +++ b/testgen/ui/components/frontend/js/components/table_group_stats.js @@ -0,0 +1,130 @@ +/** + * @typedef TableGroupStats + * @type {object} + * @property {string} id + * @property {string} table_groups_name + * @property {string} table_group_schema + * @property {number} table_ct + * @property {number} column_ct + * @property {number} approx_record_ct + * @property {number?} record_ct + * 
@property {number} approx_data_point_ct + * @property {number?} data_point_ct + * + * @typedef Properties + * @type {object} + * @property {boolean?} hideApproxCaption + * @property {boolean?} hideWarning + * @property {string?} class + */ +import van from '../van.min.js'; +import { formatNumber } from '../display_utils.js'; +import { Alert } from '../components/alert.js'; + +const { div, span, strong } = van.tags; +const profilingWarningText = 'Profiling on large datasets could be time-consuming or resource-intensive, depending on your database configuration.'; + +/** + * @param {Properties} props + * @param {TableGroupStats} stats + * @returns {HTMLElement} + */ +const TableGroupStats = (props, stats) => { + const useApprox = stats.record_ct === null || stats.record_ct === undefined; + const rowCount = useApprox ? stats.approx_record_ct : stats.record_ct; + const dataPointCount = useApprox ? stats.approx_data_point_ct : stats.data_point_ct; + const warning = !props.hideWarning ? WarningText(rowCount, dataPointCount) : null; + + return div( + { class: `flex-column fx-gap-1 p-3 border border-radius-2 ${props.class ?? ''}` }, + span( + span({ class: 'text-secondary' }, 'Schema: '), + stats.table_group_schema, + ), + div( + { class: 'flex-row' }, + div( + { class: 'flex-column fx-gap-1', style: 'flex: 1 1 50%;' }, + span( + span({ class: 'text-secondary' }, 'Tables: '), + formatNumber(stats.table_ct), + ), + span( + span({ class: 'text-secondary' }, 'Columns: '), + formatNumber(stats.column_ct), + ), + ), + div( + { class: 'flex-column fx-gap-1', style: 'flex: 1 1 50%;' }, + span( + span({ class: 'text-secondary' }, 'Rows: '), + formatNumber(rowCount), + useApprox ? ' *' : '', + ), + span( + span({ class: 'text-secondary' }, 'Data points: '), + formatNumber(dataPointCount), + useApprox ? ' *' : '', + ), + ), + ), + useApprox && !props.hideApproxCaption + ? span( + { class: 'text-caption text-right mt-1' }, + '* Approximate counts based on server statistics', + ) + : null, + warning + ? Alert({ type: 'warn', icon: 'warning', class: 'mt-2' }, warning) + : null, + ); +}; + +/** + * @param {number | null} rowCount + * @param {number | null} dataPointCount + * @returns {HTMLElement | null} + */ +const WarningText = (rowCount, dataPointCount) => { + if (rowCount === null) { // Unknown counts + return div(`WARNING: ${profilingWarningText}`); + } + + const rowTier = getStatTier(rowCount); + const dataPointTier = getStatTier(dataPointCount); + + if (rowTier || dataPointTier) { + let category; + if (rowTier && dataPointTier) { + category = rowTier === dataPointTier + ? [ strong(rowTier), ' of rows and data points' ] + : [ strong(rowTier), ' of rows and ', strong(dataPointTier), ' of data points' ]; + } else { + category = rowTier + ? 
[ strong(rowTier), ' of rows' ] + : [ strong(dataPointTier), ' of data points' ]; + } + return div( + div('WARNING: The table group has ', ...category, '.'), + div({ class: 'mt-2' }, profilingWarningText), + ); + } + return null; +} + +/** + * @param {number | null} count + * @returns {string | null} + */ +function getStatTier(/** @type number */ count) { + if (count > 1000000000) { + return 'billions'; + } else if (count > 1000000) { + return 'millions'; + } else if (count > 100000) { + return 'hundreds of thousands'; + } + return null; +}; + +export { TableGroupStats }; diff --git a/testgen/ui/components/frontend/js/components/table_group_test.js b/testgen/ui/components/frontend/js/components/table_group_test.js index bb226a4..ff987f0 100644 --- a/testgen/ui/components/frontend/js/components/table_group_test.js +++ b/testgen/ui/components/frontend/js/components/table_group_test.js @@ -1,9 +1,17 @@ /** + * @import { TableGroupStats } from './table_group_stats.js' + * + * @typedef TablePreview + * @type {object} + * @property {number} column_ct + * @property {number} approx_record_ct + * @property {number} approx_data_point_ct + * @property {boolean} can_access + * * @typedef TableGroupPreview * @type {object} - * @property {string} schema - * @property {Record?} tables - * @property {number?} column_count + * @property {TableGroupStats} stats + * @property {Record?} tables * @property {boolean?} success * @property {string?} message * @@ -12,43 +20,26 @@ * @property {(() => void)?} onVerifyAcess */ import van from '../van.min.js'; -import { emitEvent, getValue } from '../utils.js'; +import { getValue } from '../utils.js'; +import { formatNumber } from '../display_utils.js'; import { Alert } from '../components/alert.js'; import { Icon } from '../components/icon.js'; import { Button } from '../components/button.js'; +import { TableGroupStats } from './table_group_stats.js'; -const { div, span, strong } = van.tags; +const { div, span } = van.tags; /** - * - * @param {string} schema * @param {TableGroupPreview?} preview * @param {ComponentOptions} options * @returns {HTMLElement} */ -const TableGroupTest = (schema, preview, options) => { +const TableGroupTest = (preview, options) => { return div( { class: 'flex-column fx-gap-2' }, div( - { class: 'flex-row fx-justify-space-between' }, - div( - { class: 'flex-column fx-gap-2' }, - div( - { class: 'flex-row fx-gap-1' }, - strong({}, 'Schema:'), - span({}, schema), - ), - div( - { class: 'flex-row fx-gap-1' }, - strong({}, 'Table Count:'), - () => span({}, Object.keys(getValue(preview)?.tables ?? {})?.length ?? '--'), - ), - div( - { class: 'flex-row fx-gap-1' }, - strong({}, 'Column Count:'), - () => span({}, getValue(preview)?.column_count ?? '--'), - ), - ), + { class: 'flex-row fx-justify-space-between fx-align-flex-end' }, + span({ class: 'text-caption text-right' }, '* Approximate row counts based on server statistics'), options.onVerifyAcess ? div( { class: 'flex-row' }, @@ -62,6 +53,9 @@ const TableGroupTest = (schema, preview, options) => { ) : '', ), + () => getValue(preview) + ? TableGroupStats({ hideWarning: true, hideApproxCaption: true }, getValue(preview).stats) + : '', () => { const tableGroupPreview = getValue(preview); const wasPreviewExecuted = tableGroupPreview && typeof tableGroupPreview.success === 'boolean'; @@ -72,33 +66,44 @@ const TableGroupTest = (schema, preview, options) => { const tables = tableGroupPreview?.tables ?? 
{}; const hasTables = Object.keys(tables).length > 0; - const verifiedAccess = Object.values(tables).some(v => v != null); - const tableAccessWarning = Object.values(tables).some(v => v != null && v === false) + const verifiedAccess = Object.values(tables).some(({ can_access }) => can_access != null); + const tableAccessWarning = Object.values(tables).some(({ can_access }) => can_access != null && can_access === false) ? tableGroupPreview.message : ''; + const columns = ['50%', '14%', '14%', '14%', '8%']; + return div( {class: 'flex-column fx-gap-2'}, div( - { class: 'table hoverable p-3' }, + { class: 'table hoverable p-3 pb-0' }, div( - { class: 'table-header flex-row fx-justify-space-between' }, - span('Tables'), + { class: 'table-header flex-row' }, + span({ style: `flex: 1 1 ${columns[0]}; max-width: ${columns[0]};` }, 'Tables'), + span({ style: `flex: 1 1 ${columns[1]};` }, 'Columns'), + span({ style: `flex: 1 1 ${columns[2]};` }, 'Rows *'), + span({ style: `flex: 1 1 ${columns[3]};` }, 'Data Points *'), verifiedAccess - ? span({class: 'flex-row fx-justify-center', style: 'width: 100px;'}, 'Has access?') + ? span({class: 'flex-row fx-justify-center', style: `flex: 1 1 ${columns[4]};`}, 'Can access?') : '', ), div( - { class: 'flex-column', style: 'max-height: 200px; overflow-y: auto;' }, + { class: 'flex-column', style: 'max-height: 400px; overflow-y: auto;' }, hasTables - ? Object.entries(tables).map(([tableName, hasAccess]) => + ? Object.entries(tables).map(([ tableName, table ]) => div( { class: 'table-row flex-row fx-justify-space-between' }, - span(tableName), - hasAccess != null + span( + { style: `flex: 1 1 ${columns[0]}; max-width: ${columns[0]}; word-wrap: break-word;` }, + tableName, + ), + span({ style: `flex: 1 1 ${columns[1]};` }, formatNumber(table.column_ct)), + span({ style: `flex: 1 1 ${columns[2]};` }, formatNumber(table.approx_record_ct)), + span({ style: `flex: 1 1 ${columns[3]};` }, formatNumber(table.approx_data_point_ct)), + table.can_access != null ? span( - {class: 'flex-row fx-justify-center', style: 'width: 100px;'}, - hasAccess + {class: 'flex-row fx-justify-center', style: `flex: 1 1 ${columns[4]};`}, + table.can_access ? 
Icon({classes: 'text-green', size: 20}, 'check_circle') : Icon({classes: 'text-error', size: 20}, 'dangerous'), ) diff --git a/testgen/ui/components/frontend/js/data_profiling/column_distribution.js b/testgen/ui/components/frontend/js/data_profiling/column_distribution.js index 4d51f65..8568909 100644 --- a/testgen/ui/components/frontend/js/data_profiling/column_distribution.js +++ b/testgen/ui/components/frontend/js/data_profiling/column_distribution.js @@ -11,6 +11,7 @@ import van from '../van.min.js'; import { Card } from '../components/card.js'; import { Attribute } from '../components/attribute.js'; import { Button } from '../components/button.js'; +import { Alert } from '../components/alert.js'; import { SummaryBar } from '../components/summary_bar.js'; import { PercentBar } from '../components/percent_bar.js'; import { FrequencyBars } from '../components/frequency_bars.js'; @@ -24,6 +25,7 @@ const columnTypeFunctionMap = { B: BooleanColumn, D: DatetimeColumn, N: NumericColumn, + X: UnknownColumn, }; const attributeWidth = 250; const percentWidth = 250; @@ -33,16 +35,13 @@ const boxPlotWidth = 800; const ColumnDistributionCard = (/** @type Properties */ props, /** @type Column */ item) => { loadStylesheet('column-distribution', stylesheet); - const columnFunction = columnTypeFunctionMap[item.general_type]; + const displayType = item.profile_run_id && item.record_ct !== 0 ? item.general_type : 'X' + const columnFunction = columnTypeFunctionMap[displayType]; return Card({ border: props.border, title: `Value Distribution ${item.is_latest_profile ? '*' : ''}`, - content: item.profile_run_id - ? (item.record_ct === 0 - ? BaseCounts(item) - : columnFunction?.(item)) - : null, + content: columnFunction?.(item), actionContent: div( { class: 'flex-row fx-gap-3' }, item.profile_run_id @@ -68,13 +67,13 @@ const ColumnDistributionCard = (/** @type Properties */ props, /** @type Column ]) : span( { class: 'text-secondary' }, - 'No profiling data available', + 'No profiling results for column', ), ), }) }; -function AlphaColumn(/** @type ColumnProfile */ item) { +function AlphaColumn(/** @type Column */ item) { const standardPatternLabels = { STREET_ADDR: 'Street Address', STATE_USA: 'State (USA)', @@ -210,7 +209,7 @@ function AlphaColumn(/** @type ColumnProfile */ item) { ); } -function BooleanColumn(/** @type ColumnProfile */ item) { +function BooleanColumn(/** @type Column */ item) { return div( { class: 'flex-column fx-gap-5' }, BaseCounts(item), @@ -227,7 +226,7 @@ function BooleanColumn(/** @type ColumnProfile */ item) { ); } -function DatetimeColumn(/** @type ColumnProfile */ item) { +function DatetimeColumn(/** @type Column */ item) { const total = item.record_ct; return div( @@ -265,7 +264,7 @@ function DatetimeColumn(/** @type ColumnProfile */ item) { ); } -function NumericColumn(/** @type ColumnProfile */ item) { +function NumericColumn(/** @type Column */ item) { return div( { class: 'flex-column fx-gap-5' }, BaseCounts(item), @@ -309,18 +308,43 @@ function NumericColumn(/** @type ColumnProfile */ item) { ); } -const BaseCounts = (/** @type ColumnProfile */ item) => { +function UnknownColumn(/** @type Column */ item) { + return div( + { class: 'flex-column fx-gap-3' }, + BaseCounts(item), + item.profiling_error + ? 
Alert( + { type: 'warn', icon: 'warning' }, + div({ style: 'font-size: 14px;' }, 'Profiling encountered an error for this column.'), + div({ class: 'text-primary text-code', style: 'font-size: 12px;' }, item.profiling_error), + ) + : null, + ); +} + +const BaseCounts = (/** @type Column */ item) => { + const useApprox = item.record_ct === null; const attributes = [ - { key: 'record_ct', label: 'Record Count' }, - { key: 'value_ct', label: 'Value Count' }, + { + label: `Row Count${useApprox ? ' †' : ''}`, + value: useApprox ? item.approx_record_ct : item.record_ct, + } ]; + if (item.value_ct !== null) { + attributes.push({ label: 'Value Count', value: item.value_ct }); + } return div( - { class: 'flex-row fx-gap-4' }, - attributes.map(({ key, label }) => Attribute({ - label: item[key] === 0 ? span({ class: 'text-error' }, label) : label, - value: formatNumber(item[key]), - width: attributeWidth, - })), + div( + { class: 'flex-row fx-gap-4' }, + attributes.map(({ label, value }) => Attribute({ + label: value === 0 ? span({ class: 'text-error' }, label) : label, + value: formatNumber(value), + width: attributeWidth, + })), + ), + useApprox + ? div({ class: 'text-caption text-right mt-1' }, '† Approximate count based on server statistics') + : null, ); }; diff --git a/testgen/ui/components/frontend/js/data_profiling/data_issues.js b/testgen/ui/components/frontend/js/data_profiling/data_issues.js index 4026531..261a228 100644 --- a/testgen/ui/components/frontend/js/data_profiling/data_issues.js +++ b/testgen/ui/components/frontend/js/data_profiling/data_issues.js @@ -41,6 +41,7 @@ const STATUS_COLORS = { }; const PotentialPIICard = (/** @type Properties */ props, /** @type Table | Column */ item) => { + const title = `Potential PII ${item.is_latest_profile ? '*' : ''}`; const attributes = [ { key: 'detail', width: 150, label: 'Type', @@ -66,12 +67,15 @@ const PotentialPIICard = (/** @type Properties */ props, /** @type Table | Colum href: 'profiling-runs:hygiene', params: { run_id: item.profile_run_id, issue_class: 'Potential PII' }, }; - const noneContent = item.profile_run_id ? 'No potential PII detected' : null; + const noneContent = item.profile_run_id && !item.profiling_error + ? 'No potential PII detected' + : span({ class: 'text-secondary' }, `No profiling results for ${item.type}`); - return IssuesCard(props, 'Potential PII *', potentialPII, attributes, linkProps, noneContent); + return IssuesCard(props, title, potentialPII, attributes, linkProps, noneContent); }; const HygieneIssuesCard = (/** @type Properties */ props, /** @type Table | Column */ item) => { + const title = `Hygiene Issues ${item.is_latest_profile ? '*' : ''}`; const attributes = [ { key: 'anomaly_name', width: 200, label: 'Issue' }, { @@ -99,9 +103,11 @@ const HygieneIssuesCard = (/** @type Properties */ props, /** @type Table | Colu column_name: item.column_name, }, }; - const noneContent = item.profile_run_id ? 'No hygiene issues detected' : null; + const noneContent = item.profile_run_id && !item.profiling_error + ? 
'No hygiene issues detected' + : span({ class: 'text-secondary' }, `No profiling results for ${item.type}`); - return IssuesCard(props, 'Hygiene Issues *', hygieneIssues, attributes, linkProps, noneContent); + return IssuesCard(props, title, hygieneIssues, attributes, linkProps, noneContent); }; const TestIssuesCard = (/** @type Properties */ props, /** @type Table | Column */ item) => { diff --git a/testgen/ui/components/frontend/js/data_profiling/data_profiling_utils.js b/testgen/ui/components/frontend/js/data_profiling/data_profiling_utils.js index e189670..6c4c958 100644 --- a/testgen/ui/components/frontend/js/data_profiling/data_profiling_utils.js +++ b/testgen/ui/components/frontend/js/data_profiling/data_profiling_utils.js @@ -75,12 +75,14 @@ * @property {string?} profile_run_id * @property {number?} profile_run_date * @property {boolean?} is_latest_profile + * @property {string?} profiling_error * @property {number?} has_test_runs * * Scores * @property {string?} dq_score * @property {string?} dq_score_profiling * @property {string?} dq_score_testing * * Value Counts + * @property {number?} approx_record_ct * @property {number} record_ct * @property {number} value_ct * @property {number} distinct_value_ct @@ -147,9 +149,9 @@ * @property {string} project_code * * Characteristics * @property {string} functional_table_type + * @property {number} approx_record_ct * @property {number} record_ct * @property {number} column_ct - * @property {number} data_point_ct * @property {number} add_date * @property {number} last_refresh_date * @property {number} drop_date @@ -231,10 +233,10 @@ const LatestProfilingTime = (/** @type Properties */ props, /** @type Table | Co }); if (!item.profile_run_id) { if (item.drop_date) { - text = 'No profiling results for table group'; + text = `No profiling results for ${item.type}`; link = null; } else { - text = 'No profiling results yet for table group.'; + text = `No profiling results yet for ${item.type}.`; link = Link({ href: 'table-groups', params: { project_code: item.project_code, connection_id: item.connection_id }, diff --git a/testgen/ui/components/frontend/js/data_profiling/table_size.js b/testgen/ui/components/frontend/js/data_profiling/table_size.js index 2573d9c..af0f43b 100644 --- a/testgen/ui/components/frontend/js/data_profiling/table_size.js +++ b/testgen/ui/components/frontend/js/data_profiling/table_size.js @@ -14,10 +14,15 @@ import { formatNumber, formatTimestamp } from '../display_utils.js'; const { div, span } = van.tags; const TableSizeCard = (/** @type Properties */ _props, /** @type Table */ item) => { + const useApprox = item.record_ct === null; + const rowCount = useApprox ? item.approx_record_ct : item.record_ct; const attributes = [ - { key: 'column_ct', label: 'Column Count' }, - { key: 'record_ct', label: 'Row Count' }, - { key: 'data_point_ct', label: 'Data Point Count' }, + { label: 'Column Count', value: item.column_ct }, + { label: `Row Count${useApprox ? ' †': ''}`, value: rowCount }, + { + label: `Data Point Count${useApprox ? ' †': ''}`, + value: rowCount !== null ? (item.column_ct * rowCount) : null, + } ]; return Card({ @@ -25,13 +30,16 @@ const TableSizeCard = (/** @type Properties */ _props, /** @type Table */ item) content: div( div( { class: 'flex-row fx-flex-wrap fx-gap-4' }, - attributes.map(({ key, label }) => Attribute({ - label: item[key] === 0 ? span({ class: 'text-error' }, label) : label, - value: formatNumber(item[key]), + attributes.map(({ label, value }) => Attribute({ + label: value === 0 ? 
span({ class: 'text-error' }, label) : label, + value: formatNumber(value), width: 250, })), ), - span({ class: 'text-caption flex-row fx-justify-content-flex-end mt-2' }, `** as of ${formatTimestamp(item.last_refresh_date)}`), + div({ class: 'text-caption text-right mt-1' }, `** as of ${formatTimestamp(item.last_refresh_date)}`), + useApprox + ? div({ class: 'text-caption text-right mt-1' }, '† Approximate counts based on server statistics') + : null, ), actionContent: Button({ type: 'stroked', diff --git a/testgen/ui/components/frontend/js/pages/profiling_runs.js b/testgen/ui/components/frontend/js/pages/profiling_runs.js index de98152..3258192 100644 --- a/testgen/ui/components/frontend/js/pages/profiling_runs.js +++ b/testgen/ui/components/frontend/js/pages/profiling_runs.js @@ -2,18 +2,29 @@ * @import { ProjectSummary } from '../types.js'; * @import { SelectOption } from '../components/select.js'; * + * + * @typedef ProgressStep + * @type {object} + * @property {'data_chars'|'col_profiling'|'freq_analysis'|'hygiene_issues'} key + * @property {'Pending'|'Running'|'Completed'|'Warning'} status + * @property {string} label + * @property {string} detail + * * @typedef ProfilingRun * @type {object} - * @property {string} profiling_run_id - * @property {number} start_time - * @property {number} end_time + * @property {string} id + * @property {number} profiling_starttime + * @property {number} profiling_endtime * @property {string} table_groups_name * @property {'Running'|'Complete'|'Error'|'Cancelled'} status + * @property {ProgressStep[]} progress * @property {string} log_message * @property {string} process_id - * @property {string} schema_name + * @property {string} table_group_schema * @property {number} column_ct * @property {number} table_ct + * @property {number} record_ct + * @property {number} data_point_ct * @property {number} anomaly_ct * @property {number} anomalies_definite_ct * @property {number} anomalies_likely_ct @@ -33,28 +44,37 @@ * @property {Permissions} permissions */ import van from '../van.min.js'; -import { Tooltip } from '../components/tooltip.js'; +import { withTooltip } from '../components/tooltip.js'; import { SummaryCounts } from '../components/summary_counts.js'; import { Link } from '../components/link.js'; import { Button } from '../components/button.js'; import { Streamlit } from '../streamlit.js'; import { emitEvent, getValue, loadStylesheet, resizeFrameHeightToElement, resizeFrameHeightOnDOMChange } from '../utils.js'; -import { formatTimestamp, formatDuration } from '../display_utils.js'; +import { formatTimestamp, formatDuration, formatNumber } from '../display_utils.js'; import { Checkbox } from '../components/checkbox.js'; import { Select } from '../components/select.js'; import { Paginator } from '../components/paginator.js'; import { EMPTY_STATE_MESSAGE, EmptyState } from '../components/empty_state.js'; +import { Icon } from '../components/icon.js'; const { div, i, span, strong } = van.tags; const PAGE_SIZE = 100; const SCROLL_CONTAINER = window.top.document.querySelector('.stMain'); +const REFRESH_INTERVAL = 15000 // 15 seconds + +const progressStatusIcons = { + Pending: { color: 'grey', icon: 'more_horiz', size: 22 }, + Running: { color: 'blue', icon: 'autoplay', size: 18 }, + Completed: { color: 'green', icon: 'check', size: 24 }, + Warning: { color: 'orange', icon: 'warning', size: 20 }, +}; const ProfilingRuns = (/** @type Properties */ props) => { loadStylesheet('profilingRuns', stylesheet); Streamlit.setFrameHeight(1); window.testgen.isPage 
= true; - const columns = ['5%', '15%', '15%', '20%', '35%', '10%']; + const columns = ['5%', '15%', '20%', '20%', '30%', '10%']; const userCanEdit = getValue(props.permissions)?.can_edit ?? false; const pageIndex = van.state(0); @@ -62,13 +82,24 @@ const ProfilingRuns = (/** @type Properties */ props) => { pageIndex.val = 0; return getValue(props.profiling_runs); }); - const paginatedRuns = van.derive(() => profilingRuns.val.slice(PAGE_SIZE * pageIndex.val, PAGE_SIZE * (pageIndex.val + 1))); + let refreshIntervalId = null; + + const paginatedRuns = van.derive(() => { + const paginated = profilingRuns.val.slice(PAGE_SIZE * pageIndex.val, PAGE_SIZE * (pageIndex.val + 1)); + const hasActiveRuns = paginated.some(({ status }) => status === 'Running'); + if (!refreshIntervalId && hasActiveRuns) { + refreshIntervalId = setInterval(() => emitEvent('RefreshData', {}), REFRESH_INTERVAL); + } else if (refreshIntervalId && !hasActiveRuns) { + clearInterval(refreshIntervalId); + } + return paginated; + }); const selectedRuns = {}; const initializeSelectedStates = (items) => { for (const profilingRun of items) { - if (selectedRuns[profilingRun.profiling_run_id] == undefined) { - selectedRuns[profilingRun.profiling_run_id] = van.state(false); + if (selectedRuns[profilingRun.id] == undefined) { + selectedRuns[profilingRun.id] = van.state(false); } } }; @@ -89,9 +120,9 @@ const ProfilingRuns = (/** @type Properties */ props) => { () => profilingRuns.val.length ? div( div( - { class: 'table' }, + { class: 'table pb-0' }, () => { - const selectedItems = profilingRuns.val.filter(i => selectedRuns[i.profiling_run_id]?.val ?? false); + const selectedItems = profilingRuns.val.filter(i => selectedRuns[i.id]?.val ?? false); const someRunSelected = selectedItems.length > 0; const tooltipText = !someRunSelected ? 'No runs selected' : undefined; @@ -111,7 +142,7 @@ const ProfilingRuns = (/** @type Properties */ props) => { tooltipPosition: 'bottom-left', disabled: !someRunSelected, width: 'auto', - onclick: () => emitEvent('RunsDeleted', { payload: selectedItems.map(i => i.profiling_run_id) }), + onclick: () => emitEvent('RunsDeleted', { payload: selectedItems.map(i => i.id) }), }), ); }, @@ -119,7 +150,7 @@ const ProfilingRuns = (/** @type Properties */ props) => { { class: 'table-header flex-row' }, () => { const items = profilingRuns.val; - const selectedItems = items.filter(i => selectedRuns[i.profiling_run_id]?.val ?? false); + const selectedItems = items.filter(i => selectedRuns[i.id]?.val ?? false); const allSelected = selectedItems.length === items.length; const partiallySelected = selectedItems.length > 0 && selectedItems.length < items.length; @@ -133,7 +164,7 @@ const ProfilingRuns = (/** @type Properties */ props) => { ? 
Checkbox({ checked: allSelected, indeterminate: partiallySelected, - onChange: (checked) => items.forEach(item => selectedRuns[item.profiling_run_id].val = checked), + onChange: (checked) => items.forEach(item => selectedRuns[item.id].val = checked), testId: 'select-all-profiling-run', }) : '', @@ -152,7 +183,7 @@ const ProfilingRuns = (/** @type Properties */ props) => { 'Schema', ), span( - { style: `flex: ${columns[4]}` }, + { style: `flex: ${columns[4]}`, class: 'tg-profiling-runs--issues' }, 'Hygiene Issues', ), span( @@ -161,7 +192,7 @@ const ProfilingRuns = (/** @type Properties */ props) => { ), ), div( - paginatedRuns.val.map(item => ProfilingRunItem(item, columns, selectedRuns[item.profiling_run_id], userCanEdit)), + paginatedRuns.val.map(item => ProfilingRunItem(item, columns, selectedRuns[item.id], userCanEdit)), ), ), Paginator({ @@ -242,6 +273,8 @@ const ProfilingRunItem = ( /** @type boolean */ selected, /** @type boolean */ userCanEdit, ) => { + const runningStep = item.progress?.find((item) => item.status === 'Running'); + return div( { class: 'table-row flex-row', 'data-testid': 'profiling-run-item' }, userCanEdit @@ -256,49 +289,79 @@ const ProfilingRunItem = ( : '', div( { style: `flex: ${columns[1]}` }, - div({ 'data-testid': 'profiling-run-item-starttime' }, formatTimestamp(item.start_time)), + div({ 'data-testid': 'profiling-run-item-starttime' }, formatTimestamp(item.profiling_starttime)), div( { class: 'text-caption mt-1', 'data-testid': 'profiling-run-item-tablegroup' }, item.table_groups_name, ), ), div( - { class: 'flex-row', style: `flex: ${columns[2]}` }, + { style: `flex: ${columns[2]}` }, div( + { class: 'flex-row' }, ProfilingRunStatus(item), - div( + item.status === 'Running' && item.process_id && userCanEdit ? Button({ + type: 'stroked', + label: 'Cancel', + style: 'width: 64px; height: 28px; color: var(--purple); margin-left: 12px;', + onclick: () => emitEvent('RunCanceled', { payload: item }), + }) : null, + ), + item.profiling_endtime + ? div( { class: 'text-caption mt-1', 'data-testid': 'profiling-run-item-duration' }, - formatDuration(item.start_time, item.end_time), + formatDuration(item.profiling_starttime, item.profiling_endtime), + ) + : div( + { class: 'text-caption mt-1' }, + runningStep + ? [ + div( + runningStep.label, + withTooltip( + Icon({ style: 'font-size: 18px; margin-left: 4px; vertical-align: middle;' }, 'info'), + { text: ProgressTooltip(item) }, + ), + ), + div(runningStep.detail), + ] + : '--', ), - ), - item.status === 'Running' && item.process_id && userCanEdit ? Button({ - type: 'stroked', - label: 'Cancel Run', - style: 'width: auto; height: 32px; color: var(--purple); margin-left: 16px;', - onclick: () => emitEvent('RunCanceled', { payload: item }), - }) : null, ), div( { style: `flex: ${columns[3]}` }, - div({ 'data-testid': 'profiling-run-item-schema' }, item.schema_name), + div({ 'data-testid': 'profiling-run-item-schema' }, item.table_group_schema), div( { class: 'text-caption mt-1 mb-1', style: item.status === 'Complete' && !item.column_ct ? 'color: var(--red);' : '', 'data-testid': 'profiling-run-item-counts', }, - item.status === 'Complete' ? `${item.table_ct || 0} tables, ${item.column_ct || 0} columns` : null, + item.column_ct !== null + ? div( + `${formatNumber(item.table_ct || 0)} tables, ${formatNumber(item.column_ct || 0)} columns`, + item.record_ct !== null ? 
+ withTooltip( + Icon({ style: 'font-size: 16px; margin-left: 4px; vertical-align: middle;' }, 'more' ), + { text: [ + div(`${formatNumber(item.record_ct || 0)} records`), + div(`${formatNumber(item.data_point_ct || 0)} data points`), + ] }, + ) + : null, + ) + : null, ), - item.column_ct ? Link({ + item.status !== 'Running' && item.column_ct ? Link({ label: 'View results', href: 'profiling-runs:results', - params: { 'run_id': item.profiling_run_id }, + params: { 'run_id': item.id }, underline: true, right_icon: 'chevron_right', }) : null, ), div( - { class: 'pr-3', style: `flex: ${columns[4]}` }, + { class: 'pr-3 tg-profiling-runs--issues', style: `flex: ${columns[4]}` }, item.anomaly_ct ? SummaryCounts({ items: [ { label: 'Definite', value: item.anomalies_definite_ct, color: 'red' }, @@ -310,7 +373,7 @@ const ProfilingRunItem = ( item.anomaly_ct ? Link({ label: `View ${item.anomaly_ct} issues`, href: 'profiling-runs:hygiene', - params: { 'run_id': item.profiling_run_id }, + params: { 'run_id': item.id }, underline: true, right_icon: 'chevron_right', style: 'margin-top: 4px;', @@ -326,7 +389,7 @@ const ProfilingRunItem = ( ); } -function ProfilingRunStatus(/** @type ProfilingRun */ item) { +const ProfilingRunStatus = (/** @type ProfilingRun */ item) => { const attributeMap = { Running: { label: 'Running', color: 'blue' }, Complete: { label: 'Completed', color: '' }, @@ -334,6 +397,7 @@ function ProfilingRunStatus(/** @type ProfilingRun */ item) { Cancelled: { label: 'Canceled', color: 'purple' }, }; const attributes = attributeMap[item.status] || { label: 'Unknown', color: 'grey' }; + const hasProgressError = item.progress?.some(({error}) => !!error); return span( { class: 'flex-row', @@ -341,21 +405,41 @@ function ProfilingRunStatus(/** @type ProfilingRun */ item) { 'data-testid': 'profiling-run-item-status' }, attributes.label, - () => { - const tooltipError = van.state(false); - return item.status === 'Error' && item.log_message ? i( - { - class: 'material-symbols-rounded text-secondary ml-1 profiling-runs--info', - style: 'position: relative; font-size: 16px;', - onmouseenter: () => tooltipError.val = true, - onmouseleave: () => tooltipError.val = false, - }, - 'info', - Tooltip({ text: item.log_message, show: tooltipError }), - ) : null; - }, + item.status === 'Complete' && hasProgressError + ? withTooltip( + Icon({ style: 'font-size: 18px; margin-left: 4px; vertical-align: middle; color: var(--orange);' }, 'warning' ), + { text: ProgressTooltip(item) }, + ) + : null, + item.status === 'Error' && item.log_message + ? withTooltip( + Icon({ style: 'font-size: 18px; margin-left: 4px;' }, 'info'), + { text: item.log_message, width: 250, style: 'word-break: break-word;' }, + ) + : null, ); -} +}; + +const ProgressTooltip = (/** @type ProfilingRun */ item) => { + return div( + { class: 'flex-column fx-gap-1' }, + item.progress?.map(step => { + const stepIcon = progressStatusIcons[step.status]; + return div( + { class: 'flex-row fx-gap-1' }, + Icon( + { style: `font-size: ${stepIcon.size}px; color: var(--${stepIcon.color}); min-width: 24px;` }, + stepIcon.icon, + ), + div( + { class: 'flex-column fx-align-flex-start text-left' }, + span(`${step.label}${step.detail ? 
(': ' + step.detail) : ''}`), + span({ style: 'font-size: 12px; opacity: 0.6; margin-top: 2px;' }, step.error), + ), + ); + }), + ); +}; const ConditionalEmptyState = ( /** @type ProjectSummary */ projectSummary, @@ -409,6 +493,10 @@ stylesheet.replace(` .tg-profiling-runs { min-height: 550px; } + +.tg-profiling-runs--issues { + min-width: 310px; +} `); export { ProfilingRuns }; diff --git a/testgen/ui/components/frontend/js/pages/project_dashboard.js b/testgen/ui/components/frontend/js/pages/project_dashboard.js index d3f5b2e..1eeee31 100644 --- a/testgen/ui/components/frontend/js/pages/project_dashboard.js +++ b/testgen/ui/components/frontend/js/pages/project_dashboard.js @@ -6,14 +6,17 @@ * @type {object} * @property {string} id * @property {string} table_groups_name + * @property {number} table_ct + * @property {number} column_ct + * @property {number} approx_record_ct + * @property {number} record_ct + * @property {number} approx_data_point_ct + * @property {number} data_point_ct * @property {string?} dq_score * @property {string?} dq_score_profiling * @property {string?} dq_score_testing * @property {string?} latest_profile_id * @property {number?} latest_profile_start - * @property {number} latest_profile_table_ct - * @property {number} latest_profile_column_ct - * @property {number} latest_profile_data_point_ct * @property {number} latest_anomalies_ct * @property {number} latest_anomalies_definite_ct * @property {number} latest_anomalies_likely_ct @@ -124,6 +127,7 @@ const ProjectDashboard = (/** @type Properties */ props) => { } const TableGroupCard = (/** @type TableGroupSummary */ tableGroup) => { + const useApprox = tableGroup.record_ct === null || tableGroup.record_ct === undefined; return Card({ testId: 'table-group-summary-card', border: true, @@ -139,9 +143,12 @@ const TableGroupCard = (/** @type TableGroupSummary */ tableGroup) => { ), span( { class: 'text-caption mt-1 mb-3 tg-overview--subtitle' }, - `${formatNumber(tableGroup.latest_profile_table_ct ?? 0)} tables | - ${formatNumber(tableGroup.latest_profile_column_ct ?? 0)} columns | - ${formatNumber(tableGroup.latest_profile_data_point_ct ?? 0)} data points`, + `${formatNumber(tableGroup.table_ct ?? 0)} tables | + ${formatNumber(tableGroup.column_ct ?? 0)} columns | + ${formatNumber(useApprox ? tableGroup.approx_record_ct : tableGroup.record_ct)} rows + ${useApprox ? '*' : ''} | + ${formatNumber(useApprox ? tableGroup.approx_data_point_ct : tableGroup.data_point_ct)} data points + ${useApprox ? '*' : ''}`, ), TableGroupTestSuiteSummary(tableGroup.test_suites), ), @@ -149,6 +156,9 @@ const TableGroupCard = (/** @type TableGroupSummary */ tableGroup) => { ), hr({ class: 'tg-overview--table-group-divider' }), TableGroupLatestProfile(tableGroup), + useApprox + ? 
span({ class: 'text-caption text-right' }, '* Approximate counts based on server statistics') + : null, ) }); }; diff --git a/testgen/ui/components/frontend/js/pages/run_profiling_dialog.js b/testgen/ui/components/frontend/js/pages/run_profiling_dialog.js index f5fd0f1..59c17a1 100644 --- a/testgen/ui/components/frontend/js/pages/run_profiling_dialog.js +++ b/testgen/ui/components/frontend/js/pages/run_profiling_dialog.js @@ -1,14 +1,17 @@ /** - * @import { TableGroup } from '../components/table_group_form.js'; + * @import { TableGroupStats } from '../components/table_group_stats.js' * * @typedef Result * @type {object} * @property {boolean} success * @property {string?} message + * @property {boolean?} show_link * * @typedef Properties * @type {object} - * @property {TableGroup} table_group + * @property {TableGroupStats[]} table_groups + * @property {string} selected_id + * @property {boolean} allow_selection * @property {Result?} result */ import van from '../van.min.js'; @@ -16,80 +19,108 @@ import { Streamlit } from '../streamlit.js'; import { Alert } from '../components/alert.js'; import { ExpanderToggle } from '../components/expander_toggle.js'; import { Icon } from '../components/icon.js'; -import { emitEvent, getValue, resizeFrameHeightOnDOMChange, resizeFrameHeightToElement } from '../utils.js'; +import { emitEvent, getValue, loadStylesheet, resizeFrameHeightOnDOMChange, resizeFrameHeightToElement } from '../utils.js'; import { Code } from '../components/code.js'; import { Button } from '../components/button.js'; +import { Select } from '../components/select.js'; +import { TableGroupStats } from '../components/table_group_stats.js'; -const { div, em, span, strong } = van.tags; +const { div, span, strong } = van.tags; /** * @param {Properties} props */ const RunProfilingDialog = (props) => { + loadStylesheet('run-profiling', stylesheet); Streamlit.setFrameHeight(1); window.testgen.isPage = true; - const wrapperId = 'runprogiling-wrapper'; + const wrapperId = 'run-profiling-wrapper'; resizeFrameHeightToElement(wrapperId); resizeFrameHeightOnDOMChange(wrapperId); - const tableGroup = getValue(props.table_group); + const tableGroups = getValue(props.table_groups); + const allowSelection = getValue(props.allow_selection); + const selectedId = van.state(getValue(props.selected_id)); + const selectedTableGroup = van.derive(() => tableGroups.find(({ id }) => id === selectedId.val)); const showCLICommand = van.state(false); return div( - { id: wrapperId, class: 'flex-column fx-gap-3' }, + { id: wrapperId }, div( - { class: 'flex-row fx-gap-1' }, - span({}, 'Execute profiling for the table group'), - strong({}, tableGroup.table_groups_name), - span({}, '?'), - ), - div( - { class: 'flex-row fx-gap-1' }, - Icon({}, 'info'), - em({}, ' Profiling will be performed in a background process.'), - ), - ExpanderToggle({ - collapseLabel: 'Collapse', - expandLabel: 'Show CLI command', - onCollapse: () => showCLICommand.val = false, - onExpand: () => showCLICommand.val = true, - }), - Code({ class: () => showCLICommand.val ? '' : 'hidden' }, `testgen run-profile --table-group-id ${tableGroup.id}`), - () => { - const result = getValue(props.result) ?? {}; - return result.message - ? Alert({ type: result.success ? 'success' : 'error' }, span(result.message)) - : ''; - }, - div( - { class: 'flex-row fx-justify-content-flex-end' }, + { class: `flex-column fx-gap-3 ${allowSelection ? 'run-profiling--allow-selection' : ''}` }, + allowSelection + ? 
Select({ + label: 'Table Group', + value: selectedId, + options: tableGroups.map(({ id, table_groups_name }) => ({ label: table_groups_name, value: id })), + portalClass: 'run-profiling--select', + }) + : span( + 'Run profiling for the table group ', + strong({}, selectedTableGroup.val.table_groups_name), + '?', + ), + () => selectedTableGroup.val + ? div( + TableGroupStats({ class: 'mt-1 mb-3' }, selectedTableGroup.val), + ExpanderToggle({ + collapseLabel: 'Collapse', + expandLabel: 'Show CLI command', + onCollapse: () => showCLICommand.val = false, + onExpand: () => showCLICommand.val = true, + }), + Code({ class: () => showCLICommand.val ? '' : 'hidden' }, `testgen run-profile --table-group-id ${selectedTableGroup.val.id}`), + ) + : div({ style: 'margin: auto;' }, 'Select a table group to profile.'), () => { - const result = getValue(props.result); - - if (result && result.success) { - return Button({ - type: 'stroked', - color: 'primary', - label: 'Go to Profiling Runs', - width: 'auto', - icon: 'chevron_right', - onclick: () => emitEvent('GoToProfilingRunsClicked', { payload: tableGroup.id }), - }); - } - - return Button({ + const result = getValue(props.result) ?? {}; + return result.message + ? Alert({ type: result.success ? 'success' : 'error' }, span(result.message)) + : ''; + }, + ), + () => !getValue(props.result) + ? div( + { class: 'flex-row fx-justify-space-between mt-3' }, + div( + { class: 'flex-row fx-gap-1' }, + Icon({ size: 16 }, 'info'), + span({ class: 'text-caption' }, ' Profiling will be performed in a background process.'), + ), + Button({ label: 'Run Profiling', type: 'stroked', color: 'primary', width: 'auto', style: 'width: auto;', - onclick: () => emitEvent('RunProfilingConfirmed', { payload: tableGroup.id }), - }); - } - ) + disabled: !selectedTableGroup.val, + onclick: () => emitEvent('RunProfilingConfirmed', { payload: selectedTableGroup.val }), + }), + ) : '', + () => getValue(props.result)?.show_link + ? Button({ + type: 'stroked', + color: 'primary', + label: 'Go to Profiling Runs', + style: 'width: auto; margin-left: auto; margin-top: 12px;', + icon: 'chevron_right', + onclick: () => emitEvent('GoToProfilingRunsClicked', { payload: selectedTableGroup.val.id }), + }) + : '', ); }; +const stylesheet = new CSSStyleSheet(); +stylesheet.replace(` +.run-profiling--allow-selection { + min-height: 225px; +} + +.run-profiling--select { + max-height: 200px !important; +} +`); + export { RunProfilingDialog }; \ No newline at end of file diff --git a/testgen/ui/components/frontend/js/pages/table_group_wizard.js b/testgen/ui/components/frontend/js/pages/table_group_wizard.js index 916506a..074bff2 100644 --- a/testgen/ui/components/frontend/js/pages/table_group_wizard.js +++ b/testgen/ui/components/frontend/js/pages/table_group_wizard.js @@ -23,6 +23,7 @@ import van from '../van.min.js'; import { Streamlit } from '../streamlit.js'; import { TableGroupForm } from '../components/table_group_form.js'; import { TableGroupTest } from '../components/table_group_test.js'; +import { TableGroupStats } from '../components/table_group_stats.js'; import { emitEvent, getValue, resizeFrameHeightOnDOMChange, resizeFrameHeightToElement } from '../utils.js'; import { Button } from '../components/button.js'; import { Alert } from '../components/alert.js'; @@ -147,7 +148,6 @@ const TableGroupWizard = (props) => { } return TableGroupTest( - tableGroup.table_group_schema ?? 
'--', props.table_group_preview, { onVerifyAcess: () => { @@ -175,6 +175,7 @@ const TableGroupWizard = (props) => { return RunProfilingStep( stepsState.tableGroup.rawVal, runProfiling, + props.table_group_preview, results?.success ?? false, ); }); @@ -246,10 +247,11 @@ const TableGroupWizard = (props) => { /** * @param {object} tableGroup * @param {boolean} runProfiling + * @param {TableGroupPreview?} preview * @param {boolean?} disabled * @returns */ -const RunProfilingStep = (tableGroup, runProfiling, disabled) => { +const RunProfilingStep = (tableGroup, runProfiling, preview, disabled) => { return div( { class: 'flex-column fx-gap-3' }, Checkbox({ @@ -263,12 +265,18 @@ const RunProfilingStep = (tableGroup, runProfiling, disabled) => { disabled: disabled ?? false, onChange: (value) => runProfiling.val = value, }), + () => runProfiling.val && preview.val + ? TableGroupStats({ class: 'mt-1 mb-1' }, preview.val.stats) + : '', div( { class: 'flex-row fx-gap-1' }, - Icon({}, 'info'), - () => runProfiling.val - ? i('Profiling will be performed in a background process.') - : i('Profiling will be skipped. You can run this step later from the Profiling Runs page.'), + Icon({ size: 16 }, 'info'), + span( + { class: 'text-caption' }, + () => runProfiling.val + ? 'Profiling will be performed in a background process.' + : 'Profiling will be skipped. You can run this step later from the Profiling Runs page.', + ), ), ); }; diff --git a/testgen/ui/components/frontend/js/pages/test_runs.js b/testgen/ui/components/frontend/js/pages/test_runs.js index 04a00b1..05cb59e 100644 --- a/testgen/ui/components/frontend/js/pages/test_runs.js +++ b/testgen/ui/components/frontend/js/pages/test_runs.js @@ -90,7 +90,7 @@ const TestRuns = (/** @type Properties */ props) => { () => testRuns.val.length ? div( div( - { class: 'table' }, + { class: 'table pb-0' }, () => { const selectedItems = testRuns.val.filter(i => selectedRuns[i.test_run_id]?.val ?? 
false); const someRunSelected = selectedItems.length > 0; diff --git a/testgen/ui/queries/profiling_queries.py b/testgen/ui/queries/profiling_queries.py index 89119ef..7216fd1 100644 --- a/testgen/ui/queries/profiling_queries.py +++ b/testgen/ui/queries/profiling_queries.py @@ -92,6 +92,7 @@ def get_profiling_results(profiling_run_id: str, table_name: str | None = None, -- Profile Run profile_run_id::VARCHAR, run_date AS profile_run_date, + query_error AS profiling_error, {COLUMN_PROFILING_FIELDS}, -- Extra fields for sorting and exporting position, @@ -103,7 +104,8 @@ def get_profiling_results(profiling_run_id: str, table_name: str | None = None, WHERE profile_run_id = profile_results.profile_run_id AND table_name = profile_results.table_name AND column_name = profile_results.column_name - ) THEN 'Yes' END AS hygiene_issues + ) THEN 'Yes' END AS hygiene_issues, + CASE WHEN query_error IS NOT NULL THEN 'Error: ' || query_error ELSE NULL END AS result_details FROM profile_results WHERE profile_run_id = :profiling_run_id AND table_name ILIKE :table_name @@ -200,9 +202,9 @@ def get_tables_by_condition( table_chars.table_groups_id::VARCHAR AS table_group_id, -- Characteristics functional_table_type, - record_ct, + approx_record_ct, + table_chars.record_ct, table_chars.column_ct, - data_point_ct, add_date, last_refresh_date, drop_date, @@ -368,6 +370,7 @@ def get_columns_by_condition( column_chars.last_complete_profile_run_id::VARCHAR AS profile_run_id, run_date AS profile_run_date, TRUE AS is_latest_profile, + query_error AS profiling_error, {""" -- Has Test Runs EXISTS( @@ -394,6 +397,7 @@ def get_columns_by_condition( column_chars.dq_score_profiling, column_chars.dq_score_testing, """ if include_scores else ""} + table_chars.approx_record_ct, {COLUMN_PROFILING_FIELDS} FROM data_column_chars column_chars {""" diff --git a/testgen/ui/queries/table_group_queries.py b/testgen/ui/queries/table_group_queries.py index c698212..58bd282 100644 --- a/testgen/ui/queries/table_group_queries.py +++ b/testgen/ui/queries/table_group_queries.py @@ -1,18 +1,36 @@ +from collections.abc import Callable from typing import TypedDict +from uuid import UUID -from sqlalchemy.engine import Row +import streamlit as st -from testgen.commands.queries.profiling_query import CProfilingSQL -from testgen.common.database.database_service import get_flavor_service +from testgen.commands.queries.refresh_data_chars_query import ColumnChars, RefreshDataCharsSQL +from testgen.commands.run_refresh_data_chars import write_data_chars +from testgen.common import date_service from testgen.common.models.connection import Connection from testgen.common.models.table_group import TableGroup from testgen.ui.services.database_service import fetch_from_target_db +class StatsPreview(TypedDict): + id: UUID + table_groups_name: str + table_group_schema: str + table_ct: int | None + column_ct: int | None + approx_record_ct: int | None + approx_data_point_ct: int | None + +class TablePreview(TypedDict): + column_ct: int + approx_record_ct: int | None + approx_data_point_ct: int | None + can_access: bool | None + + class TableGroupPreview(TypedDict): - schema: str - tables: dict[str, bool] - column_count: int + stats: StatsPreview + tables: dict[str, TablePreview] success: bool message: str | None @@ -21,52 +39,40 @@ def get_table_group_preview( table_group: TableGroup, connection: Connection | None = None, verify_table_access: bool = False, -) -> TableGroupPreview: +) -> tuple[TableGroupPreview, Callable[[UUID], None]]: table_group_preview: 
TableGroupPreview = { - "schema": table_group.table_group_schema, + "stats": { + "id": table_group.id, + "table_groups_name": table_group.table_groups_name, + "table_group_schema": table_group.table_group_schema, + }, "tables": {}, - "column_count": 0, "success": True, "message": None, } + save_data_chars = None + if connection or table_group.connection_id: try: connection = connection or Connection.get(table_group.connection_id) + table_group_preview, data_chars, sql_generator = _get_preview(table_group, connection) - table_group_results = _fetch_table_group_columns(connection, table_group) - - for column in table_group_results: - table_group_preview["schema"] = column["table_schema"] - table_group_preview["tables"][column["table_name"]] = None - table_group_preview["column_count"] += 1 - - if len(table_group_results) <= 0: - table_group_preview["success"] = False - table_group_preview["message"] = ( - "No tables found matching the criteria. Please check the Table Group configuration" - " or the database permissions." - ) + def save_data_chars(table_group_id: UUID) -> None: + # Unsaved table groups will not have an ID, so we have to update it after saving + sql_generator.table_group.id = table_group_id + write_data_chars(data_chars, sql_generator, date_service.get_now_as_string()) if verify_table_access: - schema_name = table_group_preview["schema"] - flavor_service = get_flavor_service(connection.sql_flavor) - quote = flavor_service.quote_character - for table_name in table_group_preview["tables"].keys(): + tables_preview = table_group_preview["tables"] + for table_name in tables_preview.keys(): try: - results = fetch_from_target_db( - connection, - ( - f"SELECT 1 FROM {quote}{schema_name}{quote}.{quote}{table_name}{quote} LIMIT 1" - if not flavor_service.use_top - else f"SELECT TOP 1 * FROM {quote}{schema_name}{quote}.{quote}{table_name}{quote}" - ), - ) + results = fetch_from_target_db(connection, *sql_generator.verify_access(table_name)) except Exception as error: - table_group_preview["tables"][table_name] = False + tables_preview[table_name]["can_access"] = False else: - table_group_preview["tables"][table_name] = results is not None and len(results) > 0 + tables_preview[table_name]["can_access"] = results is not None and len(results) > 0 - if not all(table_group_preview["tables"].values()): + if not all(table["can_access"] for table in tables_preview.values()): table_group_preview["message"] = ( "Some tables were not accessible. Please the check the database permissions." ) @@ -75,30 +81,79 @@ def get_table_group_preview( table_group_preview["message"] = error.args[0] else: table_group_preview["success"] = False - table_group_preview["message"] = "No connection selected. Please select a connection to preview the Table Group." 
- return table_group_preview - - -def _fetch_table_group_columns(connection: Connection, table_group: TableGroup) -> list[Row]: - profiling_table_set = table_group.profiling_table_set - - sql_generator = CProfilingSQL(table_group.project_code, connection.sql_flavor) - - sql_generator.table_groups_id = table_group.id - sql_generator.connection_id = str(table_group.connection_id) - sql_generator.profile_run_id = "" - sql_generator.data_schema = table_group.table_group_schema - sql_generator.parm_table_set = ( - ",".join([f"'{item.strip()}'" for item in profiling_table_set.split(",")]) - if profiling_table_set - else profiling_table_set - ) - sql_generator.parm_table_include_mask = table_group.profiling_include_mask - sql_generator.parm_table_exclude_mask = table_group.profiling_exclude_mask - sql_generator.profile_id_column_mask = table_group.profile_id_column_mask - sql_generator.profile_sk_column_mask = table_group.profile_sk_column_mask - sql_generator.profile_use_sampling = "Y" if table_group.profile_use_sampling else "N" - sql_generator.profile_sample_percent = table_group.profile_sample_percent - sql_generator.profile_sample_min_count = table_group.profile_sample_min_count - - return fetch_from_target_db(connection, *sql_generator.GetDDFQuery()) + table_group_preview["message"] = ( + "No connection selected. Please select a connection to preview the Table Group." + ) + + return table_group_preview, save_data_chars + + +def reset_table_group_preview() -> None: + _get_preview.clear() + + +@st.cache_data( + show_spinner=False, + hash_funcs={ + TableGroup: lambda x: ( + x.table_group_schema, + x.profiling_table_set, + x.profiling_include_mask, + x.profiling_exclude_mask, + ), + Connection: lambda x: x.to_dict(), + }, +) +def _get_preview( + table_group: TableGroup, + connection: Connection, +) -> tuple[TableGroupPreview, list[ColumnChars], RefreshDataCharsSQL]: + sql_generator = RefreshDataCharsSQL(connection, table_group) + data_chars = fetch_from_target_db(connection, *sql_generator.get_schema_ddf()) + data_chars = [ColumnChars(**column) for column in data_chars] + + preview: TableGroupPreview = { + "stats": { + "id": table_group.id, + "table_groups_name": table_group.table_groups_name, + "table_group_schema": table_group.table_group_schema, + "table_ct": 0, + "column_ct": 0, + "approx_record_ct": None, + "approx_data_point_ct": None, + }, + "tables": {}, + "success": True, + "message": None, + } + stats = preview["stats"] + tables = preview["tables"] + + for column in data_chars: + if not tables.get(column.table_name): + tables[column.table_name] = { + "column_ct": 0, + "approx_record_ct": column.approx_record_ct, + "approx_data_point_ct": None, + "can_access": None, + } + stats["table_ct"] += 1 + if column.approx_record_ct is not None: + stats["approx_record_ct"] = (stats["approx_record_ct"] or 0) + column.approx_record_ct + + stats["column_ct"] += 1 + tables[column.table_name]["column_ct"] += 1 + if column.approx_record_ct is not None: + stats["approx_data_point_ct"] = (stats["approx_data_point_ct"] or 0) + column.approx_record_ct + tables[column.table_name]["approx_data_point_ct"] = ( + tables[column.table_name]["approx_data_point_ct"] or 0 + ) + column.approx_record_ct + + if len(data_chars) <= 0: + preview["success"] = False + preview["message"] = ( + "No tables found matching the criteria. Please check the Table Group configuration" + " or the database permissions." 
+ ) + + return preview, data_chars, sql_generator diff --git a/testgen/ui/views/connections.py b/testgen/ui/views/connections.py index c492dde..dfb5dc4 100644 --- a/testgen/ui/views/connections.py +++ b/testgen/ui/views/connections.py @@ -15,7 +15,7 @@ from sqlalchemy.exc import DatabaseError, DBAPIError import testgen.ui.services.database_service as db -from testgen.commands.run_profiling_bridge import run_profiling_in_background +from testgen.commands.run_profiling import run_profiling_in_background from testgen.common.database.database_service import empty_cache, get_flavor_service from testgen.common.models import with_database_session from testgen.common.models.connection import Connection, ConnectionMinimal @@ -148,6 +148,10 @@ def on_test_connection_clicked(updated_connection: dict) -> None: set_check_status(True) set_updated_connection(self._sanitize_connection_input(updated_connection)) + def on_setup_table_group_clicked(*_args) -> None: + table_group_queries.reset_table_group_preview() + self.setup_data_configuration(project_code, connection.connection_id) + results = None for key, value in get_updated_connection().items(): setattr(connection, key, value) @@ -188,7 +192,7 @@ def on_test_connection_clicked(updated_connection: dict) -> None: on_change_handlers={ "TestConnectionClicked": on_test_connection_clicked, "SaveConnectionClicked": on_save_connection_clicked, - "SetupTableGroupClicked": lambda _: self.setup_data_configuration(project_code, connection.connection_id), + "SetupTableGroupClicked": on_setup_table_group_clicked, "ConnectionUpdated": on_connection_updated, }, ) @@ -266,6 +270,7 @@ def on_save_table_group_clicked(payload: dict) -> None: run_profiling: bool = payload.get("run_profiling", False) set_new_table_group(table_group) + mark_for_preview(True) set_table_group_verified(table_group_verified) set_run_profiling(run_profiling) mark_for_save(True) @@ -328,8 +333,9 @@ def on_preview_table_group(payload: dict) -> None: ) table_group_preview = None + save_data_chars = None if should_preview(): - table_group_preview = table_group_queries.get_table_group_preview( + table_group_preview, save_data_chars = table_group_queries.get_table_group_preview( table_group, verify_table_access=should_verify_access(), ) @@ -346,6 +352,12 @@ def on_preview_table_group(payload: dict) -> None: monitor_schedule_timezone=st.session_state["browser_timezone"] or "UTC", ) + if save_data_chars: + try: + save_data_chars(table_group.id) + except Exception: + LOG.exception("Data characteristics refresh encountered errors") + if should_run_profiling: try: run_profiling_in_background(table_group.id) diff --git a/testgen/ui/views/data_catalog.py b/testgen/ui/views/data_catalog.py index f498457..3c8d2fe 100644 --- a/testgen/ui/views/data_catalog.py +++ b/testgen/ui/views/data_catalog.py @@ -113,7 +113,7 @@ def render(self, project_code: str, table_group_id: str | None = None, selected: "RunProfilingClicked": partial( run_profiling_dialog, project_code, - selected_table_group, + selected_table_group.id, ), "TableGroupSelected": on_table_group_selected, "ItemSelected": on_item_selected, @@ -234,7 +234,7 @@ def get_excel_report_data(update_progress: PROGRESS_UPDATE_TYPE, table_group: Ta "add_date": {"header": "First detected"}, "last_mod_date": {"header": "Modification detected"}, "drop_date": {"header": "Drop detected"}, - "record_ct": {"header": "Record count"}, + "record_ct": {"header": "Row count"}, "value_ct": {"header": "Value count"}, "distinct_value_ct": {"header": "Distinct values"}, 
"null_value_ct": {"header": "Null values"}, diff --git a/testgen/ui/views/dialogs/run_profiling_dialog.py b/testgen/ui/views/dialogs/run_profiling_dialog.py index 3d5b6d6..74d6dc0 100644 --- a/testgen/ui/views/dialogs/run_profiling_dialog.py +++ b/testgen/ui/views/dialogs/run_profiling_dialog.py @@ -1,84 +1,72 @@ import time +from uuid import UUID import streamlit as st -from testgen.commands.run_profiling_bridge import run_profiling_in_background -from testgen.common.models import with_database_session -from testgen.common.models.table_group import TableGroup, TableGroupMinimal +from testgen.commands.run_profiling import run_profiling_in_background +from testgen.common.models.profiling_run import ProfilingRun +from testgen.common.models.table_group import TableGroup from testgen.ui.components import widgets as testgen -from testgen.ui.session import session -from testgen.utils import to_dataframe +from testgen.ui.navigation.router import Router +from testgen.ui.session import session, temp_value -LINK_KEY = "run_profiling_dialog:keys:go-to-runs" LINK_HREF = "profiling-runs" @st.dialog(title="Run Profiling") -@with_database_session -def run_profiling_dialog(project_code: str, table_group: TableGroupMinimal | None = None, default_table_group_id: str | None = None) -> None: - if table_group: - table_group_id: str = str(table_group.id) - table_group_name: str = table_group.table_groups_name - else: - table_groups = TableGroup.select_minimal_where(TableGroup.project_code == project_code) - table_groups_df = to_dataframe(table_groups, TableGroupMinimal.columns()) - table_group_id: str = testgen.select( - label="Table Group", - options=table_groups_df, - value_column="id", - display_column="table_groups_name", - default_value=default_table_group_id, - required=True, - placeholder="Select table group to profile", - ) - if table_group_id: - table_group_name: str = table_groups_df.loc[table_groups_df["id"] == table_group_id, "table_groups_name"].iloc[0] - testgen.whitespace(1) +def run_profiling_dialog(project_code: str, table_group_id: str | UUID | None = None, allow_selection: bool = False) -> None: + if not table_group_id and not allow_selection: + raise ValueError("Table Group ID must be specified when selection is not allowed") - if table_group_id: - with st.container(): - st.markdown(f"Execute profiling for the table group **{table_group_name}**?") - st.markdown(":material/info: _Profiling will be performed in a background process._") + def on_go_to_profiling_runs_clicked(table_group_id: str) -> None: + set_navigation_params({"project_code": project_code, "table_group_id": table_group_id}) - if testgen.expander_toggle(expand_label="Show CLI command", key="test_suite:keys:run-tests-show-cli"): - st.code(f"testgen run-profile --table-group-id {table_group_id}", language="shellSession") + def on_run_profiling_confirmed(table_group: dict) -> None: + set_table_group(table_group) + set_run_profiling(True) - button_container = st.empty() - status_container = st.empty() + get_navigation_params, set_navigation_params = temp_value("run_profiling_dialog:go_to_profiling_run", default=None) + if params := get_navigation_params(): + Router().navigate(to=LINK_HREF, with_args=params) - with button_container: - _, button_column = st.columns([.85, .15]) - with button_column: - profile_button = st.button("Run Profiling", use_container_width=True, disabled=not table_group_id) + should_run_profiling, set_run_profiling = temp_value("run_profiling_dialog:run_profiling", default=False) + get_table_group, 
set_table_group = temp_value("run_profiling_dialog:table_group", default=None) - if profile_button: - button_container.empty() - status_container.info("Starting profiling run ...") + table_groups = TableGroup.select_stats( + project_code=project_code, + table_group_id=table_group_id if not allow_selection else None, + ) + + result = None + if should_run_profiling(): + selected_table_group = get_table_group() + success = True + message = f"Profiling run started for table group '{selected_table_group['table_groups_name']}'." + show_link = session.current_page != LINK_HREF try: - run_profiling_in_background(table_group_id) - except Exception as e: - status_container.error(f"Profiling run encountered errors: {e!s}.") + run_profiling_in_background(selected_table_group["id"]) + except Exception as error: + success = False + message = f"Profiling run could not be started: {error!s}." + show_link = False + result = {"success": success, "message": message, "show_link": show_link} - # The second condition is needed for the link to work - if profile_button or st.session_state.get(LINK_KEY): - with status_container.container(): - st.success( - f"Profiling run started for table group **{table_group_name}**." - ) + testgen.testgen_component( + "run_profiling_dialog", + props={ + "table_groups": [table_group.to_dict(json_safe=True) for table_group in table_groups], + "selected_id": str(table_group_id), + "allow_selection": allow_selection, + "result": result, + }, + on_change_handlers={ + "GoToProfilingRunsClicked": on_go_to_profiling_runs_clicked, + "RunProfilingConfirmed": on_run_profiling_confirmed, + }, + ) - if session.current_page != LINK_HREF: - testgen.link( - label="Go to Profiling Runs", - href=LINK_HREF, - params={ "project_code": project_code, "table_group": table_group_id }, - right_icon="chevron_right", - underline=False, - height=40, - key=LINK_KEY, - style="margin-left: auto; border-radius: 4px; border: var(--button-stroked-border); padding: 8px 8px 8px 16px; color: var(--primary-color)", - ) - else: - time.sleep(2) - st.cache_data.clear() - st.rerun() + if result and result["success"] and not result["show_link"]: + time.sleep(2) + ProfilingRun.select_summary.clear() + st.rerun() diff --git a/testgen/ui/views/profiling_results.py b/testgen/ui/views/profiling_results.py index e789f3a..0c5deff 100644 --- a/testgen/ui/views/profiling_results.py +++ b/testgen/ui/views/profiling_results.py @@ -125,8 +125,8 @@ def render(self, run_id: str, table_name: str | None = None, column_name: str | selected, selected_row = fm.render_grid_select( df, - ["table_name", "column_name", "db_data_type", "semantic_data_type", "hygiene_issues"], - ["Table", "Column", "Data Type", "Semantic Data Type", "Hygiene Issues"], + ["table_name", "column_name", "db_data_type", "semantic_data_type", "hygiene_issues", "result_details"], + ["Table", "Column", "Data Type", "Semantic Data Type", "Hygiene Issues", "Details"], id_column="id", reset_pagination=filters_changed, bind_to_query=True, @@ -225,7 +225,7 @@ def get_excel_report_data( "db_data_type": {"header": "Data type"}, "datatype_suggestion": {"header": "Suggested data type"}, "semantic_data_type": {}, - "record_ct": {"header": "Record count"}, + "record_ct": {"header": "Row count"}, "value_ct": {"header": "Value count"}, "distinct_value_ct": {"header": "Distinct values"}, "null_value_ct": {"header": "Null values"}, @@ -269,6 +269,7 @@ def get_excel_report_data( "within_1mo_date_ct": {"header": "Within 1 month"}, "future_date_ct": {"header": "Future dates"}, 
"boolean_true_ct": {"header": "Boolean true values"}, + "result_details": {"header": "Details"}, } return get_excel_file_data( data, diff --git a/testgen/ui/views/profiling_runs.py b/testgen/ui/views/profiling_runs.py index ffea8d1..59ad015 100644 --- a/testgen/ui/views/profiling_runs.py +++ b/testgen/ui/views/profiling_runs.py @@ -75,7 +75,7 @@ def render(self, project_code: str, table_group_id: str | None = None, **_kwargs on_change_handlers={ "FilterApplied": on_profiling_runs_filtered, "RunSchedulesClicked": lambda *_: ProfilingScheduleDialog().open(project_code), - "RunProfilingClicked": lambda *_: run_profiling_dialog(project_code, None, table_group_id), + "RunProfilingClicked": lambda *_: run_profiling_dialog(project_code, table_group_id, allow_selection=True), "RefreshData": refresh_data, "RunsDeleted": partial(on_delete_runs, project_code, table_group_id), }, @@ -171,7 +171,7 @@ def on_delete_confirmed(*_args) -> None: if profiling_run.status == "Running": process_status, _ = process_service.kill_profile_run(to_int(profiling_run.process_id)) if process_status: - ProfilingRun.update_status(profiling_run.profiling_run_id, "Cancelled") + ProfilingRun.update_status(profiling_run.id, "Cancelled") ProfilingRun.cascade_delete(profiling_run_ids) st.rerun() except Exception: diff --git a/testgen/ui/views/table_groups.py b/testgen/ui/views/table_groups.py index 78da867..3bf9ca9 100644 --- a/testgen/ui/views/table_groups.py +++ b/testgen/ui/views/table_groups.py @@ -7,7 +7,7 @@ import streamlit as st from sqlalchemy.exc import IntegrityError -from testgen.commands.run_profiling_bridge import run_profiling_in_background +from testgen.commands.run_profiling import run_profiling_in_background from testgen.common.models import with_database_session from testgen.common.models.connection import Connection from testgen.common.models.project import Project @@ -18,6 +18,7 @@ from testgen.ui.queries import table_group_queries from testgen.ui.session import session, temp_value from testgen.ui.views.connections import FLAVOR_OPTIONS, format_connection +from testgen.ui.views.dialogs.run_profiling_dialog import run_profiling_dialog from testgen.ui.views.profiling_runs import ProfilingScheduleDialog LOG = logging.getLogger("testgen") @@ -63,6 +64,14 @@ def render( table_groups = TableGroup.select_minimal_where(*table_group_filters) connections = self._get_connections(project_code) + def on_add_table_group_clicked(*_args) -> None: + table_group_queries.reset_table_group_preview() + self.add_table_group_dialog(project_code, connection_id) + + def on_edit_table_group_clicked(table_group_id: str) -> None: + table_group_queries.reset_table_group_preview() + self.edit_table_group_dialog(project_code, table_group_id) + return testgen.testgen_component( "table_group_list", props={ @@ -77,10 +86,10 @@ def render( }, on_change_handlers={ "RunSchedulesClicked": lambda *_: ProfilingScheduleDialog().open(project_code), - "AddTableGroupClicked": partial(self.add_table_group_dialog, project_code, connection_id), - "EditTableGroupClicked": partial(self.edit_table_group_dialog, project_code), + "AddTableGroupClicked": on_add_table_group_clicked, + "EditTableGroupClicked": on_edit_table_group_clicked, "DeleteTableGroupClicked": partial(self.delete_table_group_dialog, project_code), - "RunProfilingClicked": partial(self.run_profiling_dialog, project_code), + "RunProfilingClicked": partial(run_profiling_dialog, project_code), "TableGroupsFiltered": lambda params: self.router.queue_navigation( to="table-groups", 
with_args={"project_code": project_code, **params}, @@ -90,7 +99,7 @@ def render( @st.dialog(title="Add Table Group") @with_database_session - def add_table_group_dialog(self, project_code: str, connection_id: str | None, *_args): + def add_table_group_dialog(self, project_code: str, connection_id: str | None): return self._table_group_wizard( project_code, connection_id=connection_id, @@ -134,6 +143,7 @@ def on_save_table_group_clicked(payload: dict): table_group_verified: bool = payload.get("table_group_verified", False) run_profiling: bool = payload.get("run_profiling", False) + mark_for_preview(True) set_save(True) set_table_group(table_group) set_table_group_verified(table_group_verified) @@ -182,6 +192,7 @@ def on_go_to_profiling_runs(params: dict) -> None: setattr(table_group, key, value) table_group_preview = None + save_data_chars = None if is_table_group_used: table_group.table_group_schema = original_table_group_schema @@ -201,7 +212,7 @@ def on_go_to_profiling_runs(params: dict) -> None: ] if should_preview(): - table_group_preview = table_group_queries.get_table_group_preview( + table_group_preview, save_data_chars = table_group_queries.get_table_group_preview( table_group, verify_table_access=should_verify_access(), ) @@ -217,6 +228,13 @@ def on_go_to_profiling_runs(params: dict) -> None: add_monitor_test_suite=add_monitor_test_suite, monitor_schedule_timezone=st.session_state["browser_timezone"] or "UTC", ) + + if save_data_chars: + try: + save_data_chars(table_group.id) + except Exception: + LOG.exception("Data characteristics refresh encountered errors") + if should_run_profiling(): try: run_profiling_in_background(table_group.id) @@ -286,52 +304,6 @@ def _format_table_group_list( return formatted_list - @st.dialog(title="Run Profiling") - def run_profiling_dialog(self, project_code: str, table_group_id: str) -> None: - def on_go_to_profiling_runs_clicked(table_group_id: str) -> None: - set_navigation_params({ "project_code": project_code, "table_group_id": table_group_id }) - - def on_run_profiling_confirmed(*_args) -> None: - set_run_profiling(True) - - get_navigation_params, set_navigation_params = temp_value( - f"table_groups:{table_group_id}:go_to_profiling_run", - default=None, - ) - if (params := get_navigation_params()): - self.router.navigate(to="profiling-runs", with_args=params) - - should_run_profiling, set_run_profiling = temp_value( - f"table_groups:{table_group_id}:run_profiling", - default=False, - ) - - table_group = TableGroup.get_minimal(table_group_id) - result = None - if should_run_profiling(): - success = True - message = "Profiling run started" - - try: - run_profiling_in_background(table_group_id) - except Exception as error: - success = False - message = f"Profiling run encountered errors: {error!s}." 
- result = {"success": success, "message": message} - - return testgen.testgen_component( - "run_profiling_dialog", - props={ - "project_code": project_code, - "table_group": table_group.to_dict(json_safe=True), - "result": result, - }, - on_change_handlers={ - "GoToProfilingRunsClicked": on_go_to_profiling_runs_clicked, - "RunProfilingConfirmed": on_run_profiling_confirmed, - }, - ) - @st.dialog(title="Delete Table Group") @with_database_session def delete_table_group_dialog(self, project_code: str, table_group_id: str): diff --git a/testgen/ui/views/test_definitions.py b/testgen/ui/views/test_definitions.py index 8b8a2a8..e5a3fb1 100644 --- a/testgen/ui/views/test_definitions.py +++ b/testgen/ui/views/test_definitions.py @@ -1217,7 +1217,7 @@ def get_columns(table_groups_id: str) -> list[dict]: def validate_test(test_definition, table_group: TableGroupMinimal): schema = test_definition["schema_name"] table_name = test_definition["table_name"] - connection = Connection.get_by_table_group(table_group.id) + connection = Connection.get(table_group.connection_id) if test_definition["test_type"] == "Condition_Flag": condition = test_definition["custom_query"] diff --git a/testgen/utils/__init__.py b/testgen/utils/__init__.py index 3e86403..2b295ff 100644 --- a/testgen/utils/__init__.py +++ b/testgen/utils/__init__.py @@ -58,6 +58,10 @@ def try_json(value: str | None, default: T | None) -> T: return json.loads(value) except: return default + + +def get_exception_message(exception: Exception) -> str: + return exception.args[0].rstrip() if exception.args and isinstance(exception.args[0], str) else str(exception) # https://github.com/streamlit/streamlit/issues/798#issuecomment-1647759949 diff --git a/tests/unit/test_profiling_query.py b/tests/unit/test_profiling_query.py deleted file mode 100644 index 368fb5b..0000000 --- a/tests/unit/test_profiling_query.py +++ /dev/null @@ -1,68 +0,0 @@ -import pytest - -from testgen.commands.queries.profiling_query import CProfilingSQL - - -@pytest.mark.unit -def test_include_exclude_mask_basic(): - # test configuration - project_code = "dummy_project_code" - flavor = "postgresql" - profiling_query = CProfilingSQL(project_code, flavor) - profiling_query.parm_table_set = "" - profiling_query.parm_table_include_mask = "important%, %useful%" - profiling_query.parm_table_exclude_mask = "temp%,tmp%,raw_slot_utilization%,gps_product_step_change_log" - - # test run - query, _ = profiling_query.GetDDFQuery() - - # test assertions - assert "SELECT 'dummy_project_code'" in query - assert r"""AND ( - (c.table_name LIKE 'important%' ) OR (c.table_name LIKE '%useful%' ) - )""" in query - assert r"""AND NOT ( - (c.table_name LIKE 'temp%' ) OR (c.table_name LIKE 'tmp%' ) OR (c.table_name LIKE 'raw\_slot\_utilization%' ) OR (c.table_name LIKE 'gps\_product\_step\_change\_log' ) - )""" in query - - -@pytest.mark.unit -@pytest.mark.parametrize("mask", ("", None)) -def test_include_empty_exclude_mask(mask): - # test configuration - project_code = "dummy_project_code" - flavor = "snowflake" - profiling_query = CProfilingSQL(project_code, flavor) - profiling_query.parm_table_set = "" - profiling_query.parm_table_include_mask = mask - profiling_query.parm_table_exclude_mask = "temp%,tmp%,raw_slot_utilization%,gps_product_step_change_log" - - # test run - query, _ = profiling_query.GetDDFQuery() - print(query) - - # test assertions - assert r"""AND NOT ( - (c.table_name LIKE 'temp%' ESCAPE '\\') OR (c.table_name LIKE 'tmp%' ESCAPE '\\') OR (c.table_name LIKE 
'raw\\_slot\\_utilization%' ESCAPE '\\') OR (c.table_name LIKE 'gps\\_product\\_step\\_change\\_log' ESCAPE '\\') - )""" in query - - -@pytest.mark.unit -@pytest.mark.parametrize("mask", ("", None)) -def test_include_empty_include_mask(mask): - # test configuration - project_code = "dummy_project_code" - flavor = "mssql" - profiling_query = CProfilingSQL(project_code, flavor) - profiling_query.parm_table_set = "" - profiling_query.parm_table_include_mask = "important%, %useful_%" - profiling_query.parm_table_exclude_mask = mask - - # test run - query, _ = profiling_query.GetDDFQuery() - print(query) - - # test assertions - assert r"""AND ( - (c.table_name LIKE 'important%' ) OR (c.table_name LIKE '%useful[_]%' ) - )""" in query diff --git a/tests/unit/test_refresh_data_chars_query.py b/tests/unit/test_refresh_data_chars_query.py new file mode 100644 index 0000000..a84bc13 --- /dev/null +++ b/tests/unit/test_refresh_data_chars_query.py @@ -0,0 +1,62 @@ +import pytest + +from testgen.commands.queries.refresh_data_chars_query import RefreshDataCharsSQL +from testgen.common.models.connection import Connection +from testgen.common.models.table_group import TableGroup + + +@pytest.mark.unit +def test_include_exclude_mask_basic(): + connection = Connection(sql_flavor="postgresql") + table_group = TableGroup( + table_group_schema="test_schema", + profiling_table_set="", + profiling_include_mask="important%, %useful%", + profiling_exclude_mask="temp%,tmp%,raw_slot_utilization%,gps_product_step_change_log" + ) + sql_generator = RefreshDataCharsSQL(connection, table_group) + query, _ = sql_generator.get_schema_ddf() + + assert "WHERE c.table_schema = 'test_schema'" in query + assert r"""AND ( + (c.table_name LIKE 'important%' ) OR (c.table_name LIKE '%useful%' ) + )""" in query + assert r"""AND NOT ( + (c.table_name LIKE 'temp%' ) OR (c.table_name LIKE 'tmp%' ) OR (c.table_name LIKE 'raw\_slot\_utilization%' ) OR (c.table_name LIKE 'gps\_product\_step\_change\_log' ) + )""" in query + + +@pytest.mark.unit +@pytest.mark.parametrize("mask", ("", None)) +def test_include_empty_exclude_mask(mask): + connection = Connection(sql_flavor="snowflake") + table_group = TableGroup( + table_group_schema="test_schema", + profiling_table_set="", + profiling_include_mask=mask, + profiling_exclude_mask="temp%,tmp%,raw_slot_utilization%,gps_product_step_change_log" + ) + sql_generator = RefreshDataCharsSQL(connection, table_group) + query, _ = sql_generator.get_schema_ddf() + + assert r"""AND NOT ( + (c.table_name LIKE 'temp%' ESCAPE '\\') OR (c.table_name LIKE 'tmp%' ESCAPE '\\') OR (c.table_name LIKE 'raw\\_slot\\_utilization%' ESCAPE '\\') OR (c.table_name LIKE 'gps\\_product\\_step\\_change\\_log' ESCAPE '\\') + )""" in query + + +@pytest.mark.unit +@pytest.mark.parametrize("mask", ("", None)) +def test_include_empty_include_mask(mask): + connection = Connection(sql_flavor="mssql") + table_group = TableGroup( + table_group_schema="test_schema", + profiling_table_set="", + profiling_include_mask="important%, %useful_%", + profiling_exclude_mask=mask, + ) + sql_generator = RefreshDataCharsSQL(connection, table_group) + query, _ = sql_generator.get_schema_ddf() + + assert r"""AND ( + (c.table_name LIKE 'important%' ) OR (c.table_name LIKE '%useful[_]%' ) + )""" in query From 2e0bc6407d2280c6a31479c993d572c28262344d Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Thu, 23 Oct 2025 23:08:01 -0400 Subject: [PATCH 05/28] fix: address pr feedback --- testgen/__main__.py | 5 ++--- testgen/commands/run_execute_tests.py | 
5 ++--- testgen/commands/run_profiling.py | 7 ++++--- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/testgen/__main__.py b/testgen/__main__.py index a6578e7..130de69 100644 --- a/testgen/__main__.py +++ b/testgen/__main__.py @@ -114,15 +114,14 @@ def cli(ctx: Context, verbose: bool): @register_scheduler_job @cli.command("run-profile", help="Generates a new profile of the table group.") -@pass_configuration @click.option( "-tg", "--table-group-id", required=True, type=click.STRING, - help="The identifier for the table group used during a profile run. Use a table_group_id shown in list-table-groups.", + help="ID of the table group to profile. Use a table_group_id shown in list-table-groups.", ) -def run_profile(configuration: Configuration, table_group_id: str): +def run_profile(table_group_id: str): click.echo(f"run-profile with table_group_id: {table_group_id}") message = run_profiling(table_group_id) click.echo("\n" + message) diff --git a/testgen/commands/run_execute_tests.py b/testgen/commands/run_execute_tests.py index 9a16bb2..a99f600 100644 --- a/testgen/commands/run_execute_tests.py +++ b/testgen/commands/run_execute_tests.py @@ -58,7 +58,7 @@ def run_test_queries( minutes_offset: int = 0, spinner: Spinner | None = None, ): - has_errors = False + errors = None error_msg = "" LOG.info("CurrentStep: Initializing Query Generator") @@ -100,7 +100,6 @@ def run_test_queries( if lstTestResults: write_to_app_db(lstTestResults, colResultNames, "test_results") if errors: - has_errors = True error_msg = ( f"Errors were encountered executing Referential Tests. ({len(errors)} errors occurred.) " "Please check log. " @@ -118,7 +117,7 @@ def run_test_queries( raise else: - return has_errors, error_msg + return bool(errors), error_msg def run_execution_steps_in_background(project_code, test_suite): diff --git a/testgen/commands/run_profiling.py b/testgen/commands/run_profiling.py index 344e437..528c767 100644 --- a/testgen/commands/run_profiling.py +++ b/testgen/commands/run_profiling.py @@ -2,6 +2,7 @@ import subprocess import threading from datetime import UTC, datetime +from uuid import UUID import testgen.common.process_service as process_service from testgen import settings @@ -33,7 +34,7 @@ LOG = logging.getLogger("testgen") -def run_profiling_in_background(table_group_id): +def run_profiling_in_background(table_group_id: str | UUID) -> None: msg = f"Triggering profiling run for table group {table_group_id}" if settings.IS_DEBUG: LOG.info(msg + ". 
Running in debug mode (new thread instead of new process).") @@ -50,7 +51,7 @@ def run_profiling_in_background(table_group_id): @with_database_session -def run_profiling(table_group_id: str, username: str | None = None, minutes_offset: int = 0): +def run_profiling(table_group_id: str | UUID, username: str | None = None, minutes_offset: int = 0) -> str: if table_group_id is None: raise ValueError("Table Group ID was not specified") @@ -73,7 +74,7 @@ def run_profiling(table_group_id: str, username: str | None = None, minutes_offs profiling_run.set_progress("data_chars", "Running") profiling_run.save() - LOG.info(f"Profiling run: {profiling_run.id}, Connection: {connection.connection_name}, Table group: {table_group.table_groups_name}") + LOG.info(f"Profiling run: {profiling_run.id}, Table group: {table_group.table_groups_name}, Connection: {connection.connection_name}") try: data_chars = run_data_chars_refresh(connection, table_group, profiling_run.profiling_starttime) distinct_tables = {(column.table_name, column.record_ct) for column in data_chars} From e91ff88ca92ca341a4198f8a8fab0ca4826ad78b Mon Sep 17 00:00:00 2001 From: Luis Date: Sat, 25 Oct 2025 09:46:50 -0400 Subject: [PATCH 06/28] load score cards history in one go and change all other related objects to be fetched as needed. for an unknown reason this required me to strip the timezone info from the datetime object passed to ScoreDefinitionResultHistoryEntry before saving it. --- testgen/commands/run_launch_db_config.py | 9 --- testgen/commands/run_quick_start.py | 11 ++++ .../run_refresh_score_cards_results.py | 17 +++-- testgen/common/models/scores.py | 65 +++++++++++++++++-- testgen/ui/queries/scoring_queries.py | 5 +- 5 files changed, 82 insertions(+), 25 deletions(-) diff --git a/testgen/commands/run_launch_db_config.py b/testgen/commands/run_launch_db_config.py index 65ae07d..2533244 100644 --- a/testgen/commands/run_launch_db_config.py +++ b/testgen/commands/run_launch_db_config.py @@ -7,8 +7,6 @@ from testgen.common.database.database_service import get_queries_for_command from testgen.common.encrypt import EncryptText, encrypt_ui_password from testgen.common.models import with_database_session -from testgen.common.models.scores import ScoreDefinition -from testgen.common.models.table_group import TableGroup from testgen.common.read_file import get_template_files from testgen.common.read_yaml_metadata_records import import_metadata_records_from_yaml @@ -89,13 +87,6 @@ def run_launch_db_config(delete_db: bool, drop_users_and_roles: bool = True) -> ) import_metadata_records_from_yaml(params_mapping) - ScoreDefinition.from_table_group( - TableGroup( - project_code=settings.PROJECT_KEY, - table_groups_name=settings.DEFAULT_TABLE_GROUPS_NAME, - ) - ).save() - def get_app_db_params_mapping() -> dict: return _get_params_mapping() diff --git a/testgen/commands/run_quick_start.py b/testgen/commands/run_quick_start.py index fd973d9..5c9ea32 100644 --- a/testgen/commands/run_quick_start.py +++ b/testgen/commands/run_quick_start.py @@ -12,6 +12,9 @@ set_target_db_params, ) from testgen.common.database.flavor.flavor_service import ConnectionParams +from testgen.common.models import with_database_session +from testgen.common.models.scores import ScoreDefinition +from testgen.common.models.table_group import TableGroup from testgen.common.read_file import read_template_sql_file LOG = logging.getLogger("testgen") @@ -135,6 +138,14 @@ def run_quick_start(delete_target_db: bool) -> None: use_target_db=True, ) + score_definition = 
ScoreDefinition.from_table_group( + TableGroup( + project_code=settings.PROJECT_KEY, + table_groups_name=settings.DEFAULT_TABLE_GROUPS_NAME, + ) + ) + with_database_session(score_definition.save)() + def run_quick_start_increment(iteration): params_mapping = _get_params_mapping(iteration) diff --git a/testgen/commands/run_refresh_score_cards_results.py b/testgen/commands/run_refresh_score_cards_results.py index 5475496..7f0015f 100644 --- a/testgen/commands/run_refresh_score_cards_results.py +++ b/testgen/commands/run_refresh_score_cards_results.py @@ -36,6 +36,7 @@ def run_refresh_score_cards_results( return db_session = get_current_session() + for definition in definitions: LOG.info( "Refreshing results for scorecard %s in project %s", @@ -45,10 +46,7 @@ def run_refresh_score_cards_results( try: fresh_score_card = definition.as_score_card() - definition.results = [] - definition.breakdown = [] - db_session.flush([definition]) - + definition.clear_results() definition.results = _score_card_to_results(fresh_score_card) definition.breakdown = _score_definition_to_results_breakdown(definition) if add_history_entry: @@ -58,6 +56,7 @@ def run_refresh_score_cards_results( definition.project_code, ) + last_added_entry = None historical_categories = ["score", "cde_score"] for result in definition.results: if result.category in historical_categories: @@ -65,10 +64,14 @@ def run_refresh_score_cards_results( definition_id=result.definition_id, category=result.category, score=result.score, - last_run_time=_refresh_date, + last_run_time=_refresh_date.replace(tzinfo=None), ) - definition.history.append(history_entry) - history_entry.add_as_cutoff() + db_session.add(history_entry) + db_session.flush([history_entry]) + last_added_entry = history_entry + + if last_added_entry: + last_added_entry.add_as_cutoff() definition.save() except Exception: LOG.exception( diff --git a/testgen/common/models/scores.py b/testgen/common/models/scores.py index c6db830..61c3ceb 100644 --- a/testgen/common/models/scores.py +++ b/testgen/common/models/scores.py @@ -12,9 +12,9 @@ from typing import Literal, Self, TypedDict from uuid import UUID, uuid4 -from sqlalchemy import Boolean, Column, DateTime, Enum, Float, ForeignKey, Integer, String, select, text +from sqlalchemy import Boolean, Column, DateTime, Enum, Float, ForeignKey, Integer, String, delete, func, select, text from sqlalchemy.dialects import postgresql -from sqlalchemy.orm import relationship +from sqlalchemy.orm import aliased, attributes, relationship from testgen.common import read_template_sql_file from testgen.common.models import Base, get_current_session @@ -79,7 +79,7 @@ class ScoreDefinition(Base): criteria: ScoreDefinitionCriteria = relationship( "ScoreDefinitionCriteria", cascade="all, delete-orphan", - lazy="joined", + lazy="select", uselist=False, single_parent=True, ) @@ -93,7 +93,7 @@ class ScoreDefinition(Base): "ScoreDefinitionBreakdownItem", cascade="all, delete-orphan", order_by="ScoreDefinitionBreakdownItem.impact.desc()", - lazy="joined", + lazy="select", ) history: Iterable[ScoreDefinitionResultHistoryEntry] = relationship( "ScoreDefinitionResultHistoryEntry", @@ -136,16 +136,50 @@ def all( project_code: str | None = None, name_filter: str | None = None, sorted_by: str | None = "name", + last_history_items: int = 0, ) -> Iterable[Self]: definitions = [] db_session = get_current_session() - query = select(ScoreDefinition) + query = select(ScoreDefinition).options() if name_filter: query = 
query.where(ScoreDefinition.name.ilike(f"%{name_filter}%")) if project_code: query = query.where(ScoreDefinition.project_code == project_code) + query = query.order_by(text(sorted_by)) definitions = db_session.scalars(query).unique().all() + definitions_map = {} + + if last_history_items > 0: + for definition in definitions: + definitions_map[str(definition.id)] = definition + db_session.expunge(definition) + attributes.set_committed_value(definition, "history", []) + + HistoryEntry = aliased(ScoreDefinitionResultHistoryEntry) + history_subquery = select( + HistoryEntry.definition_id, + HistoryEntry.category, + HistoryEntry.score, + HistoryEntry.last_run_time, + func.row_number().over( + partition_by=HistoryEntry.definition_id, + order_by=HistoryEntry.last_run_time.desc(), + ) + .label("rn"), + ).subquery() + history_query = select(history_subquery).where(history_subquery.c.rn <= last_history_items) + + history_entries = db_session.execute(history_query).unique().all() + for entry in history_entries: + if (definition := definitions_map.get(str(entry.definition_id))): + definition.history.append(ScoreDefinitionResultHistoryEntry( + definition_id=entry.definition_id, + category=entry.category, + score=entry.score, + last_run_time=entry.last_run_time, + )) + return definitions def save(self) -> None: @@ -161,6 +195,23 @@ def delete(self) -> None: db_session.delete(self) db_session.commit() + def clear_results(self) -> None: + db_session = get_current_session() + + delete_results_query = delete(ScoreDefinitionResult).where( + ScoreDefinitionResult.definition_id == self.id + ) + delete_breakdown_query = delete(ScoreDefinitionBreakdownItem).where( + ScoreDefinitionBreakdownItem.definition_id == self.id + ) + + db_session.execute(delete_results_query) + db_session.execute(delete_breakdown_query) + db_session.flush() + + self.results = [] + self.breakdown = [] + def as_score_card(self) -> ScoreCard: """ Executes and combines two raw queries to build a fresh score @@ -223,7 +274,7 @@ def as_score_card(self) -> ScoreCard: "definition": self, } - def as_cached_score_card(self) -> ScoreCard: + def as_cached_score_card(self, include_definition: bool = False) -> ScoreCard: """Reads the cached values to build a scorecard""" root_keys: list[str] = ["score", "profiling_score", "testing_score", "cde_score"] score_card: ScoreCard = { @@ -232,7 +283,7 @@ def as_cached_score_card(self) -> ScoreCard: "name": self.name, "categories": [], "history": [], - "definition": self, + "definition": self if include_definition else None, } for result in sorted(self.results, key=lambda r: r.category): diff --git a/testgen/ui/queries/scoring_queries.py b/testgen/ui/queries/scoring_queries.py index 38b7387..64ecbf7 100644 --- a/testgen/ui/queries/scoring_queries.py +++ b/testgen/ui/queries/scoring_queries.py @@ -8,10 +8,11 @@ @st.cache_data(show_spinner="Loading data :gray[:small[(This might take a few minutes)]] ...") def get_all_score_cards(project_code: str) -> list["ScoreCard"]: - return [ + results = [ definition.as_cached_score_card() - for definition in ScoreDefinition.all(project_code=project_code) + for definition in ScoreDefinition.all(project_code=project_code, last_history_items=50) ] + return results def get_score_card_issue_reports(selected_issues: list["SelectedIssue"]) -> list[dict]: From 056024ac777a7ecb031d0cbaa365ba4289bde025 Mon Sep 17 00:00:00 2001 From: Luis Date: Tue, 28 Oct 2025 16:51:48 -0400 Subject: [PATCH 07/28] fix(scores): profiling rollup queries retrieval --- 
testgen/commands/queries/rollup_scores_query.py | 12 +++++++----- testgen/commands/run_rollup_scores.py | 6 +----- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/testgen/commands/queries/rollup_scores_query.py b/testgen/commands/queries/rollup_scores_query.py index 7255ec6..38f03cb 100644 --- a/testgen/commands/queries/rollup_scores_query.py +++ b/testgen/commands/queries/rollup_scores_query.py @@ -6,27 +6,29 @@ class RollupScoresSQL: run_id: str - table_group_id: str + table_group_id: str | None def __init__(self, run_id: str, table_group_id: str | UUID | None = None): self.run_id = run_id - self.table_group_id = str(table_group_id) + self.table_group_id = str(table_group_id) if table_group_id is not None else None def _get_query(self, template_file_name: str, sub_directory: str | None = "rollup_scores") -> tuple[str, dict]: query = read_template_sql_file(template_file_name, sub_directory) params = { "RUN_ID": self.run_id, - "TABLE_GROUPS_ID": self.table_group_id or "" + "TABLE_GROUPS_ID": self.table_group_id or "", } query = replace_params(query, params) return query, params def rollup_profiling_scores(self) -> list[tuple[str, dict]]: # Runs on App database - return [ + queries = [ self._get_query("rollup_scores_profile_run.sql"), - self._get_query("rollup_scores_profile_table_group.sql"), ] + if self.table_group_id: + queries.append(self._get_query("rollup_scores_profile_table_group.sql")) + return queries def GetRollupScoresTestRunQuery(self) -> tuple[str, dict]: # Runs on App database diff --git a/testgen/commands/run_rollup_scores.py b/testgen/commands/run_rollup_scores.py index 707e50f..45b0393 100644 --- a/testgen/commands/run_rollup_scores.py +++ b/testgen/commands/run_rollup_scores.py @@ -11,12 +11,8 @@ def run_profile_rollup_scoring_queries(project_code: str, run_id: str, table_gro LOG.info("CurrentStep: Initializing Profiling Scores Rollup") sql_generator = RollupScoresSQL(run_id, table_group_id) - queries = [sql_generator.GetRollupScoresProfileRunQuery()] - if table_group_id: - queries.append(sql_generator.GetRollupScoresProfileTableGroupQuery()) - LOG.info("CurrentStep: Rolling up profiling scores") - execute_db_queries(queries) + execute_db_queries(sql_generator.rollup_profiling_scores()) run_refresh_score_cards_results(project_code=project_code) From da60a0da316969167cd280fae37367203f59e6d0 Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Mon, 3 Nov 2025 16:37:04 -0500 Subject: [PATCH 08/28] feat(sampling): Apply sampling to subqueries --- testgen/commands/queries/profiling_query.py | 9 +++----- .../profiling/project_profiling_query.yaml | 5 +--- .../profiling/project_profiling_query.yaml | 23 +++++++++++-------- .../project_secondary_profiling_query.sql | 17 ++++++++------ .../profiling/project_profiling_query.yaml | 23 +++++++++++-------- .../project_secondary_profiling_query.sql | 17 ++++++++------ .../profiling/project_profiling_query.yaml | 23 +++++++++++-------- .../project_secondary_profiling_query.sql | 15 +++++++----- .../profiling/project_profiling_query.yaml | 23 +++++++++++-------- .../project_secondary_profiling_query.sql | 15 +++++++----- .../profiling/project_profiling_query.yaml | 23 +++++++++++-------- .../project_secondary_profiling_query.sql | 15 +++++++----- .../profiling/project_profiling_query.yaml | 23 +++++++++++-------- .../project_secondary_profiling_query.sql | 17 +++++++++----- .../profiling/project_profiling_query.yaml | 23 +++++++++++-------- 15 files changed, 160 insertions(+), 111 deletions(-) diff --git 
a/testgen/commands/queries/profiling_query.py b/testgen/commands/queries/profiling_query.py index 3d8c0e6..d215ddd 100644 --- a/testgen/commands/queries/profiling_query.py +++ b/testgen/commands/queries/profiling_query.py @@ -147,7 +147,7 @@ def _process_conditionals(self, query: str, extra_params: dict | None = None) -> raise ValueError("Template conditional misused") return "".join(updated_query) - + def _get_profiling_template(self) -> dict: if not self._profiling_template: self._profiling_template = read_template_yaml_file( @@ -250,22 +250,19 @@ def run_column_profiling(self, column_chars: ColumnChars, table_sampling: TableS query += template["12_B" if general_type == "B" else "12_else"] query += template["14_A" if general_type == "A" else "14_else"] query += template["16_all"] - query += template["98_sampling" if table_sampling else "98_else"] + query += template["98_all"] if general_type == "N": query += template["99_N_sampling" if table_sampling else "99_N"] else: query += template["99_else"] - if table_sampling: - query += template["100_sampling"] - params = self._get_params(column_chars, table_sampling) query = replace_params(query, params) query = replace_templated_functions(query, self.flavor) return query, params - + def get_profiling_errors(self, column_errors: list[tuple[ColumnChars, str]]) -> list[list[str | UUID | int]]: return [ [ diff --git a/testgen/template/flavors/bigquery/profiling/project_profiling_query.yaml b/testgen/template/flavors/bigquery/profiling/project_profiling_query.yaml index 03c7a4a..0a9c635 100644 --- a/testgen/template/flavors/bigquery/profiling/project_profiling_query.yaml +++ b/testgen/template/flavors/bigquery/profiling/project_profiling_query.yaml @@ -238,8 +238,7 @@ 16_all: " '{PROFILE_RUN_ID}' as profile_run_id " -98_sampling: ' FROM target_table' -98_else: ' FROM target_table' +98_all: ' FROM target_table' 99_N: | , @@ -256,5 +255,3 @@ APPROX_QUANTILES(`{COL_NAME}`, 100)[OFFSET(75)] AS pct_75 FROM `{DATA_SCHEMA}.{DATA_TABLE}` LIMIT 1) pctile 99_else: ; - -100_sampling: ' ' diff --git a/testgen/template/flavors/databricks/profiling/project_profiling_query.yaml b/testgen/template/flavors/databricks/profiling/project_profiling_query.yaml index 32c41ab..2fc9350 100644 --- a/testgen/template/flavors/databricks/profiling/project_profiling_query.yaml +++ b/testgen/template/flavors/databricks/profiling/project_profiling_query.yaml @@ -1,6 +1,14 @@ --- -01_sampling: "SELECT " -01_else: "SELECT " +01_sampling: | + WITH target_table AS ( + SELECT * FROM `{DATA_SCHEMA}`.`{DATA_TABLE}` TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) + ) + SELECT +01_else: | + WITH target_table AS ( + SELECT * FROM `{DATA_SCHEMA}`.`{DATA_TABLE}` + ) + SELECT 01_all: | {CONNECTION_ID} as connection_id, '{PROJECT_CODE}' as project_code, @@ -146,11 +154,11 @@ ) AS pattern, COUNT(*) AS ct, ct || ' | ' || pattern AS ct_pattern - FROM `{DATA_SCHEMA}`.`{DATA_TABLE}` + FROM target_table WHERE trim(`{COL_NAME}`) != '' AND ( - (SELECT MAX(LEN(`{COL_NAME}`)) FROM `{DATA_SCHEMA}`.`{DATA_TABLE}`) BETWEEN 3 and 25 + (SELECT MAX(LEN(`{COL_NAME}`)) FROM target_table) BETWEEN 3 and 25 ) GROUP BY pattern HAVING len(pattern) > 0 @@ -240,7 +248,7 @@ 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN' ) ) AS pattern_ct - FROM `{DATA_SCHEMA}`.`{DATA_TABLE}` + FROM target_table WHERE `{COL_NAME}` > ' ' ) AS distinct_pattern_ct, SUM(CAST(SIGN(LEN(TRIM(`{COL_NAME}`)) - LEN(REPLACE(TRIM(`{COL_NAME}`),' ',''))) AS BIGINT)) AS embedded_space_ct, AVG(CAST(LEN(TRIM(`{COL_NAME}`)) - 
LEN(REPLACE(TRIM(`{COL_NAME}`),' ','')) AS FLOAT)) AS avg_embedded_spaces, @@ -250,8 +258,7 @@ 16_all: " '{PROFILE_RUN_ID}' as profile_run_id" -98_sampling: ' FROM `{DATA_SCHEMA}`.`{DATA_TABLE}` TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT)' -98_else: ' FROM `{DATA_SCHEMA}`.`{DATA_TABLE}`' +98_all: ' FROM target_table' 99_N: | , (SELECT @@ -266,5 +273,3 @@ PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY `{COL_NAME}`) OVER () AS pct_75 FROM `{DATA_SCHEMA}`.`{DATA_TABLE}` TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) LIMIT 1 ) pctile 99_else: ' ' - -100_sampling: ' ' diff --git a/testgen/template/flavors/databricks/profiling/project_secondary_profiling_query.sql b/testgen/template/flavors/databricks/profiling/project_secondary_profiling_query.sql index 7def8c7..c3bb409 100644 --- a/testgen/template/flavors/databricks/profiling/project_secondary_profiling_query.sql +++ b/testgen/template/flavors/databricks/profiling/project_secondary_profiling_query.sql @@ -1,13 +1,17 @@ --- Get Freqs for selected columns -WITH ranked_vals +WITH target_table AS - (SELECT `{COL_NAME}`, - COUNT(*) AS ct, - ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC) AS rn + (SELECT * FROM `{DATA_SCHEMA}`.`{DATA_TABLE}` -- TG-IF do_sample_bool TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) -- TG-ENDIF + ), +-- Get Freqs for selected columns +ranked_vals +AS (SELECT `{COL_NAME}`, + COUNT(*) AS ct, + ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC) AS rn + FROM target_table WHERE `{COL_NAME}` > ' ' GROUP BY `{COL_NAME}` ), @@ -32,7 +36,6 @@ SELECT '{PROJECT_CODE}' as project_code, (left, right) -> CASE WHEN CAST(SPLIT(left, '\\|')[0] AS INT) < CAST(SPLIT(right, '\\|')[0] AS INT) THEN -1 ELSE 1 END )), '^#^', '\n') AS top_freq_values, (SELECT MD5(CONCAT_WS('|', ARRAY_SORT(COLLECT_LIST(NULLIF(dist_col_name,''))))) as dvh - FROM (SELECT DISTINCT `{COL_NAME}` as dist_col_name - FROM `{DATA_SCHEMA}`.`{DATA_TABLE}`) a + FROM (SELECT DISTINCT `{COL_NAME}` as dist_col_name FROM target_table) a ) as distinct_value_hash FROM consol_vals; diff --git a/testgen/template/flavors/mssql/profiling/project_profiling_query.yaml b/testgen/template/flavors/mssql/profiling/project_profiling_query.yaml index 58e04ca..77ec98c 100644 --- a/testgen/template/flavors/mssql/profiling/project_profiling_query.yaml +++ b/testgen/template/flavors/mssql/profiling/project_profiling_query.yaml @@ -1,6 +1,14 @@ --- -01_sampling: "SELECT " -01_else: "SELECT " +01_sampling: | + WITH target_table AS ( + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) WITH (NOLOCK) + ) + SELECT +01_else: | + WITH target_table AS ( + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WITH (NOLOCK) + ) + SELECT 01_all: | {CONNECTION_ID} as connection_id, '{PROJECT_CODE}' as project_code, @@ -148,9 +156,9 @@ 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN' ) AS pattern - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WITH (NOLOCK) + FROM target_table WHERE "{COL_NAME}" > ' ' AND ((SELECT MAX(LEN("{COL_NAME}")) - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WITH (NOLOCK)) BETWEEN 3 and {MAX_PATTERN_LENGTH})) p + FROM target_table) BETWEEN 3 and {MAX_PATTERN_LENGTH})) p GROUP BY pattern HAVING pattern > ' ' ORDER BY COUNT(*) DESC @@ -238,7 +246,7 @@ 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN' ) ) AS pattern_ct - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WITH (NOLOCK) + FROM target_table WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct, SUM(CAST(SIGN(LEN(RTRIM(LTRIM("{COL_NAME}"))) - 
LEN(REPLACE(RTRIM(LTRIM("{COL_NAME}")),' ',''))) AS BIGINT)) AS embedded_space_ct, AVG(CAST(LEN(RTRIM(LTRIM("{COL_NAME}"))) - LEN(REPLACE(RTRIM(LTRIM("{COL_NAME}")),' ','')) AS FLOAT)) AS avg_embedded_spaces, @@ -248,8 +256,7 @@ 16_all: " '{PROFILE_RUN_ID}' as profile_run_id" -98_sampling: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) WITH (NOLOCK)' -98_else: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WITH (NOLOCK)' +98_all: ' FROM target_table ' 99_N: | , (SELECT TOP 1 @@ -264,5 +271,3 @@ PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75 FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) WITH (NOLOCK)) pctile 99_else: ' ' - -100_sampling: ' ' diff --git a/testgen/template/flavors/mssql/profiling/project_secondary_profiling_query.sql b/testgen/template/flavors/mssql/profiling/project_secondary_profiling_query.sql index 5450560..4a52c3d 100644 --- a/testgen/template/flavors/mssql/profiling/project_secondary_profiling_query.sql +++ b/testgen/template/flavors/mssql/profiling/project_secondary_profiling_query.sql @@ -1,13 +1,17 @@ +WITH target_table AS ( + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" +-- TG-IF do_sample_bool + TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) +-- TG-ENDIF + WITH (NOLOCK) + ), -- Get Freqs for selected columns -WITH ranked_vals +ranked_vals AS (SELECT "{COL_NAME}", COUNT(*) AS ct, ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC) AS rn - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" --- TG-IF do_sample_bool - TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) --- TG-ENDIF + FROM target_table WHERE "{COL_NAME}" > ' ' GROUP BY "{COL_NAME}" ), @@ -31,8 +35,7 @@ SELECT '{PROJECT_CODE}' as project_code, REPLACE(STRING_AGG(CONVERT(NVARCHAR(max), val), '^#^') WITHIN GROUP (ORDER BY min_rn), '^#^', CHAR(10)) AS top_freq_values, (SELECT CONVERT(VARCHAR(40), HASHBYTES('MD5', STRING_AGG( NULLIF(dist_col_name,''), '|') WITHIN GROUP (ORDER BY dist_col_name)), 2) as dvh - FROM (SELECT DISTINCT "{COL_NAME}" as dist_col_name - FROM "{DATA_SCHEMA}"."{DATA_TABLE}") a + FROM (SELECT DISTINCT "{COL_NAME}" as dist_col_name FROM target_table) a ) as distinct_value_hash FROM consol_vals; diff --git a/testgen/template/flavors/postgresql/profiling/project_profiling_query.yaml b/testgen/template/flavors/postgresql/profiling/project_profiling_query.yaml index a9e65d0..67156d7 100644 --- a/testgen/template/flavors/postgresql/profiling/project_profiling_query.yaml +++ b/testgen/template/flavors/postgresql/profiling/project_profiling_query.yaml @@ -1,6 +1,14 @@ --- -01_sampling: "SELECT " -01_else: "SELECT " +01_sampling: | + WITH target_table AS ( + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE BERNOULLI ({SAMPLE_PERCENT_CALC}) REPEATABLE (64) + ) + SELECT +01_else: | + WITH target_table AS ( + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + ) + SELECT 01_all: | {CONNECTION_ID} as connection_id, '{PROJECT_CODE}' as project_code, @@ -124,9 +132,9 @@ "{COL_NAME}", '[a-z]', 'a', 'g'), '[A-Z]', 'A', 'g'), '[0-9]', 'N', 'g') AS pattern - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + FROM target_table WHERE "{COL_NAME}" > ' ' AND (SELECT MAX(LENGTH("{COL_NAME}")) - FROM "{DATA_SCHEMA}"."{DATA_TABLE}") BETWEEN 3 and {MAX_PATTERN_LENGTH}) p + FROM target_table) BETWEEN 3 and {MAX_PATTERN_LENGTH}) p GROUP BY pattern HAVING pattern > ' ' ORDER BY COUNT(*) DESC @@ -215,7 +223,7 @@ '[A-Z]', 'A', 'g'), '[0-9]', 'N', 'g') ) AS pattern_ct - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + FROM target_table WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct, 
SUM(SIGN(LENGTH(TRIM("{COL_NAME}")) - LENGTH(REGEXP_REPLACE(TRIM("{COL_NAME}"), ' ', '', 'g')))::BIGINT) AS embedded_space_ct, AVG(LENGTH(TRIM("{COL_NAME}")) - LENGTH(REGEXP_REPLACE(TRIM("{COL_NAME}"), ' ', '', 'g'))::FLOAT) AS avg_embedded_spaces, @@ -225,8 +233,7 @@ 16_all: " '{PROFILE_RUN_ID}' as profile_run_id" -98_sampling: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE BERNOULLI ({SAMPLE_PERCENT_CALC}) REPEATABLE (64)' -98_else: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" ' +98_all: ' FROM target_table ' 99_N: | , (SELECT @@ -241,5 +248,3 @@ PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}"::NUMERIC) AS pct_75 FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE BERNOULLI ({SAMPLE_PERCENT_CALC}) REPEATABLE (64) LIMIT 1) pctile 99_else: ' ' - -100_sampling: ' ' diff --git a/testgen/template/flavors/postgresql/profiling/project_secondary_profiling_query.sql b/testgen/template/flavors/postgresql/profiling/project_secondary_profiling_query.sql index b9b0c3d..86db6e4 100644 --- a/testgen/template/flavors/postgresql/profiling/project_secondary_profiling_query.sql +++ b/testgen/template/flavors/postgresql/profiling/project_secondary_profiling_query.sql @@ -1,12 +1,15 @@ -- Get Freqs for selected columns -WITH ranked_vals AS ( - SELECT "{COL_NAME}", - COUNT(*) AS ct, - ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC, "{COL_NAME}") AS rn - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" +WITH target_table AS ( + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" -- TG-IF do_sample_bool TABLESAMPLE BERNOULLI ({SAMPLE_PERCENT_CALC}) REPEATABLE (64) -- TG-ENDIF +), +ranked_vals AS ( + SELECT "{COL_NAME}", + COUNT(*) AS ct, + ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC, "{COL_NAME}") AS rn + FROM target_table WHERE "{COL_NAME}" > ' ' GROUP BY "{COL_NAME}" ), @@ -27,5 +30,5 @@ SELECT '{PROJECT_CODE}' as project_code, '{COL_NAME}' as column_name, REPLACE(STRING_AGG(val, '^#^' ORDER BY min_rn), '^#^', CHR(10)) AS top_freq_values, ( SELECT MD5(STRING_AGG(DISTINCT "{COL_NAME}", '|' ORDER BY "{COL_NAME}")) as dvh - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" ) as distinct_value_hash + FROM target_table ) as distinct_value_hash FROM consol_vals; diff --git a/testgen/template/flavors/redshift/profiling/project_profiling_query.yaml b/testgen/template/flavors/redshift/profiling/project_profiling_query.yaml index 7cbbfd4..1055ecd 100644 --- a/testgen/template/flavors/redshift/profiling/project_profiling_query.yaml +++ b/testgen/template/flavors/redshift/profiling/project_profiling_query.yaml @@ -1,6 +1,14 @@ --- -01_sampling: "SELECT " -01_else: "SELECT " +01_sampling: | + WITH target_table AS ( + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WHERE RAND() <= 1.0 / {PROFILE_SAMPLE_RATIO} + ) + SELECT +01_else: | + WITH target_table AS ( + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + ) + SELECT 01_all: | {CONNECTION_ID} as connection_id, '{PROJECT_CODE}' as project_code, @@ -103,9 +111,9 @@ "{COL_NAME}", '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') AS pattern - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + FROM target_table WHERE "{COL_NAME}" > ' ' AND (SELECT MAX(LEN("{COL_NAME}")) - FROM "{DATA_SCHEMA}"."{DATA_TABLE}") BETWEEN 3 and {MAX_PATTERN_LENGTH}) p + FROM target_table) BETWEEN 3 and {MAX_PATTERN_LENGTH}) p GROUP BY pattern HAVING pattern > ' ' ORDER BY COUNT(*) DESC) as ps) AS top_patterns, @@ -169,7 +177,7 @@ '[A-Z]', 'A'), '[0-9]', 'N') ) AS pattern_ct - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + FROM target_table WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct, SUM(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"), ' '))::BIGINT) AS embedded_space_ct, 
AVG(REGEXP_COUNT(TRIM("{COL_NAME}"), ' ')::FLOAT) AS avg_embedded_spaces, @@ -179,8 +187,7 @@ 16_all: " '{PROFILE_RUN_ID}' as profile_run_id" -98_sampling: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" ' -98_else: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}"' +98_all: ' FROM target_table' 99_N: | , (SELECT @@ -195,5 +202,3 @@ PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75 FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile 99_else: ' ' - -100_sampling: 'WHERE RAND() <= 1.0 / {PROFILE_SAMPLE_RATIO}' diff --git a/testgen/template/flavors/redshift/profiling/project_secondary_profiling_query.sql b/testgen/template/flavors/redshift/profiling/project_secondary_profiling_query.sql index 58b8651..794275c 100644 --- a/testgen/template/flavors/redshift/profiling/project_secondary_profiling_query.sql +++ b/testgen/template/flavors/redshift/profiling/project_secondary_profiling_query.sql @@ -1,13 +1,16 @@ +WITH target_table AS ( + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" +-- TG-IF do_sample_bool + WHERE RAND() <= 1.0 / {PROFILE_SAMPLE_RATIO} +-- TG-ENDIF +), -- Get Freqs for selected columns -WITH ranked_vals AS ( +ranked_vals AS ( SELECT "{COL_NAME}", COUNT(*) AS ct, ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC, "{COL_NAME}") AS rn - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + FROM target_table WHERE "{COL_NAME}" > ' ' --- TG-IF do_sample_bool - AND RAND() <= 1.0 / {PROFILE_SAMPLE_RATIO} --- TG-ENDIF GROUP BY "{COL_NAME}" ), consol_vals AS ( @@ -28,5 +31,5 @@ SELECT '{PROJECT_CODE}' as project_code, REPLACE(LISTAGG(val, '^#^') WITHIN GROUP (ORDER BY min_rn), '^#^', CHR(10)) AS top_freq_values, ( SELECT MD5(LISTAGG(DISTINCT "{COL_NAME}", '|') WITHIN GROUP (ORDER BY "{COL_NAME}")) as dvh - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" ) as distinct_value_hash + FROM target_table ) as distinct_value_hash FROM consol_vals; diff --git a/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query.yaml b/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query.yaml index cfdbb33..0e0b640 100644 --- a/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query.yaml +++ b/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query.yaml @@ -1,6 +1,14 @@ --- -01_sampling: "SELECT " -01_else: "SELECT " +01_sampling: | + WITH target_table AS ( + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WHERE RAND() <= 1.0 / {PROFILE_SAMPLE_RATIO} + ) + SELECT +01_else: | + WITH target_table AS ( + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + ) + SELECT 01_all: | {CONNECTION_ID} as connection_id, '{PROJECT_CODE}' as project_code, @@ -103,9 +111,9 @@ "{COL_NAME}", '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') AS pattern - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + FROM target_table WHERE "{COL_NAME}" > ' ' AND (SELECT MAX(LEN("{COL_NAME}")) - FROM "{DATA_SCHEMA}"."{DATA_TABLE}") BETWEEN 3 and {MAX_PATTERN_LENGTH}) p + FROM target_table) BETWEEN 3 and {MAX_PATTERN_LENGTH}) p GROUP BY pattern HAVING pattern > ' ' ORDER BY COUNT(*) DESC) as ps) AS top_patterns, @@ -169,7 +177,7 @@ '[A-Z]', 'A'), '[0-9]', 'N') ) AS pattern_ct - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + FROM target_table WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct, SUM(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"), ' '))::BIGINT) AS embedded_space_ct, AVG(REGEXP_COUNT(TRIM("{COL_NAME}"), ' ')::FLOAT) AS avg_embedded_spaces, @@ -179,8 +187,7 @@ 16_all: " '{PROFILE_RUN_ID}' as profile_run_id" -98_sampling: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" ' -98_else: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}"' +98_all: ' FROM 
target_table' 99_N: | , (SELECT @@ -195,5 +202,3 @@ PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75 FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile 99_else: ' ' - -100_sampling: 'WHERE RAND() <= 1.0 / {PROFILE_SAMPLE_RATIO}' diff --git a/testgen/template/flavors/redshift_spectrum/profiling/project_secondary_profiling_query.sql b/testgen/template/flavors/redshift_spectrum/profiling/project_secondary_profiling_query.sql index 58b8651..794275c 100644 --- a/testgen/template/flavors/redshift_spectrum/profiling/project_secondary_profiling_query.sql +++ b/testgen/template/flavors/redshift_spectrum/profiling/project_secondary_profiling_query.sql @@ -1,13 +1,16 @@ +WITH target_table AS ( + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" +-- TG-IF do_sample_bool + WHERE RAND() <= 1.0 / {PROFILE_SAMPLE_RATIO} +-- TG-ENDIF +), -- Get Freqs for selected columns -WITH ranked_vals AS ( +ranked_vals AS ( SELECT "{COL_NAME}", COUNT(*) AS ct, ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC, "{COL_NAME}") AS rn - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + FROM target_table WHERE "{COL_NAME}" > ' ' --- TG-IF do_sample_bool - AND RAND() <= 1.0 / {PROFILE_SAMPLE_RATIO} --- TG-ENDIF GROUP BY "{COL_NAME}" ), consol_vals AS ( @@ -28,5 +31,5 @@ SELECT '{PROJECT_CODE}' as project_code, REPLACE(LISTAGG(val, '^#^') WITHIN GROUP (ORDER BY min_rn), '^#^', CHR(10)) AS top_freq_values, ( SELECT MD5(LISTAGG(DISTINCT "{COL_NAME}", '|') WITHIN GROUP (ORDER BY "{COL_NAME}")) as dvh - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" ) as distinct_value_hash + FROM target_table ) as distinct_value_hash FROM consol_vals; diff --git a/testgen/template/flavors/snowflake/profiling/project_profiling_query.yaml b/testgen/template/flavors/snowflake/profiling/project_profiling_query.yaml index 3788d28..5c04fce 100644 --- a/testgen/template/flavors/snowflake/profiling/project_profiling_query.yaml +++ b/testgen/template/flavors/snowflake/profiling/project_profiling_query.yaml @@ -1,6 +1,14 @@ --- -01_sampling: "SELECT " -01_else: "SELECT " +01_sampling: | + WITH target_table AS ( + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" SAMPLE ({SAMPLE_SIZE} rows) + ) + SELECT +01_else: | + WITH target_table AS ( + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + ) + SELECT 01_all: | {CONNECTION_ID} as connection_id, '{PROJECT_CODE}' as project_code, @@ -111,9 +119,9 @@ "{COL_NAME}"::VARCHAR, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') AS pattern - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + FROM target_table WHERE "{COL_NAME}" > ' ' AND (SELECT MAX(LEN("{COL_NAME}")) - FROM "{DATA_SCHEMA}"."{DATA_TABLE}") BETWEEN 3 and {MAX_PATTERN_LENGTH}) p + FROM target_table) BETWEEN 3 and {MAX_PATTERN_LENGTH}) p GROUP BY pattern HAVING pattern > ' ' ORDER BY COUNT(*) DESC) as ps) AS top_patterns, @@ -174,7 +182,7 @@ '[A-Z]', 'A'), '[0-9]', 'N') ) AS pattern_ct - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + FROM target_table WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct, SUM(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"::VARCHAR), ' '))::BIGINT) AS embedded_space_ct, AVG(REGEXP_COUNT(TRIM("{COL_NAME}"::VARCHAR), ' ')::FLOAT) AS avg_embedded_spaces, @@ -184,8 +192,7 @@ 16_all: " '{PROFILE_RUN_ID}' as profile_run_id " -98_sampling: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" SAMPLE ({SAMPLE_SIZE} rows)' -98_else: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}"' +98_all: ' FROM target_table ' 99_N: | , @@ -202,5 +209,3 @@ PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75 FROM "{DATA_SCHEMA}"."{DATA_TABLE}" SAMPLE ({SAMPLE_SIZE} rows) LIMIT 1 ) pctile 99_else: ; - 
-100_sampling: ' ' diff --git a/testgen/template/flavors/snowflake/profiling/project_secondary_profiling_query.sql b/testgen/template/flavors/snowflake/profiling/project_secondary_profiling_query.sql index 7b80fc7..2c4264d 100644 --- a/testgen/template/flavors/snowflake/profiling/project_secondary_profiling_query.sql +++ b/testgen/template/flavors/snowflake/profiling/project_secondary_profiling_query.sql @@ -1,12 +1,17 @@ +WITH target_table +AS ( + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" +-- TG-IF do_sample_bool + SAMPLE ({SAMPLE_SIZE} rows) +-- TG-ENDIF +), -- Get Freqs for selected columns -WITH ranked_vals AS ( +ranked_vals +AS ( SELECT "{COL_NAME}", COUNT(*) AS ct, ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC, "{COL_NAME}") AS rn - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" --- TG-IF do_sample_bool - SAMPLE ({SAMPLE_SIZE} rows) --- TG-ENDIF + FROM target_table WHERE "{COL_NAME}" > ' ' GROUP BY "{COL_NAME}" ), @@ -28,5 +33,5 @@ SELECT '{PROJECT_CODE}' as project_code, REPLACE(LISTAGG(val, '^#^') WITHIN GROUP (ORDER BY min_rn), '^#^', CHR(10)) AS top_freq_values, ( SELECT MD5(LISTAGG(DISTINCT NULLIF("{COL_NAME}", ''), '|') WITHIN GROUP (ORDER BY NULLIF("{COL_NAME}", ''))) as dvh - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" ) as distinct_value_hash + FROM target_table ) as distinct_value_hash FROM consol_vals; diff --git a/testgen/template/flavors/trino/profiling/project_profiling_query.yaml b/testgen/template/flavors/trino/profiling/project_profiling_query.yaml index 126a5cb..3346003 100644 --- a/testgen/template/flavors/trino/profiling/project_profiling_query.yaml +++ b/testgen/template/flavors/trino/profiling/project_profiling_query.yaml @@ -1,6 +1,14 @@ --- -01_sampling: "SELECT " -01_else: "SELECT " +01_sampling: | + WITH target_table AS ( + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE SYSTEM ({SAMPLE_PERCENT_CALC}) + ) + SELECT +01_else: | + WITH target_table AS ( + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + ) + SELECT 01_all: | {CONNECTION_ID} as connection_id, '{PROJECT_CODE}' as project_code, @@ -123,9 +131,9 @@ "{COL_NAME}", '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') AS pattern - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + FROM target_table WHERE "{COL_NAME}" > ' ' AND (SELECT MAX(LENGTH("{COL_NAME}")) - FROM "{DATA_SCHEMA}"."{DATA_TABLE}") BETWEEN 3 and {MAX_PATTERN_LENGTH}) p + FROM target_table) BETWEEN 3 and {MAX_PATTERN_LENGTH}) p GROUP BY pattern HAVING pattern > ' ' ORDER BY COUNT(*) DESC LIMIT 5) as ps) AS top_patterns, @@ -212,7 +220,7 @@ '[A-Z]', 'A'), '[0-9]', 'N') ) AS pattern_ct - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + FROM target_table WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct, SUM(CAST(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"), ' ')) AS BIGINT)) AS embedded_space_ct, AVG(CAST(REGEXP_COUNT(TRIM("{COL_NAME}"), ' ') AS REAL)) AS avg_embedded_spaces, @@ -222,8 +230,7 @@ 16_all: " '{PROFILE_RUN_ID}' as profile_run_id" -98_sampling: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE SYSTEM ({SAMPLE_PERCENT_CALC})' -98_else: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}"' +98_all: ' FROM target_table' 99_N: | , (SELECT @@ -238,5 +245,3 @@ APPROX_PERCENTILE("{COL_NAME}", 0.75) AS pct_75 FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE SYSTEM ({SAMPLE_PERCENT_CALC}) ) pctile 99_else: ' ' - -100_sampling: ' ' From 1d1f68f40bba9c8a23c2fbb0e0b4c94ff0f1c595 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Thu, 23 Oct 2025 23:21:09 -0400 Subject: [PATCH 09/28] feat(test-execution): add progress and error handling --- testgen/__main__.py | 53 ++- .../queries/execute_cat_tests_query.py 
| 123 ------ .../commands/queries/execute_tests_query.py | 413 ++++++++++++------ .../commands/queries/generate_tests_query.py | 5 +- testgen/commands/queries/profiling_query.py | 12 +- .../queries/refresh_data_chars_query.py | 3 +- .../commands/queries/rollup_scores_query.py | 21 +- .../test_parameter_validation_query.py | 78 ---- testgen/commands/run_execute_cat_tests.py | 148 ------- testgen/commands/run_execute_tests.py | 199 --------- testgen/commands/run_launch_db_config.py | 4 +- testgen/commands/run_profiling.py | 79 ++-- testgen/commands/run_refresh_data_chars.py | 6 +- testgen/commands/run_test_execution.py | 319 ++++++++++++++ .../commands/run_test_parameter_validation.py | 119 ----- testgen/commands/run_test_validation.py | 105 +++++ testgen/common/clean_sql.py | 23 +- testgen/common/database/database_service.py | 11 +- testgen/common/date_service.py | 49 +-- testgen/common/get_pipeline_parms.py | 22 - testgen/common/models/entity.py | 4 + testgen/common/models/test_definition.py | 12 +- testgen/common/models/test_run.py | 49 ++- testgen/common/models/test_suite.py | 14 - .../030_initialize_new_schema_structure.sql | 64 +-- .../dbsetup/060_create_standard_views.sql | 9 +- .../dbsetup/075_grant_role_rights.sql | 2 - .../dbupgrade/0158_incremental_upgrade.sql | 35 ++ .../ex_cat_build_agg_table_tests.sql | 148 ------- .../ex_cat_get_distinct_tables.sql | 11 - .../exec_cat_tests/ex_cat_results_parse.sql | 68 --- .../ex_cat_retrieve_agg_test_parms.sql | 8 - .../exec_cat_tests/ex_cat_test_query.sql | 7 - .../disable_invalid_test_definitions.sql | 6 + .../execution/ex_get_tests_non_cat.sql | 51 --- .../execution/ex_update_test_suite.sql | 13 - .../execution/get_active_test_definitions.sql | 46 ++ ...t_n.sql => update_historic_thresholds.sql} | 2 +- ...un_results.sql => update_test_results.sql} | 0 ...un_table.sql => update_test_run_stats.sql} | 18 +- .../ex_data_match_bigquery.sql | 6 +- .../ex_relative_entropy_bigquery.sql | 6 +- .../ex_table_changed_bigquery.sql | 6 +- .../ex_window_match_no_drops_bigquery.sql | 6 +- .../ex_window_match_same_bigquery.sql | 14 +- .../ex_get_project_column_list.sql | 3 - .../validate_tests/get_target_identifiers.sql | 5 + .../ex_window_match_no_drops_databricks.sql | 6 +- .../ex_window_match_same_databricks.sql | 6 +- .../ex_aggregate_match_no_drops_generic.sql | 6 +- .../ex_aggregate_match_percent_generic.sql | 6 +- .../ex_aggregate_match_range_generic.sql | 6 +- .../ex_aggregate_match_same_generic.sql | 6 +- .../ex_custom_query_generic.sql | 6 +- .../ex_data_match_2way_generic.sql | 6 +- .../ex_data_match_generic.sql | 6 +- .../exec_query_tests/ex_dupe_rows_generic.sql | 6 +- .../ex_prior_match_generic.sql | 38 -- .../ex_relative_entropy_generic.sql | 6 +- .../ex_table_changed_generic.sql | 6 +- .../ex_window_match_no_drops_generic.sql | 6 +- .../ex_window_match_same_generic.sql | 6 +- .../ex_get_project_column_list.sql | 3 - .../validate_tests/get_target_identifiers.sql | 5 + .../ex_relative_entropy_mssql.sql | 6 +- .../ex_table_changed_mssql.sql | 6 +- .../ex_window_match_no_drops_postgresql.sql | 6 +- .../ex_window_match_same_postgresql.sql | 6 +- .../ex_get_project_column_list.sql | 3 - .../validate_tests/get_target_identifiers.sql | 5 + .../template/get_entities/get_test_info.sql | 1 - .../template/get_entities/get_test_suite.sql | 2 - .../observability/get_test_results.sql | 1 - .../template/parms/parms_test_execution.sql | 14 - .../quick_start/initial_data_seeding.sql | 5 +- .../calc_prevalence_test_results.sql} | 6 +- 
.../ex_disable_tests_test_definitions.sql | 4 - .../ex_flag_tests_test_definitions.sql | 7 - .../ex_get_test_column_list_tg.sql | 98 ----- .../ex_prep_flag_tests_test_definitions.sql | 6 - .../ex_write_test_val_errors.sql | 30 -- testgen/ui/components/frontend/css/shared.css | 2 +- .../frontend/js/pages/profiling_runs.js | 2 +- .../components/frontend/js/pages/test_runs.js | 124 ++++-- testgen/ui/queries/source_data_queries.py | 12 +- testgen/ui/queries/table_group_queries.py | 4 +- testgen/ui/queries/test_result_queries.py | 11 +- testgen/ui/views/dialogs/run_tests_dialog.py | 6 +- testgen/ui/views/hygiene_issues.py | 2 +- testgen/ui/views/test_results.py | 4 +- testgen/ui/views/test_suites.py | 4 - 91 files changed, 1146 insertions(+), 1766 deletions(-) delete mode 100644 testgen/commands/queries/execute_cat_tests_query.py delete mode 100644 testgen/commands/queries/test_parameter_validation_query.py delete mode 100644 testgen/commands/run_execute_cat_tests.py delete mode 100644 testgen/commands/run_execute_tests.py create mode 100644 testgen/commands/run_test_execution.py delete mode 100644 testgen/commands/run_test_parameter_validation.py create mode 100644 testgen/commands/run_test_validation.py create mode 100644 testgen/template/dbupgrade/0158_incremental_upgrade.sql delete mode 100644 testgen/template/exec_cat_tests/ex_cat_build_agg_table_tests.sql delete mode 100644 testgen/template/exec_cat_tests/ex_cat_get_distinct_tables.sql delete mode 100644 testgen/template/exec_cat_tests/ex_cat_results_parse.sql delete mode 100644 testgen/template/exec_cat_tests/ex_cat_retrieve_agg_test_parms.sql delete mode 100644 testgen/template/exec_cat_tests/ex_cat_test_query.sql create mode 100644 testgen/template/execution/disable_invalid_test_definitions.sql delete mode 100644 testgen/template/execution/ex_get_tests_non_cat.sql delete mode 100644 testgen/template/execution/ex_update_test_suite.sql create mode 100644 testgen/template/execution/get_active_test_definitions.sql rename testgen/template/execution/{ex_update_history_threshold_last_n.sql => update_historic_thresholds.sql} (95%) rename testgen/template/execution/{ex_finalize_test_run_results.sql => update_test_results.sql} (100%) rename testgen/template/execution/{ex_update_test_record_in_testrun_table.sql => update_test_run_stats.sql} (55%) delete mode 100644 testgen/template/flavors/bigquery/validate_tests/ex_get_project_column_list.sql create mode 100644 testgen/template/flavors/bigquery/validate_tests/get_target_identifiers.sql delete mode 100644 testgen/template/flavors/generic/exec_query_tests/ex_prior_match_generic.sql delete mode 100644 testgen/template/flavors/generic/validate_tests/ex_get_project_column_list.sql create mode 100644 testgen/template/flavors/generic/validate_tests/get_target_identifiers.sql delete mode 100644 testgen/template/flavors/redshift_spectrum/validate_tests/ex_get_project_column_list.sql create mode 100644 testgen/template/flavors/redshift_spectrum/validate_tests/get_target_identifiers.sql delete mode 100644 testgen/template/parms/parms_test_execution.sql rename testgen/template/{execution/ex_calc_prevalence_test_results.sql => rollup_scores/calc_prevalence_test_results.sql} (96%) delete mode 100644 testgen/template/validate_tests/ex_disable_tests_test_definitions.sql delete mode 100644 testgen/template/validate_tests/ex_flag_tests_test_definitions.sql delete mode 100644 testgen/template/validate_tests/ex_get_test_column_list_tg.sql delete mode 100644 
testgen/template/validate_tests/ex_prep_flag_tests_test_definitions.sql delete mode 100644 testgen/template/validate_tests/ex_write_test_val_errors.sql diff --git a/testgen/__main__.py b/testgen/__main__.py index 130de69..8217727 100644 --- a/testgen/__main__.py +++ b/testgen/__main__.py @@ -4,13 +4,12 @@ import subprocess import sys from dataclasses import dataclass, field +from datetime import UTC, datetime, timedelta import click from click.core import Context -from progress.spinner import MoonSpinner from testgen import settings -from testgen.commands.run_execute_tests import run_execution_steps from testgen.commands.run_generate_tests import run_test_gen_queries from testgen.commands.run_get_entities import ( run_get_results, @@ -31,6 +30,7 @@ from testgen.commands.run_observability_exporter import run_observability_exporter from testgen.commands.run_profiling import run_profiling from testgen.commands.run_quick_start import run_quick_start, run_quick_start_increment +from testgen.commands.run_test_execution import run_test_execution from testgen.commands.run_test_metadata_exporter import run_test_metadata_exporter from testgen.commands.run_upgrade_db_config import get_schema_revision, is_db_revision_up_to_date, run_upgrade_db_config from testgen.common import ( @@ -45,6 +45,7 @@ from testgen.common.models import with_database_session from testgen.common.models.profiling_run import ProfilingRun from testgen.common.models.test_run import TestRun +from testgen.common.models.test_suite import TestSuite from testgen.scheduler import register_scheduler_job, run_scheduler from testgen.utils import plugins @@ -159,10 +160,17 @@ def run_test_generation(configuration: Configuration, table_group_id: str, test_ @register_scheduler_job @cli.command("run-tests", help="Performs tests defined for a test suite.") +@click.option( + "-t", + "--test-suite-id", + required=False, + type=click.STRING, + help="ID of the test suite to run. Use a test_suite_id shown in list-test-suites.", +) @click.option( "-pk", "--project-key", - help="The identifier for a TestGen project. Use a project_key shown in list-projects.", + help="DEPRECATED. Use --test-suite-id instead.", required=False, type=click.STRING, default=settings.PROJECT_KEY, @@ -170,17 +178,21 @@ def run_test_generation(configuration: Configuration, table_group_id: str, test_ @click.option( "-ts", "--test-suite-key", - help="The identifier for a test suite. Use a test_suite_key shown in list-test-suites.", + help="DEPRECATED. Use --test-suite-id instead.", required=False, default=settings.DEFAULT_TEST_SUITE_KEY, ) -@pass_configuration -def run_tests(configuration: Configuration, project_key: str, test_suite_key: str): - click.echo(f"run-tests for suite: {test_suite_key}") - spinner = None - if not configuration.verbose: - spinner = MoonSpinner("Processing ... 
") - message = run_execution_steps(project_key, test_suite_key, spinner=spinner) +def run_tests(test_suite_id: str | None = None, project_key: str | None = None, test_suite_key: str | None = None): + click.echo(f"run-tests for suite: {test_suite_id or test_suite_key}") + # For backward compatibility + if not test_suite_id: + test_suites = TestSuite.select_minimal_where( + TestSuite.project_code == project_key, + TestSuite.test_suite == test_suite_key, + ) + if test_suites: + test_suite_id = test_suites[0].id + message = run_test_execution(test_suite_id) click.echo("\n" + message) @@ -366,24 +378,27 @@ def quick_start( click.echo("loading initial data") run_quick_start_increment(0) - minutes_offset = -30*24*60 # 1 month ago - table_group_id="0ea85e17-acbe-47fe-8394-9970725ad37d" + now_date = datetime.now(UTC) + time_delta = timedelta(days=-30) # 1 month ago + table_group_id = "0ea85e17-acbe-47fe-8394-9970725ad37d" + test_suite_id = "9df7489d-92b3-49f9-95ca-512160d7896f" click.echo(f"run-profile with table_group_id: {table_group_id}") - message = run_profiling(table_group_id, minutes_offset=minutes_offset) + message = run_profiling(table_group_id, run_date=now_date + time_delta) click.echo("\n" + message) LOG.info(f"run-test-generation with table_group_id: {table_group_id} test_suite: {settings.DEFAULT_TEST_SUITE_KEY}") message = run_test_gen_queries(table_group_id, settings.DEFAULT_TEST_SUITE_KEY) click.echo("\n" + message) - run_execution_steps(settings.PROJECT_KEY, settings.DEFAULT_TEST_SUITE_KEY, minutes_offset=minutes_offset) + run_test_execution(test_suite_id, run_date=now_date + time_delta) - for iteration in range(1, 4): - click.echo(f"Running iteration: {iteration} / 3") - minutes_offset = -10*24*60 * (3-iteration) + total_iterations = 3 + for iteration in range(1, total_iterations + 1): + click.echo(f"Running iteration: {iteration} / {total_iterations}") + run_date = now_date + timedelta(days=-10 * (total_iterations - iteration)) # 10 day increments run_quick_start_increment(iteration) - run_execution_steps(settings.PROJECT_KEY, settings.DEFAULT_TEST_SUITE_KEY, minutes_offset=minutes_offset) + run_test_execution(test_suite_id, run_date=run_date) click.echo("Quick start has successfully finished.") diff --git a/testgen/commands/queries/execute_cat_tests_query.py b/testgen/commands/queries/execute_cat_tests_query.py deleted file mode 100644 index 7ff5347..0000000 --- a/testgen/commands/queries/execute_cat_tests_query.py +++ /dev/null @@ -1,123 +0,0 @@ -from typing import ClassVar, TypedDict - -from testgen.commands.queries.rollup_scores_query import RollupScoresSQL -from testgen.common import date_service, read_template_sql_file -from testgen.common.database.database_service import get_flavor_service, replace_params -from testgen.common.read_file import replace_templated_functions - - -class CATTestParams(TypedDict): - schema_name: str - table_name: str - cat_sequence: int - test_measures: str - test_conditions: str - - -class CCATExecutionSQL: - project_code = "" - flavor = "" - test_suite = "" - run_date = "" - test_run_id = "" - table_groups_id = "" - max_query_chars = "" - exception_message = "" - target_schema = "" - target_table = "" - cat_test_params: ClassVar[CATTestParams] = {} - - _rollup_scores_sql: RollupScoresSQL = None - - def __init__(self, strProjectCode, strTestSuiteId, strTestSuite, strSQLFlavor, max_query_chars, minutes_offset=0): - # Defaults - self.test_suite_id = strTestSuiteId - self.test_suite = strTestSuite - self.project_code = strProjectCode - 
self.flavor_service = get_flavor_service(strSQLFlavor) - self.flavor = strSQLFlavor - self.max_query_chars = max_query_chars - self.today = date_service.get_now_as_string_with_offset(minutes_offset) - self.minutes_offset = minutes_offset - - def _get_rollup_scores_sql(self) -> RollupScoresSQL: - if not self._rollup_scores_sql: - self._rollup_scores_sql = RollupScoresSQL(self.test_run_id, self.table_groups_id) - - return self._rollup_scores_sql - - def _get_query(self, template_file_name: str, sub_directory: str | None = "exec_cat_tests", no_bind: bool = False) -> tuple[str, dict | None]: - query = read_template_sql_file(template_file_name, sub_directory) - params = { - "MAX_QUERY_CHARS": self.max_query_chars, - "TEST_RUN_ID": self.test_run_id, - "PROJECT_CODE": self.project_code, - "TEST_SUITE": self.test_suite, - "TEST_SUITE_ID": self.test_suite_id, - "TABLE_GROUPS_ID": self.table_groups_id, - "SQL_FLAVOR": self.flavor, - "QUOTE": self.flavor_service.quote_character, - "VARCHAR_TYPE": self.flavor_service.varchar_type, - "CONCAT_OPERATOR": self.flavor_service.concat_operator, - "SCHEMA_NAME": self.target_schema, - "TABLE_NAME": self.target_table, - "NOW_DATE": "GETDATE()", - "START_TIME": self.today, - "NOW_TIMESTAMP": date_service.get_now_as_string_with_offset(self.minutes_offset), - "EXCEPTION_MESSAGE": self.exception_message.strip(), - **{key.upper(): value for key, value in self.cat_test_params.items()}, - # This has to be replaced at the end - "RUN_DATE": self.run_date, - } - query = replace_params(query, params) - query = replace_templated_functions(query, self.flavor) - - if no_bind: - # Adding escape character where ':' is referenced - query = query.replace(":", "\\:") - - return query, None if no_bind else params - - def GetDistinctTablesSQL(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("ex_cat_get_distinct_tables.sql") - - def GetAggregateTableTestSQL(self) -> tuple[str, None]: - # Runs on App database - return self._get_query("ex_cat_build_agg_table_tests.sql", no_bind=True) - - def GetAggregateTestParmsSQL(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("ex_cat_retrieve_agg_test_parms.sql") - - def PrepCATQuerySQL(self) -> tuple[str, None]: - # Runs on Target database - return self._get_query("ex_cat_test_query.sql", no_bind=True) - - def GetCATResultsParseSQL(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("ex_cat_results_parse.sql") - - def FinalizeTestResultsSQL(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("ex_finalize_test_run_results.sql", "execution") - - def PushTestRunStatusUpdateSQL(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("ex_update_test_record_in_testrun_table.sql", "execution") - - def FinalizeTestSuiteUpdateSQL(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("ex_update_test_suite.sql", "execution") - - def CalcPrevalenceTestResultsSQL(self) -> tuple[str, None]: - # Runs on App database - return self._get_query("ex_calc_prevalence_test_results.sql", "execution", no_bind=True) - - def TestScoringRollupRunSQL(self) -> tuple[str, dict]: - # Runs on App database - return self._get_rollup_scores_sql().GetRollupScoresTestRunQuery() - - def TestScoringRollupTableGroupSQL(self) -> tuple[str, dict]: - # Runs on App database - return self._get_rollup_scores_sql().GetRollupScoresTestTableGroupQuery() diff --git a/testgen/commands/queries/execute_tests_query.py 
b/testgen/commands/queries/execute_tests_query.py index 65679ad..b794971 100644 --- a/testgen/commands/queries/execute_tests_query.py +++ b/testgen/commands/queries/execute_tests_query.py @@ -1,19 +1,21 @@ -from typing import ClassVar, TypedDict +import dataclasses +from collections.abc import Iterable +from datetime import datetime +from typing import Literal, TypedDict +from uuid import UUID -from testgen.common import date_service, read_template_sql_file -from testgen.common.clean_sql import CleanSQL, ConcatColumnList, quote_identifiers +from testgen.common import read_template_sql_file +from testgen.common.clean_sql import concat_columns from testgen.common.database.database_service import get_flavor_service, replace_params +from testgen.common.models.connection import Connection +from testgen.common.models.table_group import TableGroup +from testgen.common.models.test_run import TestRun +from testgen.common.read_file import replace_templated_functions +TestRunType = Literal["QUERY", "CAT", "METADATA"] -class TestParams(TypedDict): - test_type: str - test_definition_id: str - test_description: str - test_action: str - schema_name: str - table_name: str - column_name: str - skip_errors: str +@dataclasses.dataclass +class InputParameters: baseline_ct: str baseline_unique_ct: str baseline_value: str @@ -35,139 +37,298 @@ class TestParams(TypedDict): match_subset_condition: str match_groupby_names: str match_having_condition: str + +@dataclasses.dataclass +class TestExecutionDef(InputParameters): + id: UUID + test_type: str + schema_name: str + table_name: str + column_name: str + skip_errors: int custom_query: str + run_type: TestRunType + test_scope: Literal["column", "referential", "table", "custom"] template_name: str + measure: str + test_operator: str + test_condition: str + # Runtime attributes + column_type: str = None + measure_expression: str = None + condition_expression: str = None + errors: list[str] = dataclasses.field(default_factory=list) +class AggregateResult(TypedDict): + query_index: int + result_measures: str + result_codes: str -class CTestExecutionSQL: - flavor = "" - run_date = "" - project_code = "" - test_suite_id = "" - test_suite = "" - test_run_id = "" - exception_message = "" - process_id = "" - test_params: ClassVar[TestParams] = {} - - _use_clean = False - - def __init__(self, strProjectCode, strFlavor, strTestSuiteId, strTestSuite, minutes_offset=0): - self.project_code = strProjectCode - self.flavor = strFlavor - self.flavor_service = get_flavor_service(strFlavor) - self.test_suite_id = strTestSuiteId - self.test_suite = strTestSuite - self.today = date_service.get_now_as_string_with_offset(minutes_offset) - self.minutes_offset = minutes_offset - - def _get_input_parameters(self): - param_keys = [ - "column_name", - "skip_errors", - "baseline_ct", - "baseline_unique_ct", - "baseline_value", - "baseline_value_ct", - "baseline_sum", - "baseline_avg", - "baseline_sd", - "lower_tolerance", - "upper_tolerance", - "subset_condition", - "groupby_names", - "having_condition", - "window_date_column", - "window_days", - "match_column_names", - "match_subset_condition", - "match_schema_name", - "match_table_name", - "match_groupby_names", - "match_having_condition", - ] - input_parameters = "; ".join( - f"{key}={self.test_params[key]}" - for key in param_keys - if key.lower() in self.test_params and self.test_params[key] not in [None, ""] - ) - return input_parameters.replace("'", "`") - def _get_query( - self, template_file_name: str, sub_directory: str | None = 
"execution", no_bind: bool = False - ) -> tuple[str, dict | None]: - query = read_template_sql_file(template_file_name, sub_directory) +class TestExecutionSQL: + + null_value = "" + test_results_table = "test_results" + result_columns = ( + "test_run_id", + "test_suite_id", + "test_time", + "test_definition_id", + "test_type", + "schema_name", + "table_name", + "column_names", + "skip_errors", + "input_parameters", + "result_code", + "result_status", + "result_message", + "result_measure", + ) + + def __init__(self, connection: Connection, table_group: TableGroup, test_run: TestRun): + self.connection = connection + self.table_group = table_group + self.test_run = test_run + self.run_date = test_run.test_starttime.strftime("%Y-%m-%d %H:%M:%S") + self.flavor = connection.sql_flavor + self.flavor_service = get_flavor_service(self.flavor) + + def _get_input_parameters(self, test_def: TestExecutionDef) -> str: + return "; ".join( + f"{field.name}={getattr(test_def, field.name)}" + for field in dataclasses.fields(InputParameters) + if getattr(test_def, field.name, None) not in [None, ""] + ).replace("'", "`") + + def _get_params(self, test_def: TestExecutionDef | None = None) -> dict: + quote = self.flavor_service.quote_character params = { - "PROJECT_CODE": self.project_code, - "TEST_SUITE_ID": self.test_suite_id, - "TEST_SUITE": self.test_suite, - "SQL_FLAVOR": self.flavor, - "QUOTE": self.flavor_service.quote_character, - "TEST_RUN_ID": self.test_run_id, - "INPUT_PARAMETERS": self._get_input_parameters(), + "TEST_SUITE_ID": self.test_run.test_suite_id, + "TEST_RUN_ID": self.test_run.id, "RUN_DATE": self.run_date, - "EXCEPTION_MESSAGE": self.exception_message, - "START_TIME": self.today, - "PROCESS_ID": self.process_id, + "SQL_FLAVOR": self.flavor, "VARCHAR_TYPE": self.flavor_service.varchar_type, - "NOW_TIMESTAMP": date_service.get_now_as_string_with_offset(self.minutes_offset), - **{key.upper(): value or "" for key, value in self.test_params.items()}, + "QUOTE": quote, } - if self.test_params: - column_name = self.test_params["column_name"] - params["COLUMN_NAME"] = quote_identifiers(column_name, self.flavor) if column_name else "" - # Shows contents without double-quotes for display and aggregate expressions - params["COLUMN_NAME_NO_QUOTES"] = column_name or "" - # Concatenates column list into single expression for relative entropy - params["CONCAT_COLUMNS"] = ConcatColumnList(column_name, "") if column_name else "" - - match_groupby_names = self.test_params["match_groupby_names"] - # Concatenates column list into single expression for relative entropy - params["CONCAT_MATCH_GROUPBY"] = ( - ConcatColumnList(match_groupby_names, "") if match_groupby_names else "" - ) + if test_def: + params.update({ + "TEST_TYPE": test_def.test_type, + "TEST_DEFINITION_ID": test_def.id, + "SCHEMA_NAME": test_def.schema_name, + "TABLE_NAME": test_def.table_name, + "COLUMN_NAME": f"{quote}{test_def.column_name or ''}{quote}", + "COLUMN_NAME_NO_QUOTES": test_def.column_name, + "CONCAT_COLUMNS": concat_columns(test_def.column_name, self.null_value) if test_def.column_name else "", + "SKIP_ERRORS": test_def.skip_errors or 0, + "BASELINE_CT": test_def.baseline_ct, + "BASELINE_UNIQUE_CT": test_def.baseline_unique_ct, + "BASELINE_VALUE": test_def.baseline_value, + "BASELINE_VALUE_CT": test_def.baseline_value_ct, + "THRESHOLD_VALUE": test_def.threshold_value, + "BASELINE_SUM": test_def.baseline_sum, + "BASELINE_AVG": test_def.baseline_avg, + "BASELINE_SD": test_def.baseline_sd, + "LOWER_TOLERANCE": 
test_def.lower_tolerance, + "UPPER_TOLERANCE": test_def.upper_tolerance, + "SUBSET_CONDITION": test_def.subset_condition or "1=1", + "GROUPBY_NAMES": test_def.groupby_names, + "HAVING_CONDITION": f"HAVING {test_def.having_condition}" if test_def.having_condition else "", + "WINDOW_DATE_COLUMN": test_def.window_date_column, + "WINDOW_DAYS": test_def.window_days or 0, + "MATCH_SCHEMA_NAME": test_def.match_schema_name, + "MATCH_TABLE_NAME": test_def.match_table_name, + "MATCH_COLUMN_NAMES": test_def.match_column_names, + "MATCH_SUBSET_CONDITION": test_def.match_subset_condition or "1=1", + "MATCH_GROUPBY_NAMES": test_def.match_groupby_names, + "CONCAT_MATCH_GROUPBY": concat_columns(test_def.match_groupby_names, self.null_value) if test_def.match_groupby_names else "", + "MATCH_HAVING_CONDITION": f"HAVING {test_def.match_having_condition}" if test_def.match_having_condition else "", + "CUSTOM_QUERY": test_def.custom_query, + "COLUMN_TYPE": test_def.column_type, + "INPUT_PARAMETERS": self._get_input_parameters(test_def), + }) + return params - subset_condition = self.test_params["subset_condition"] - params["SUBSET_DISPLAY"] = subset_condition.replace( - "'", self.flavor_service.escaped_single_quote - ) if subset_condition else "" + def _get_query( + self, + template_file_name: str, + sub_directory: str | None = "execution", + no_bind: bool = False, + extra_params: dict | None = None, + test_def: TestExecutionDef | None = None, + ) -> tuple[str, dict | None]: + query = read_template_sql_file(template_file_name, sub_directory) + params = self._get_params(test_def) + if extra_params: + params.update(extra_params) query = replace_params(query, params) if no_bind: - # Adding escape character where ':' is referenced query = query.replace(":", "\\:") return query, None if no_bind else params - - def GetTestsNonCAT(self) -> tuple[str, dict]: + + def get_active_test_definitions(self) -> tuple[dict]: # Runs on App database - query, params = self._get_query("ex_get_tests_non_cat.sql") - if self._use_clean: - query = CleanSQL(query) - return query, params - - def GetHistoricThresholdUpdate(self) -> tuple[str, dict]: - query, params = self._get_query("ex_update_history_threshold_last_n.sql") - if self._use_clean: - query = CleanSQL(query) - return query, params - - def PushTestRunStatusUpdateSQL(self) -> tuple[str, dict]: + return self._get_query("get_active_test_definitions.sql") + + def get_target_identifiers(self, schemas: Iterable[str]) -> tuple[str, dict]: + # Runs on Target database + filename = "get_target_identifiers.sql" + params = { + "DATA_SCHEMA": self.table_group.table_group_schema, + "TEST_SCHEMAS": ", ".join([f"'{item}'" for item in schemas]), + } + try: + return self._get_query(filename, f"flavors/{self.connection.sql_flavor}/validate_tests", extra_params=params) + except ModuleNotFoundError: + return self._get_query(filename, "flavors/generic/validate_tests", extra_params=params) + + def get_test_errors(self, test_defs: list[TestExecutionDef]) -> list[list[UUID | str | datetime]]: + return [ + [ + self.test_run.id, + self.test_run.test_suite_id, + self.test_run.test_starttime, + td.id, + td.test_type, + td.schema_name, + td.table_name, + td.column_name, + td.skip_errors or 0, + self._get_input_parameters(td), + None, # No result_code on errors + "Error", + ". 
".join(td.errors)[:1000], + None, # No result_measure on errors + ] for td in test_defs if td.errors + ] + + def disable_invalid_test_definitions(self) -> tuple[str, dict]: # Runs on App database - return self._get_query("ex_update_test_record_in_testrun_table.sql") - - def GetTestQuery(self) -> tuple[str, None]: + return self._get_query("disable_invalid_test_definitions.sql") + + def update_historic_thresholds(self) -> tuple[str, dict]: + # Runs on App database + return self._get_query("update_historic_thresholds.sql") + + def run_query_test(self, test_def: TestExecutionDef) -> tuple[str, dict]: # Runs on Target database - if template_name := self.test_params["template_name"]: - template_flavor = "generic" if template_name.endswith("_generic.sql") else self.flavor - query, params = self._get_query(template_name, f"flavors/{template_flavor}/exec_query_tests", no_bind=True) - # Final replace to cover parm within CUSTOM_QUERY parm - query = replace_params(query, {"DATA_SCHEMA": self.test_params["schema_name"]}) - - if self._use_clean: - query = CleanSQL(query) - return query, params + folder = "generic" if test_def.template_name.endswith("_generic.sql") else self.flavor + return self._get_query( + test_def.template_name, + f"flavors/{folder}/exec_query_tests", + no_bind=True, + # Final replace in CUSTOM_QUERY + extra_params={"DATA_SCHEMA": test_def.schema_name}, + test_def=test_def, + ) + + def aggregate_cat_tests( + self, + test_defs: list[TestExecutionDef], + single: bool = False, + ) -> tuple[list[tuple[str, None]], list[list[TestExecutionDef]]]: + varchar_type = self.flavor_service.varchar_type + concat_operator = self.flavor_service.concat_operator + quote = self.flavor_service.quote_character + + for td in test_defs: + # Don't recalculate expressions if it was already done before + if not td.measure_expression or not td.condition_expression: + params = self._get_params(td) + + measure = replace_params(td.measure, params) + measure = replace_templated_functions(measure, self.flavor) + td.measure_expression = f"COALESCE(CAST({measure} AS {varchar_type}) {concat_operator} '|', '{self.null_value}|')" + + condition = replace_params(f"{td.measure}{td.test_operator}{td.test_condition}", params) + condition = replace_templated_functions(condition, self.flavor) + td.condition_expression = f"CASE WHEN {condition} THEN '0,' ELSE '1,' END" + + aggregate_queries: list[tuple[str, None]] = [] + aggregate_test_defs: list[list[TestExecutionDef]] = [] + + def add_query(test_defs: list[TestExecutionDef]) -> str: + query = ( + f"SELECT {len(aggregate_queries)} AS query_index, " + f"{concat_operator.join([td.measure_expression for td in test_defs])} AS result_measures, " + f"{concat_operator.join([td.condition_expression for td in test_defs])} AS result_codes " + f"FROM {quote}{test_defs[0].schema_name}{quote}.{quote}{test_defs[0].table_name}{quote}" + ) + query = query.replace(":", "\\:") + + aggregate_queries.append((query, None)) + aggregate_test_defs.append(test_defs) + + if single: + for td in test_defs: + # Add separate query for each test + add_query([td]) else: - raise ValueError(f"No query template assigned to test_type {self.test_params["test_type"]}") + test_defs_by_table: dict[tuple[str, str], list[TestExecutionDef]] = {} + for td in test_defs: + table = (td.schema_name, td.table_name) + if not test_defs_by_table.get(table): + test_defs_by_table[table] = [] + test_defs_by_table[table].append(td) + + max_query_chars = self.connection.max_query_chars - 400 + for test_defs in 
test_defs_by_table.values(): + # Add new query for each table + current_chars = 0 + current_test_defs = [] + + for td in test_defs: + current_chars += len(td.measure_expression) + len(td.condition_expression) + 2 * len(concat_operator) + # Add new query if current query will become bigger than character limit + if current_chars > max_query_chars: + add_query(current_test_defs) + current_chars = 0 + current_test_defs = [] + current_test_defs.append(td) + + if current_test_defs: + add_query(current_test_defs) + + return aggregate_queries, aggregate_test_defs + + def get_cat_test_results( + self, + aggregate_results: list[AggregateResult], + aggregate_test_defs: list[list[TestExecutionDef]], + ) -> list[list[UUID | str | datetime | int | None]]: + test_results: list[list[TestExecutionDef]] = [] + for result in aggregate_results: + test_defs = aggregate_test_defs[result["query_index"]] + result_measures = result["result_measures"].split("|") + result_codes = result["result_codes"].split(",") + + for index, td in enumerate(test_defs): + test_results.append([ + self.test_run.id, + self.test_run.test_suite_id, + self.test_run.test_starttime, + td.id, + td.test_type, + td.schema_name, + td.table_name, + td.column_name, + td.skip_errors or 0, + self._get_input_parameters(td), + result_codes[index], + None, # result_status will be calculated later + None, # No result_message + result_measures[index] if result_measures[index] != self.null_value else None, + ]) + + return test_results + + def update_test_results(self) -> list[tuple[str, dict]]: + # Runs on App database + return [ + self._get_query("update_test_results.sql"), + self._get_query("update_test_run_stats.sql"), + ] diff --git a/testgen/commands/queries/generate_tests_query.py b/testgen/commands/queries/generate_tests_query.py index bf23b7b..cece2d3 100644 --- a/testgen/commands/queries/generate_tests_query.py +++ b/testgen/commands/queries/generate_tests_query.py @@ -1,7 +1,8 @@ import logging +from datetime import UTC, datetime from typing import ClassVar, TypedDict -from testgen.common import CleanSQL, date_service, read_template_sql_file +from testgen.common import CleanSQL, read_template_sql_file from testgen.common.database.database_service import get_flavor_service, replace_params from testgen.common.read_file import get_template_files @@ -33,7 +34,7 @@ def __init__(self, flavor): self.sql_flavor = flavor self.flavor_service = get_flavor_service(flavor) - today = date_service.get_now_as_string() + today = datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S") self.run_date = today self.as_of_date = today diff --git a/testgen/commands/queries/profiling_query.py b/testgen/commands/queries/profiling_query.py index d215ddd..a5fd7ba 100644 --- a/testgen/commands/queries/profiling_query.py +++ b/testgen/commands/queries/profiling_query.py @@ -54,18 +54,12 @@ class ProfilingSQL: max_pattern_length = 25 max_error_length = 2000 - def __init__( - self, - connection: Connection, - table_group: TableGroup, - profiling_run: ProfilingRun, - minutes_offset: int = 0, - ): + def __init__(self, connection: Connection, table_group: TableGroup, profiling_run: ProfilingRun): self.connection = connection self.table_group = table_group self.profiling_run = profiling_run + self.run_date = profiling_run.profiling_starttime.strftime("%Y-%m-%d %H:%M:%S") self.flavor = connection.sql_flavor - self.minutes_offset = minutes_offset self._profiling_template: dict = None def _get_params(self, column_chars: ColumnChars | None = None, table_sampling: TableSampling | None = 
None) -> dict: @@ -74,7 +68,7 @@ def _get_params(self, column_chars: ColumnChars | None = None, table_sampling: T "CONNECTION_ID": self.connection.connection_id, "TABLE_GROUPS_ID": self.table_group.id, "PROFILE_RUN_ID": self.profiling_run.id, - "RUN_DATE": self.profiling_run.profiling_starttime, + "RUN_DATE": self.run_date, "SQL_FLAVOR": self.flavor, "DATA_SCHEMA": self.table_group.table_group_schema, "PROFILE_ID_COLUMN_MASK": self.table_group.profile_id_column_mask, diff --git a/testgen/commands/queries/refresh_data_chars_query.py b/testgen/commands/queries/refresh_data_chars_query.py index 325c61e..9ef0250 100644 --- a/testgen/commands/queries/refresh_data_chars_query.py +++ b/testgen/commands/queries/refresh_data_chars_query.py @@ -1,5 +1,6 @@ import dataclasses from collections.abc import Iterable +from datetime import datetime from testgen.common import read_template_sql_file from testgen.common.database.database_service import get_flavor_service, replace_params @@ -127,7 +128,7 @@ def verify_access(self, table_name: str) -> tuple[str, None]: ) return (query, None) - def get_staging_data_chars(self, data_chars: list[ColumnChars], run_date: str) -> list[list[str | bool | int]]: + def get_staging_data_chars(self, data_chars: list[ColumnChars], run_date: datetime) -> list[list[str | bool | int]]: return [ [ self.table_group.id, diff --git a/testgen/commands/queries/rollup_scores_query.py b/testgen/commands/queries/rollup_scores_query.py index 38f03cb..90d76e9 100644 --- a/testgen/commands/queries/rollup_scores_query.py +++ b/testgen/commands/queries/rollup_scores_query.py @@ -12,14 +12,19 @@ def __init__(self, run_id: str, table_group_id: str | UUID | None = None): self.run_id = run_id self.table_group_id = str(table_group_id) if table_group_id is not None else None - def _get_query(self, template_file_name: str, sub_directory: str | None = "rollup_scores") -> tuple[str, dict]: + def _get_query( + self, + template_file_name: str, + sub_directory: str | None = "rollup_scores", + no_bind: bool = False, + ) -> tuple[str, dict]: query = read_template_sql_file(template_file_name, sub_directory) params = { "RUN_ID": self.run_id, "TABLE_GROUPS_ID": self.table_group_id or "", } query = replace_params(query, params) - return query, params + return query, None if no_bind else params def rollup_profiling_scores(self) -> list[tuple[str, dict]]: # Runs on App database @@ -30,10 +35,10 @@ def rollup_profiling_scores(self) -> list[tuple[str, dict]]: queries.append(self._get_query("rollup_scores_profile_table_group.sql")) return queries - def GetRollupScoresTestRunQuery(self) -> tuple[str, dict]: + def rollup_test_scores(self) -> list[tuple[str, dict]]: # Runs on App database - return self._get_query("rollup_scores_test_run.sql") - - def GetRollupScoresTestTableGroupQuery(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("rollup_scores_test_table_group.sql") + return [ + self._get_query("calc_prevalence_test_results.sql", no_bind=True), + self._get_query("rollup_scores_test_run.sql"), + self._get_query("rollup_scores_test_table_group.sql"), + ] diff --git a/testgen/commands/queries/test_parameter_validation_query.py b/testgen/commands/queries/test_parameter_validation_query.py deleted file mode 100644 index c7f40c3..0000000 --- a/testgen/commands/queries/test_parameter_validation_query.py +++ /dev/null @@ -1,78 +0,0 @@ -import typing - -from testgen.common import CleanSQL, date_service, read_template_sql_file -from testgen.common.database.database_service import 
get_flavor_service, replace_params - - -class CTestParamValidationSQL: - flavor = "" - run_date = "" - test_run_id = "" - test_schemas: str = "" - message = "" - test_ids: typing.ClassVar = [] - exception_message = "" - flag_val = "" - tg_schema = "" - - _use_clean = False - - def __init__(self, strFlavor, strTestSuiteId): - self.flavor = strFlavor - self.flavor_service = get_flavor_service(strFlavor) - self.test_suite_id = strTestSuiteId - self.today = date_service.get_now_as_string() - - def _get_query(self, template_file_name: str, sub_directory: str | None = "validate_tests") -> tuple[str, dict]: - query = read_template_sql_file(template_file_name, sub_directory) - params = { - "TEST_SUITE_ID": self.test_suite_id, - "RUN_DATE": self.run_date, - "TEST_RUN_ID": self.test_run_id, - "FLAG": self.flag_val, - "TEST_SCHEMAS": self.test_schemas, - "EXCEPTION_MESSAGE": self.exception_message, - "MESSAGE": self.message, - "CAT_TEST_IDS": tuple(self.test_ids or []), - "START_TIME": self.today, - "NOW_TIMESTAMP": date_service.get_now_as_string(), - "DATA_SCHEMA": self.tg_schema, - "QUOTE": self.flavor_service.quote_character, - } - query = replace_params(query, params) - return query, params - - def GetTestValidationColumns(self) -> tuple[str, dict]: - # Runs on App database - query, params = self._get_query("ex_get_test_column_list_tg.sql") - if self._use_clean: - query = CleanSQL(query) - return query, params - - def GetProjectTestValidationColumns(self) -> tuple[str, dict]: - # Runs on Target database - filename = "ex_get_project_column_list.sql" - try: - return self._get_query(filename, f"flavors/{self.flavor}/validate_tests") - except ModuleNotFoundError: - return self._get_query(filename, "flavors/generic/validate_tests") - - def PrepFlagTestsWithFailedValidation(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("ex_prep_flag_tests_test_definitions.sql") - - def FlagTestsWithFailedValidation(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("ex_flag_tests_test_definitions.sql") - - def DisableTestsWithFailedValidation(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("ex_disable_tests_test_definitions.sql") - - def ReportTestValidationErrors(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("ex_write_test_val_errors.sql") - - def PushTestRunStatusUpdateSQL(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("ex_update_test_record_in_testrun_table.sql", "execution") diff --git a/testgen/commands/run_execute_cat_tests.py b/testgen/commands/run_execute_cat_tests.py deleted file mode 100644 index 0f6935a..0000000 --- a/testgen/commands/run_execute_cat_tests.py +++ /dev/null @@ -1,148 +0,0 @@ -import logging -from datetime import UTC, datetime - -from progress.spinner import Spinner - -from testgen import settings -from testgen.commands.queries.execute_cat_tests_query import CCATExecutionSQL -from testgen.commands.run_refresh_score_cards_results import run_refresh_score_cards_results -from testgen.common import ( - date_service, - execute_db_queries, - fetch_dict_from_db, - fetch_from_db_threaded, - write_to_app_db, -) -from testgen.common.get_pipeline_parms import TestExecutionParams -from testgen.common.mixpanel_service import MixpanelService - -LOG = logging.getLogger("testgen") - - -def FinalizeTestRun(clsCATExecute: CCATExecutionSQL, username: str | None = None): - _, row_counts = execute_db_queries([ - clsCATExecute.FinalizeTestResultsSQL(), - 
clsCATExecute.PushTestRunStatusUpdateSQL(), - clsCATExecute.FinalizeTestSuiteUpdateSQL(), - ]) - end_time = datetime.now(UTC) - - try: - execute_db_queries([ - clsCATExecute.CalcPrevalenceTestResultsSQL(), - clsCATExecute.TestScoringRollupRunSQL(), - clsCATExecute.TestScoringRollupTableGroupSQL(), - ]) - run_refresh_score_cards_results( - project_code=clsCATExecute.project_code, - add_history_entry=True, - refresh_date=date_service.parse_now(clsCATExecute.run_date), - ) - except Exception: - LOG.exception("Error refreshing scores after test run") - pass - - MixpanelService().send_event( - "run-tests", - source=settings.ANALYTICS_JOB_SOURCE, - username=username, - sql_flavor=clsCATExecute.flavor, - test_count=row_counts[0], - run_duration=(end_time - date_service.parse_now(clsCATExecute.run_date)).total_seconds(), - scoring_duration=(datetime.now(UTC) - end_time).total_seconds(), - ) - - -def run_cat_test_queries( - params: TestExecutionParams, - test_run_id: str, - test_time: str, - project_code: str, - test_suite: str, - error_msg: str, - username: str | None = None, - minutes_offset: int = 0, - spinner: Spinner | None = None -): - has_errors = False - - LOG.info("CurrentStep: Initializing CAT Query Generator") - clsCATExecute = CCATExecutionSQL( - project_code, params["test_suite_id"], test_suite, params["sql_flavor"], params["max_query_chars"], minutes_offset - ) - clsCATExecute.test_run_id = test_run_id - clsCATExecute.run_date = test_time - clsCATExecute.table_groups_id = params["table_groups_id"] - clsCATExecute.exception_message += error_msg - - # START TEST EXECUTION - - if spinner: - spinner.next() - - lstAllResults = [] - - try: - # Retrieve distinct target tables from metadata - LOG.info("CurrentStep: Retrieving Target Tables") - # Gets distinct list of tables to be tested, to aggregate tests by table, from dk db - lstTables = fetch_dict_from_db(*clsCATExecute.GetDistinctTablesSQL()) - LOG.info("Test Tables Identified: %s", len(lstTables)) - - if lstTables: - LOG.info("CurrentStep: Aggregating CAT Tests per Table") - for dctTable in lstTables: - clsCATExecute.target_schema = dctTable["schema_name"] - clsCATExecute.target_table = dctTable["table_name"] - # Writes records of aggregated tests per table and sequence number - # (to prevent table queries from getting too large) to dk db. 
- execute_db_queries([clsCATExecute.GetAggregateTableTestSQL()]) - - LOG.info("CurrentStep: Retrieving CAT Tests to Run") - # Retrieves records of aggregated tests to run as queries from dk db - lstCATParms = fetch_dict_from_db(*clsCATExecute.GetAggregateTestParmsSQL()) - - lstCATQueries = [] - # Prepares CAT Queries and populates query list - LOG.info("CurrentStep: Preparing CAT Queries") - for dctCATQuery in lstCATParms: - clsCATExecute.target_schema = dctCATQuery["schema_name"] - clsCATExecute.target_table = dctCATQuery["table_name"] - clsCATExecute.cat_test_params = dctCATQuery - lstCATQueries.append(clsCATExecute.PrepCATQuerySQL()) - - if lstCATQueries: - LOG.info("CurrentStep: Performing CAT Tests") - lstAllResults, lstResultColumnNames, errors = fetch_from_db_threaded( - lstCATQueries, use_target_db=True, max_threads=params["max_threads"], - ) - - if lstAllResults: - LOG.info("CurrentStep: Saving CAT Results") - # Write aggregate result records to aggregate result table at dk db - write_to_app_db(lstAllResults, lstResultColumnNames, "working_agg_cat_results") - LOG.info("CurrentStep: Parsing CAT Results") - # Parses aggregate results to individual test_result records at dk db - execute_db_queries([clsCATExecute.GetCATResultsParseSQL()]) - LOG.info("Test results successfully parsed.") - if errors: - has_errors = True - cat_error_msg = f"Errors were encountered executing aggregate tests. ({len(errors)} errors occurred.) Please check log." - LOG.warning(cat_error_msg) - clsCATExecute.exception_message += cat_error_msg - else: - LOG.info("No valid tests were available to perform") - - except Exception as e: - has_errors = True - sqlsplit = e.args[0].split("[SQL", 1) - errorline = sqlsplit[0].replace("'", "''") if len(sqlsplit) > 0 else "unknown error" - clsCATExecute.exception_message += f"{type(e).__name__}: {errorline}" - raise - - else: - return has_errors - - finally: - LOG.info("Finalizing test run") - FinalizeTestRun(clsCATExecute, username) diff --git a/testgen/commands/run_execute_tests.py b/testgen/commands/run_execute_tests.py deleted file mode 100644 index a99f600..0000000 --- a/testgen/commands/run_execute_tests.py +++ /dev/null @@ -1,199 +0,0 @@ -import logging -import subprocess -import threading -import uuid - -from progress.spinner import Spinner - -import testgen.common.process_service as process_service -from testgen import settings -from testgen.commands.queries.execute_tests_query import CTestExecutionSQL -from testgen.common import ( - date_service, - execute_db_queries, - fetch_dict_from_db, - fetch_from_db_threaded, - get_test_execution_params, - set_target_db_params, - write_to_app_db, -) -from testgen.common.database.database_service import empty_cache -from testgen.common.get_pipeline_parms import TestExecutionParams -from testgen.common.models import with_database_session -from testgen.common.models.connection import Connection -from testgen.common.models.table_group import TableGroup -from testgen.ui.session import session - -from .run_execute_cat_tests import run_cat_test_queries -from .run_refresh_data_chars import run_data_chars_refresh -from .run_test_parameter_validation import run_parameter_validation_queries - -LOG = logging.getLogger("testgen") - - -def add_test_run_record(test_run_id: str, test_suite_id: str, test_time: str, process_id: int): - execute_db_queries([( - """ - INSERT INTO test_runs(id, test_suite_id, test_starttime, process_id) - (SELECT :test_run_id as id, - :test_suite_id as test_suite_id, - :test_time as test_starttime, - :process_id 
as process_id); - """, - { - "test_run_id": test_run_id, - "test_suite_id": test_suite_id, - "test_time": test_time, - "process_id": process_id, - } - )]) - - -def run_test_queries( - params: TestExecutionParams, - test_run_id: str, - test_time: str, - project_code: str, - test_suite: str, - minutes_offset: int = 0, - spinner: Spinner | None = None, -): - errors = None - error_msg = "" - - LOG.info("CurrentStep: Initializing Query Generator") - - clsExecute = CTestExecutionSQL(project_code, params["sql_flavor"], params["test_suite_id"], test_suite, minutes_offset) - clsExecute.run_date = test_time - clsExecute.test_run_id = test_run_id - clsExecute.process_id = process_service.get_current_process_id() - - try: - # Update Historic Test Thresholds - LOG.info("CurrentStep: Updating Historic Test Thresholds") - execute_db_queries([clsExecute.GetHistoricThresholdUpdate()]) - - # Retrieve non-CAT Queries - LOG.info("CurrentStep: Retrieve Non-CAT Queries") - lstTestSet = fetch_dict_from_db(*clsExecute.GetTestsNonCAT()) - - if len(lstTestSet) == 0: - LOG.debug("0 non-CAT Queries retrieved.") - - if lstTestSet: - LOG.info("CurrentStep: Preparing Non-CAT Tests") - lstTestQueries = [] - for dctTest in lstTestSet: - clsExecute.test_params = dctTest - lstTestQueries.append(clsExecute.GetTestQuery()) - if spinner: - spinner.next() - - # Execute list, returning test results - LOG.info("CurrentStep: Executing Non-CAT Test Queries") - lstTestResults, colResultNames, errors = fetch_from_db_threaded( - lstTestQueries, use_target_db=True, max_threads=params["max_threads"], - ) - - # Copy test results to DK DB - LOG.info("CurrentStep: Saving Non-CAT Test Results") - if lstTestResults: - write_to_app_db(lstTestResults, colResultNames, "test_results") - if errors: - error_msg = ( - f"Errors were encountered executing Referential Tests. ({len(errors)} errors occurred.) " - "Please check log. " - ) - LOG.warning(error_msg) - else: - LOG.info("No tests found") - - except Exception as e: - sqlsplit = e.args[0].split("[SQL", 1) - errorline = sqlsplit[0].replace("'", "''") if len(sqlsplit) > 0 else "unknown error" - clsExecute.exception_message = f"{type(e).__name__}: {errorline}" - LOG.info("Updating the test run record with exception message") - execute_db_queries([clsExecute.PushTestRunStatusUpdateSQL()]) - raise - - else: - return bool(errors), error_msg - - -def run_execution_steps_in_background(project_code, test_suite): - msg = f"Starting run_execution_steps_in_background against test suite: {test_suite}" - if settings.IS_DEBUG: - LOG.info(msg + ". 
Running in debug mode (new thread instead of new process).") - empty_cache() - username = None - if session.auth: - username = session.auth.user_display - background_thread = threading.Thread( - target=run_execution_steps, - args=(project_code, test_suite, username), - ) - background_thread.start() - else: - LOG.info(msg) - script = ["testgen", "run-tests", "--project-key", project_code, "--test-suite-key", test_suite] - subprocess.Popen(script) # NOQA S603 - - -@with_database_session -def run_execution_steps( - project_code: str, - test_suite: str, - username: str | None = None, - minutes_offset: int = 0, - spinner: Spinner | None = None, -) -> str: - # Initialize required parms for all steps - has_errors = False - error_msg = "" - - test_run_id = str(uuid.uuid4()) - test_time = date_service.get_now_as_string_with_offset(minutes_offset) - - if spinner: - spinner.next() - - LOG.info("CurrentStep: Retrieving TestExec Parameters") - test_exec_params = get_test_execution_params(project_code, test_suite) - - # Add a record in Test Run table for the new Test Run - add_test_run_record( - test_run_id, test_exec_params["test_suite_id"], test_time, process_service.get_current_process_id() - ) - - LOG.info("CurrentStep: Assigning Connection Parameters") - table_group = TableGroup.get(test_exec_params["table_groups_id"]) - connection = Connection.get(table_group.connection_id) - set_target_db_params(connection.__dict__) - test_exec_params["sql_flavor"] = connection.sql_flavor - test_exec_params["max_query_chars"] = connection.max_query_chars - test_exec_params["max_threads"] = connection.max_threads - - try: - run_data_chars_refresh(connection, table_group, test_time) - except Exception: - LOG.warning("Data Characteristics Refresh failed", exc_info=True, stack_info=True) - pass - - LOG.info("CurrentStep: Execute Step - Test Validation") - run_parameter_validation_queries(test_exec_params, test_run_id, test_time, test_suite) - - LOG.info("CurrentStep: Execute Step - Test Execution") - has_errors, error_msg = run_test_queries( - test_exec_params, test_run_id, test_time, project_code, test_suite, minutes_offset, spinner - ) - - LOG.info("CurrentStep: Execute Step - CAT Test Execution") - if run_cat_test_queries( - test_exec_params, test_run_id, test_time, project_code, test_suite, error_msg, username, minutes_offset, spinner - ): - has_errors = True - - return f""" - Test execution completed {"with errors. Check log for details." 
if has_errors else "successfully."} - Run ID: {test_run_id} - """ diff --git a/testgen/commands/run_launch_db_config.py b/testgen/commands/run_launch_db_config.py index 2533244..0d926fb 100644 --- a/testgen/commands/run_launch_db_config.py +++ b/testgen/commands/run_launch_db_config.py @@ -2,7 +2,7 @@ import os from testgen import settings -from testgen.common import create_database, date_service, execute_db_queries +from testgen.common import create_database, execute_db_queries from testgen.common.credentials import get_tg_db, get_tg_schema from testgen.common.database.database_service import get_queries_for_command from testgen.common.encrypt import EncryptText, encrypt_ui_password @@ -22,14 +22,12 @@ def _get_latest_revision_number(): def _get_params_mapping() -> dict: ui_user_encrypted_password = encrypt_ui_password(settings.PASSWORD) - now = date_service.get_now_as_string() return { "UI_USER_NAME": settings.USERNAME, "UI_USER_USERNAME": settings.USERNAME, "UI_USER_EMAIL": "", "UI_USER_ENCRYPTED_PASSWORD": ui_user_encrypted_password, "SCHEMA_NAME": get_tg_schema(), - "START_DATE": now, "PROJECT_CODE": settings.PROJECT_KEY, "CONNECTION_ID": 1, "SQL_FLAVOR": settings.PROJECT_SQL_FLAVOR, diff --git a/testgen/commands/run_profiling.py b/testgen/commands/run_profiling.py index 528c767..de217cb 100644 --- a/testgen/commands/run_profiling.py +++ b/testgen/commands/run_profiling.py @@ -1,7 +1,7 @@ import logging import subprocess import threading -from datetime import UTC, datetime +from datetime import UTC, datetime, timedelta from uuid import UUID import testgen.common.process_service as process_service @@ -9,12 +9,11 @@ from testgen.commands.queries.profiling_query import HygieneIssueType, ProfilingSQL, TableSampling from testgen.commands.queries.refresh_data_chars_query import ColumnChars from testgen.commands.queries.rollup_scores_query import RollupScoresSQL -from testgen.commands.run_execute_tests import run_execution_steps_in_background from testgen.commands.run_generate_tests import run_test_gen_queries from testgen.commands.run_refresh_data_chars import run_data_chars_refresh from testgen.commands.run_refresh_score_cards_results import run_refresh_score_cards_results +from testgen.commands.run_test_execution import run_test_execution_in_background from testgen.common import ( - date_service, execute_db_queries, fetch_dict_from_db, fetch_from_db_threaded, @@ -51,11 +50,12 @@ def run_profiling_in_background(table_group_id: str | UUID) -> None: @with_database_session -def run_profiling(table_group_id: str | UUID, username: str | None = None, minutes_offset: int = 0) -> str: +def run_profiling(table_group_id: str | UUID, username: str | None = None, run_date: datetime | None = None) -> str: if table_group_id is None: raise ValueError("Table Group ID was not specified") LOG.info(f"Starting profiling run for table group {table_group_id}") + time_delta = (run_date - datetime.now(UTC)) if run_date else timedelta() LOG.info("Retrieving connection and table group parameters") table_group = TableGroup.get(table_group_id) @@ -67,7 +67,7 @@ def run_profiling(table_group_id: str | UUID, username: str | None = None, minut project_code=table_group.project_code, connection_id=connection.connection_id, table_groups_id=table_group.id, - profiling_starttime=date_service.get_now_as_string_with_offset(minutes_offset), + profiling_starttime=datetime.now(UTC) + time_delta, process_id=process_service.get_current_process_id(), ) profiling_run.init_progress() @@ -86,7 +86,7 @@ def run_profiling(table_group_id: 
str | UUID, username: str | None = None, minut profiling_run.data_point_ct = sum(column.record_ct for column in data_chars) if data_chars: - sql_generator = ProfilingSQL(connection, table_group, profiling_run, minutes_offset=minutes_offset) + sql_generator = ProfilingSQL(connection, table_group, profiling_run) _run_column_profiling(sql_generator, data_chars) _run_frequency_analysis(sql_generator) @@ -99,43 +99,33 @@ def run_profiling(table_group_id: str | UUID, username: str | None = None, minut LOG.info("No columns were selected to profile.") except Exception as e: LOG.exception("Profiling encountered an error.") - LOG.info("Updating profiling run record") + LOG.info("Setting profiling run status to Error") profiling_run.log_message = get_exception_message(e) - profiling_run.profiling_endtime = date_service.get_now_as_string_with_offset(minutes_offset) + profiling_run.profiling_endtime = datetime.now(UTC) + time_delta profiling_run.status = "Error" profiling_run.save() else: - LOG.info("Updating profiling run record") - profiling_run.profiling_endtime = date_service.get_now_as_string_with_offset(minutes_offset) + LOG.info("Setting profiling run status to Completed") + profiling_run.profiling_endtime = datetime.now(UTC) + time_delta profiling_run.status = "Complete" profiling_run.save() - LOG.info("Rolling up profiling scores") - execute_db_queries( - RollupScoresSQL(profiling_run.id, table_group.id).rollup_profiling_scores(), - ) - run_refresh_score_cards_results( - project_code=table_group.project_code, - add_history_entry=True, - refresh_date=date_service.parse_now(profiling_run.profiling_starttime), - ) + _rollup_profiling_scores(profiling_run, table_group) if bool(table_group.monitor_test_suite_id) and not table_group.last_complete_profile_run_id: - _generate_monitor_tests(table_group.project_code, table_group_id, table_group.monitor_test_suite_id) + _generate_monitor_tests(table_group_id, table_group.monitor_test_suite_id) finally: - if not minutes_offset: - end_time = date_service.parse_now(profiling_run.profiling_endtime) - MixpanelService().send_event( - "run-profiling", - source=settings.ANALYTICS_JOB_SOURCE, - username=username, - sql_flavor=connection.sql_flavor_code, - sampling=table_group.profile_use_sampling, - table_count=profiling_run.table_ct or 0, - column_count=profiling_run.column_ct or 0, - run_duration=(end_time - date_service.parse_now(profiling_run.profiling_starttime)).total_seconds(), - scoring_duration=(datetime.now(UTC) - end_time).total_seconds(), - ) + MixpanelService().send_event( + "run-profiling", + source=settings.ANALYTICS_JOB_SOURCE, + username=username, + sql_flavor=connection.sql_flavor_code, + sampling=table_group.profile_use_sampling, + table_count=profiling_run.table_ct or 0, + column_count=profiling_run.column_ct or 0, + run_duration=(profiling_run.profiling_endtime - profiling_run.profiling_starttime).total_seconds(), + scoring_duration=(datetime.now(UTC) + time_delta - profiling_run.profiling_endtime).total_seconds(), + ) return f""" {"Profiling encountered an error. Check log for details." 
if profiling_run.status == "Error" else "Profiling completed."} @@ -148,7 +138,7 @@ def _run_column_profiling(sql_generator: ProfilingSQL, data_chars: list[ColumnCh profiling_run.set_progress("col_profiling", "Running") profiling_run.save() - LOG.info("Running column profiling queries") + LOG.info(f"Running column profiling queries: {len(data_chars)}") table_group = sql_generator.table_group sampling_params: dict[str, TableSampling] = {} sample_percent = ( @@ -235,7 +225,7 @@ def _run_frequency_analysis(sql_generator: ProfilingSQL) -> None: frequency_columns = fetch_dict_from_db(*sql_generator.get_frequency_analysis_columns()) if frequency_columns: - LOG.info("Running frequency analysis queries") + LOG.info(f"Running frequency analysis queries: {len(frequency_columns)}") def update_frequency_progress(progress: ThreadedProgress) -> None: profiling_run.set_progress( @@ -304,8 +294,23 @@ def _run_hygiene_issue_detection(sql_generator: ProfilingSQL) -> None: profiling_run.set_progress("hygiene_issues", "Completed") +def _rollup_profiling_scores(profiling_run: ProfilingRun, table_group: TableGroup) -> None: + try: + LOG.info("Rolling up profiling scores") + execute_db_queries( + RollupScoresSQL(profiling_run.id, table_group.id).rollup_profiling_scores(), + ) + run_refresh_score_cards_results( + project_code=table_group.project_code, + add_history_entry=True, + refresh_date=profiling_run.profiling_starttime, + ) + except Exception: + LOG.exception("Error rolling up profiling scores") + + @with_database_session -def _generate_monitor_tests(project_code: str, table_group_id: str, test_suite_id: str) -> None: +def _generate_monitor_tests(table_group_id: str, test_suite_id: str) -> None: try: monitor_test_suite = TestSuite.get(test_suite_id) if not monitor_test_suite: @@ -313,6 +318,6 @@ def _generate_monitor_tests(project_code: str, table_group_id: str, test_suite_i else: LOG.info("Generating monitor tests") run_test_gen_queries(table_group_id, monitor_test_suite.test_suite, "Monitor") - run_execution_steps_in_background(project_code, monitor_test_suite.test_suite) + run_test_execution_in_background(test_suite_id) except Exception: LOG.exception("Error generating monitor tests") diff --git a/testgen/commands/run_refresh_data_chars.py b/testgen/commands/run_refresh_data_chars.py index 9da28de..a972f7f 100644 --- a/testgen/commands/run_refresh_data_chars.py +++ b/testgen/commands/run_refresh_data_chars.py @@ -1,4 +1,5 @@ import logging +from datetime import datetime from testgen.commands.queries.refresh_data_chars_query import ColumnChars, RefreshDataCharsSQL from testgen.common.database.database_service import ( @@ -14,7 +15,7 @@ LOG = logging.getLogger("testgen") -def run_data_chars_refresh(connection: Connection, table_group: TableGroup, run_date: str) -> list[ColumnChars]: +def run_data_chars_refresh(connection: Connection, table_group: TableGroup, run_date: datetime) -> list[ColumnChars]: sql_generator = RefreshDataCharsSQL(connection, table_group) LOG.info("Getting DDF for table group") @@ -26,6 +27,7 @@ def run_data_chars_refresh(connection: Connection, table_group: TableGroup, run_ data_chars = [ColumnChars(**column) for column in data_chars] if data_chars: distinct_tables = {column.table_name for column in data_chars} + LOG.info(f"Tables: {len(distinct_tables)}, Columns: {len(data_chars)}") count_queries = sql_generator.get_row_counts(distinct_tables) LOG.info("Getting row counts for table group") @@ -47,7 +49,7 @@ def run_data_chars_refresh(connection: Connection, table_group: TableGroup, 
run_ return data_chars -def write_data_chars(data_chars: list[ColumnChars], sql_generator: RefreshDataCharsSQL, run_date: str) -> None: +def write_data_chars(data_chars: list[ColumnChars], sql_generator: RefreshDataCharsSQL, run_date: datetime) -> None: staging_results = sql_generator.get_staging_data_chars(data_chars, run_date) LOG.info("Writing data characteristics to staging") diff --git a/testgen/commands/run_test_execution.py b/testgen/commands/run_test_execution.py new file mode 100644 index 0000000..3c6c0af --- /dev/null +++ b/testgen/commands/run_test_execution.py @@ -0,0 +1,319 @@ +import logging +import subprocess +import threading +from datetime import UTC, datetime, timedelta +from functools import partial +from uuid import UUID + +import testgen.common.process_service as process_service +from testgen import settings +from testgen.commands.queries.execute_tests_query import TestExecutionDef, TestExecutionSQL, TestRunType +from testgen.commands.queries.rollup_scores_query import RollupScoresSQL +from testgen.commands.run_refresh_score_cards_results import run_refresh_score_cards_results +from testgen.common import ( + execute_db_queries, + fetch_dict_from_db, + fetch_from_db_threaded, + set_target_db_params, + write_to_app_db, +) +from testgen.common.database.database_service import ThreadedProgress, empty_cache +from testgen.common.mixpanel_service import MixpanelService +from testgen.common.models import with_database_session +from testgen.common.models.connection import Connection +from testgen.common.models.table_group import TableGroup +from testgen.common.models.test_run import TestRun +from testgen.common.models.test_suite import TestSuite +from testgen.ui.session import session +from testgen.utils import get_exception_message + +from .run_refresh_data_chars import run_data_chars_refresh +from .run_test_validation import run_test_validation + +LOG = logging.getLogger("testgen") + + +def run_test_execution_in_background(test_suite_id: str | UUID): + msg = f"Triggering test run for test suite {test_suite_id}" + if settings.IS_DEBUG: + LOG.info(msg + ". 
Running in debug mode (new thread instead of new process).") + empty_cache() + background_thread = threading.Thread( + target=run_test_execution, + args=(test_suite_id, session.auth.user_display if session.auth else None), + ) + background_thread.start() + else: + LOG.info(msg) + script = ["testgen", "run-tests", "--test-suite-id", str(test_suite_id)] + subprocess.Popen(script) # NOQA S603 + + +@with_database_session +def run_test_execution(test_suite_id: str | UUID, username: str | None = None, run_date: datetime | None = None) -> str: + if test_suite_id is None: + raise ValueError("Test Suite ID was not specified") + + LOG.info(f"Starting test run for test suite {test_suite_id}") + time_delta = (run_date - datetime.now(UTC)) if run_date else timedelta() + + LOG.info("Retrieving connection, table group, and test suite parameters") + test_suite = TestSuite.get(test_suite_id) + table_group = TableGroup.get(test_suite.table_groups_id) + connection = Connection.get(table_group.connection_id) + set_target_db_params(connection.__dict__) + + LOG.info("Creating test run record") + test_run = TestRun( + test_suite_id=test_suite_id, + test_starttime=datetime.now(UTC) + time_delta, + process_id=process_service.get_current_process_id(), + ) + test_run.init_progress() + test_run.set_progress("data_chars", "Running") + test_run.save() + + try: + LOG.info(f"Test run: {test_run.id}, Test suite: {test_suite.test_suite}, Table group: {table_group.table_groups_name}, Connection: {connection.connection_name}") + data_chars = run_data_chars_refresh(connection, table_group, test_run.test_starttime) + test_run.set_progress("data_chars", "Completed") + + sql_generator = TestExecutionSQL(connection, table_group, test_run) + + LOG.info("Retrieving active test definitions in test suite") + test_defs = fetch_dict_from_db(*sql_generator.get_active_test_definitions()) + test_defs = [TestExecutionDef(**item) for item in test_defs] + LOG.info(f"Active test definitions: {len(test_defs)}") + + if test_defs: + test_run.set_progress("validation", "Running") + test_run.save() + + valid_test_defs = run_test_validation(sql_generator, test_defs) + invalid_count = len(test_defs) - len(valid_test_defs) + test_run.set_progress( + "validation", + "Warning" if invalid_count else "Completed", + error=f"{invalid_count} test{'s' if invalid_count > 1 else ''} had errors" if invalid_count else None, + ) + + if valid_test_defs: + LOG.info("Updating historic test thresholds") + execute_db_queries([sql_generator.update_historic_thresholds()]) + + column_types = {(col.schema_name, col.table_name, col.column_name): col.column_type for col in data_chars} + for td in valid_test_defs: + td.column_type = column_types.get((td.schema_name, td.table_name, td.column_name)) + + run_functions = { + "QUERY": partial(_run_tests, sql_generator, "QUERY"), + "METADATA": partial(_run_tests, sql_generator, "METADATA"), + "CAT": partial(_run_cat_tests, sql_generator), + } + # Run metadata tests last so that results for other tests are available to them + # TODO: TURN ON WHEN ADDING METADATA TESTS + # for run_type in ["QUERY", "CAT", "METADATA"]: + for run_type in ["QUERY", "CAT"]: + if (run_test_defs := [td for td in valid_test_defs if td.run_type == run_type]): + run_functions[run_type](run_test_defs) + else: + LOG.info(f"No {run_type} tests to run") + else: + LOG.info("No valid tests to run") + else: + LOG.info("No active tests to run") + + LOG.info("Updating test results and test run") + test_run.save() + 
execute_db_queries(sql_generator.update_test_results())
+        test_run.refresh()
+    except Exception as e:
+        LOG.exception("Test execution encountered an error.")
+        LOG.info("Setting test run status to Error")
+        test_run.log_message = get_exception_message(e)
+        test_run.test_endtime = datetime.now(UTC) + time_delta
+        test_run.status = "Error"
+        test_run.save()
+    else:
+        LOG.info("Setting test run status to Completed")
+        test_run.test_endtime = datetime.now(UTC) + time_delta
+        test_run.status = "Complete"
+        test_run.save()
+
+        LOG.info("Updating latest run for test suite")
+        test_suite.last_complete_test_run_id = test_run.id
+        test_suite.save()
+
+        _rollup_test_scores(test_run, table_group)
+    finally:
+        MixpanelService().send_event(
+            "run-tests",
+            source=settings.ANALYTICS_JOB_SOURCE,
+            username=username,
+            sql_flavor=connection.sql_flavor_code,
+            test_count=test_run.test_ct,
+            run_duration=(test_run.test_endtime - test_run.test_starttime.replace(tzinfo=UTC)).total_seconds(),
+            scoring_duration=(datetime.now(UTC) + time_delta - test_run.test_endtime).total_seconds(),
+        )
+
+    return f"""
+    {"Test execution encountered an error. Check log for details." if test_run.status == "Error" else "Test execution completed."}
+    Run ID: {test_run.id}
+    """
+
+
+def _run_tests(sql_generator: TestExecutionSQL, run_type: TestRunType, test_defs: list[TestExecutionDef]) -> None:
+    test_run = sql_generator.test_run
+    test_run.set_progress(run_type, "Running")
+    test_run.save()
+
+    LOG.info(f"Running {run_type} tests: {len(test_defs)}")
+
+    def update_test_progress(progress: ThreadedProgress) -> None:
+        test_run.set_progress(
+            run_type,
+            "Running",
+            detail=f"{progress['processed']} of {progress['total']}",
+            error=f"{progress['errors']} test{'s' if progress['errors'] > 1 else ''} had errors"
+            if progress["errors"]
+            else None,
+        )
+        test_run.save()
+
+    test_results, result_columns, error_data = fetch_from_db_threaded(
+        [sql_generator.run_query_test(td) for td in test_defs],
+        use_target_db=run_type != "METADATA",
+        max_threads=sql_generator.connection.max_threads,
+        progress_callback=update_test_progress,
+    )
+
+    LOG.info(f"Writing {run_type} test results")
+    if test_results:
+        write_to_app_db(test_results, result_columns, sql_generator.test_results_table)
+
+    if error_count := len(error_data):
+        LOG.warning(f"Errors running {run_type} tests: {error_count}")
+        LOG.info(f"Writing {run_type} test errors")
+        for index, error in error_data.items():
+            test_defs[index].errors.append(error)
+
+        error_results = sql_generator.get_test_errors(test_defs)
+        write_to_app_db(error_results, sql_generator.result_columns, sql_generator.test_results_table)
+
+    test_run.set_progress(
+        run_type,
+        "Warning" if error_count else "Completed",
+        error=f"{error_count} test{'s' if error_count > 1 else ''} had errors"
+        if error_count
+        else None,
+    )
+
+
+def _run_cat_tests(sql_generator: TestExecutionSQL, test_defs: list[TestExecutionDef]) -> None:
+    test_run = sql_generator.test_run
+    test_run.set_progress("CAT", "Running")
+    test_run.save()
+
+    total_count = len(test_defs)
+    LOG.info(f"Aggregating CAT tests: {total_count}")
+    aggregate_queries, aggregate_test_defs = sql_generator.aggregate_cat_tests(test_defs)
+    LOG.info(f"Running aggregated CAT test queries: {len(aggregate_queries)}")
+
+    def update_aggregate_progress(progress: ThreadedProgress) -> None:
+        processed_count = sum(len(aggregate_test_defs[index]) for index in progress["indexes"])
+        test_run.set_progress(
+            "CAT",
+            "Running",
+            detail=f"{processed_count} of {total_count}",
+            error=f"{progress['errors']} {'queries' if progress['errors'] > 1 else 'query'} had errors"
+            if progress["errors"]
+            else None,
+        )
+        test_run.save()
+
+    aggregate_results, _, aggregate_errors = fetch_from_db_threaded(
+        aggregate_queries,
+        use_target_db=True,
+        max_threads=sql_generator.connection.max_threads,
+        progress_callback=update_aggregate_progress,
+    )
+
+    if aggregate_results:
+        LOG.info("Writing aggregated CAT test results")
+        test_results = sql_generator.get_cat_test_results(aggregate_results, aggregate_test_defs)
+        write_to_app_db(test_results, sql_generator.result_columns, sql_generator.test_results_table)
+
+    error_count = 0
+    if aggregate_errors:
+        LOG.warning(f"Errors running aggregated CAT test queries: {len(aggregate_errors)}")
+        error_test_defs: list[TestExecutionDef] = []
+        for index in aggregate_errors:
+            error_test_defs.extend(aggregate_test_defs[index])
+
+        single_queries, single_test_defs = sql_generator.aggregate_cat_tests(error_test_defs, single=True)
+
+        LOG.info(f"Rerunning errored CAT tests singly: {len(single_test_defs)}")
+        test_run.set_progress(
+            "CAT",
+            "Running",
+            error="Rerunning errored tests singly",
+        )
+        test_run.save()
+
+        def update_single_progress(progress: ThreadedProgress) -> None:
+            test_run.set_progress(
+                "CAT",
+                "Running",
+                error=(
+                    f"Rerunning errored tests singly: {progress['processed']} of {progress['total']}"
+                    f"\n{progress['errors']} test{'s' if progress['errors'] > 1 else ''} had errors" if progress["errors"] else ""
+                ),
+            )
+            test_run.save()
+
+        single_results, _, single_errors = fetch_from_db_threaded(
+            single_queries,
+            use_target_db=True,
+            max_threads=sql_generator.connection.max_threads,
+            progress_callback=update_single_progress,
+        )
+
+        if single_results:
+            LOG.info("Writing single CAT test results")
+            test_results = sql_generator.get_cat_test_results(single_results, single_test_defs)
+            write_to_app_db(test_results, sql_generator.result_columns, sql_generator.test_results_table)
+
+        if error_count := len(single_errors):
+            LOG.warning(f"Errors running CAT tests singly: {error_count}")
+            LOG.info("Writing single CAT test errors")
+            error_test_defs: list[TestExecutionDef] = []
+            for index, error in single_errors.items():
+                td = single_test_defs[index][0]
+                td.errors.append(error)
+                error_test_defs.append(td)
+
+            error_results = sql_generator.get_test_errors(error_test_defs)
+            write_to_app_db(error_results, sql_generator.result_columns, sql_generator.test_results_table)
+
+    test_run.set_progress(
+        "CAT",
+        "Warning" if error_count else "Completed",
+        error=f"{error_count} test{'s' if error_count > 1 else ''} had errors"
+        if error_count
+        else None,
+    )
+
+
+def _rollup_test_scores(test_run: TestRun, table_group: TableGroup) -> None:
+    try:
+        LOG.info("Rolling up test scores")
+        execute_db_queries(
+            RollupScoresSQL(test_run.id, table_group.id).rollup_test_scores(),
+        )
+        run_refresh_score_cards_results(
+            project_code=table_group.project_code,
+            add_history_entry=True,
+            refresh_date=test_run.test_starttime,
+        )
+    except Exception:
+        LOG.exception("Error rolling up test scores")
diff --git a/testgen/commands/run_test_parameter_validation.py b/testgen/commands/run_test_parameter_validation.py
deleted file mode 100644
index f31be1b..0000000
--- a/testgen/commands/run_test_parameter_validation.py
+++ /dev/null
@@ -1,119 +0,0 @@
-import logging
-from collections import defaultdict
-from itertools import chain
-
-from testgen.commands.queries.test_parameter_validation_query import CTestParamValidationSQL
-from testgen.common import ( - execute_db_queries, - fetch_dict_from_db, - fetch_list_from_db, -) -from testgen.common.get_pipeline_parms import TestExecutionParams - -LOG = logging.getLogger("testgen") - - -def run_parameter_validation_queries( - params: TestExecutionParams, - test_run_id: str = "", - test_time: str = "", - test_suite: str = "", -): - LOG.info("CurrentStep: Initializing Test Parameter Validation") - clsExecute = CTestParamValidationSQL(params["sql_flavor"], params["test_suite_id"]) - clsExecute.run_date = test_time - clsExecute.test_run_id = test_run_id - LOG.info("CurrentStep: Validation Class successfully initialized") - - # Retrieve Test Column list - LOG.info("CurrentStep: Retrieve Test Columns for Validation") - test_columns, _ = fetch_list_from_db(*clsExecute.GetTestValidationColumns()) - - invalid_tests = [ test_ids for col, test_ids in test_columns if not col ] - invalid_tests = { item for sublist in invalid_tests for item in sublist } - test_columns = [ item for item in test_columns if item[0] ] - - if not test_columns: - LOG.warning(f"No test columns are present to validate in Test Suite {test_suite}") - missing_columns = [] - missing_tables = set() - else: - # Derive test schema list -- make CSV string from list of columns - # to be used as criteria for retrieving data dictionary - setSchemas = {col.split(".")[0] for col, _ in test_columns} - strSchemas = ", ".join([f"'{value}'" for value in setSchemas]) - - # Retrieve Current Project Column list - LOG.info("CurrentStep: Retrieve Current Columns for Validation") - clsExecute.tg_schema = params["table_group_schema"] - clsExecute.test_schemas = strSchemas - lstProjectTestColumns = fetch_dict_from_db(*clsExecute.GetProjectTestValidationColumns(), use_target_db=True) - - if len(lstProjectTestColumns) == 0: - LOG.info("Current Test Column list is empty") - - LOG.info("CurrentStep: Compare column sets") - # load results into sets - result_set1 = {col.lower() for col, _ in test_columns} - result_set2 = {item["columns"].lower() for item in set(lstProjectTestColumns)} - - # Check if all columns exist in the table - missing_columns = result_set1.difference(result_set2) - missing_columns = [ col for col in missing_columns if col.rsplit(".", 1)[1] ] - if missing_columns: - LOG.info("Missing columns: %s", ", ".join(missing_columns)) - - # Extracting schema.tables that are missing from the result sets - tables_set1 = {elem.rsplit(".", 1)[0] for elem in result_set1} - tables_set2 = {elem.rsplit(".", 1)[0] for elem in result_set2} - - # Check if all the tables exist in the schema - missing_tables = tables_set1.difference(tables_set2) - - if missing_tables: - LOG.info("Missing tables: %s", ", ".join(missing_tables)) - - if missing_columns or missing_tables or invalid_tests: - # Flag test_definitions tests with missing tables or columns - LOG.info("CurrentStep: Flagging Tests That Failed Validation") - - tests_missing_tables = defaultdict(list) - tests_missing_columns = defaultdict(list) - for column_name, test_ids in test_columns: - column_name = column_name.lower() - table_name = column_name.rsplit(".", 1)[0] - if table_name in missing_tables: - tests_missing_tables[table_name].extend(test_ids) - elif column_name in missing_columns: - tests_missing_columns[column_name].extend(test_ids) - - clsExecute.flag_val = "D" - clsExecute.test_ids = list(set(chain(*tests_missing_tables.values(), *tests_missing_columns.values(), invalid_tests))) - execute_db_queries([clsExecute.PrepFlagTestsWithFailedValidation()]) - - for column_name, 
test_ids in tests_missing_columns.items(): - clsExecute.message = f"Missing column: {column_name}" - clsExecute.test_ids = test_ids - execute_db_queries([clsExecute.FlagTestsWithFailedValidation()]) - - for table_name, test_ids in tests_missing_tables.items(): - clsExecute.message = f"Missing table: {table_name}" - clsExecute.test_ids = test_ids - execute_db_queries([clsExecute.FlagTestsWithFailedValidation()]) - - if invalid_tests: - clsExecute.message = "Invalid test: schema, table, or column not defined" - clsExecute.test_ids = invalid_tests - execute_db_queries([clsExecute.FlagTestsWithFailedValidation()]) - - # Copy test results to DK DB, using temporary flagged D value to identify - LOG.info("CurrentStep: Saving error results for invalid tests") - execute_db_queries([clsExecute.ReportTestValidationErrors()]) - - # Set to Inactive those test_definitions tests that are flagged D: set to N - LOG.info("CurrentStep: Disabling Tests That Failed Validation") - execute_db_queries([clsExecute.DisableTestsWithFailedValidation()]) - - LOG.info("Validation Complete: Tests referencing missing tables or columns have been deactivated.") - else: - LOG.info("Validation Successful: No tables or columns missing from target database.") diff --git a/testgen/commands/run_test_validation.py b/testgen/commands/run_test_validation.py new file mode 100644 index 0000000..3d0d0af --- /dev/null +++ b/testgen/commands/run_test_validation.py @@ -0,0 +1,105 @@ +import logging +import re +from uuid import UUID + +from testgen.commands.queries.execute_tests_query import TestExecutionDef, TestExecutionSQL +from testgen.common import execute_db_queries, fetch_dict_from_db +from testgen.common.database.database_service import write_to_app_db + +LOG = logging.getLogger("testgen") + + +def run_test_validation(sql_generator: TestExecutionSQL, test_defs: list[TestExecutionDef]) -> list[TestExecutionDef]: + test_defs_by_id: dict[UUID, TestExecutionDef] = {td.id: td for td in test_defs} + identifiers_to_check: dict[tuple[str, str, str | None], set[UUID]] = {} + target_schemas = set() + quote = sql_generator.flavor_service.quote_character + + def add_identifiers(test_id: UUID, schema: str, table: str, columns: str | None = None, single_column: bool = False) -> None: + target_schemas.add(schema) + if columns: + if single_column: + identifiers = [(schema.lower(), table.lower(), columns.strip(f" {quote}").lower())] + else: + column_names = re.split(rf",(?=(?:[^\{quote}]*\{quote}[^\{quote}]*\{quote})*[^\{quote}]*$)", columns) + column_names = [col.strip(f" {quote}") for col in column_names] + identifiers = [(schema.lower(), table.lower(), col.lower()) for col in column_names if col] + else: + identifiers = [(schema.lower(), table.lower(), None)] + + for key in identifiers: + if not identifiers_to_check.get(key): + identifiers_to_check[key] = set() + identifiers_to_check[key].add(test_id) + + def add_test_error(test_ids: list[UUID], error: str) -> None: + for test_id in test_ids: + if not test_defs_by_id[test_id].errors: + test_defs_by_id[test_id].errors.append("Deactivated") + test_defs_by_id[test_id].errors.append(error) + + for td in test_defs: + # No validation needed for custom query + if td.test_type == "CUSTOM": + continue + + if td.schema_name and td.table_name and (td.column_name or td.test_scope in ["table", "custom"]): + if td.test_scope in ["table", "custom"] or td.test_type.startswith("Aggregate_"): + # Validate only table for these test types - column is meaningless or uses aggregation functions + 
add_identifiers(td.id, td.schema_name, td.table_name) + else: + add_identifiers(td.id, td.schema_name, td.table_name, td.column_name, single_column=td.test_scope == "column") + + if td.groupby_names: + add_identifiers(td.id, td.schema_name, td.table_name, td.groupby_names) + + if td.test_scope == "referential": + if td.window_date_column: + add_identifiers(td.id, td.schema_name, td.table_name, td.window_date_column) + + if td.match_column_names or td.match_groupby_names: + if td.match_schema_name and td.match_table_name: + if td.match_column_names and not td.test_type.startswith("Aggregate_"): + add_identifiers(td.id, td.match_schema_name, td.match_table_name, td.match_column_names) + if td.match_groupby_names: + add_identifiers(td.id, td.match_schema_name, td.match_table_name, td.match_groupby_names) + else: + add_test_error([td.id], "Invalid test: match schema, table, or column not defined") + else: + add_test_error([td.id], "Invalid test: schema, table, or column not defined") + + if target_schemas: + LOG.info("Getting tables and columns in target schemas for validation") + target_identifiers = fetch_dict_from_db( + *sql_generator.get_target_identifiers(target_schemas), + use_target_db=True, + ) + if not target_identifiers: + LOG.info("No tables or columns present in target schemas") + + # Normalize identifiers before validating + target_tables = {(item["schema_name"].lower(), item["table_name"].lower()) for item in target_identifiers} + target_columns = { + (item["schema_name"].lower(), item["table_name"].lower(), item["column_name"].lower()) + for item in target_identifiers + } + + for identifier, test_ids in identifiers_to_check.items(): + table = (identifier[0], identifier[1]) + if table not in target_tables: + add_test_error(test_ids, f"Missing table: {".".join(table)}") + elif identifier[2] and identifier not in target_columns: + add_test_error(test_ids, f"Missing column: {".".join(identifier)}") + + error_results = sql_generator.get_test_errors(test_defs_by_id.values()) + if error_results: + LOG.warning(f"Tests in test suite failed validation: {len(error_results)}") + LOG.info("Writing test validation errors to test results") + write_to_app_db(error_results, sql_generator.result_columns, sql_generator.test_results_table) + + LOG.info("Disabling tests in test suite that failed validation") + execute_db_queries([sql_generator.disable_invalid_test_definitions()]) + else: + LOG.info("No tests in test suite failed validation") + + return [td for td in test_defs if not td.errors] diff --git a/testgen/common/clean_sql.py b/testgen/common/clean_sql.py index 2729961..8d1856e 100644 --- a/testgen/common/clean_sql.py +++ b/testgen/common/clean_sql.py @@ -17,6 +17,9 @@ def CleanSQL(strInput: str) -> str: def quote_identifiers(identifiers: str, flavor: str) -> str: + if not identifiers: + return "" + # Keywords -- identifiers to quote keywords = [ "select", @@ -47,15 +50,13 @@ def quote_identifiers(identifiers: str, flavor: str) -> str: return ", ".join(quoted_values) -def ConcatColumnList(str_column_list, str_null_value): - # Prepares SQL expression to concatenate comma-separated column list into single SQL expression - str_expression = "" - if str_column_list: - if "," in str_column_list: - # Split each comma separated column name into individual list items - cols = [s.strip() for s in str_column_list.split(",")] - str_each = [f"COALESCE({i}, '{str_null_value}')" for i in cols] - str_expression = "CONCAT(" + ", ".join(str_each) + ")" +def concat_columns(columns: str, null_value: str): + # 
Prepares SQL expression to concatenate comma-separated column list + expression = "" + if columns: + if "," in columns: + column_list = [f"COALESCE({col.strip()}, '{null_value}')" for col in columns.split(",")] + expression = f"CONCAT({', '.join(column_list)})" else: - str_expression = str_column_list - return str_expression + expression = columns + return expression diff --git a/testgen/common/database/database_service.py b/testgen/common/database/database_service.py index b46f4d8..cc71240 100644 --- a/testgen/common/database/database_service.py +++ b/testgen/common/database/database_service.py @@ -166,7 +166,7 @@ class ThreadedProgress(TypedDict): processed: int errors: int total: int - + indexes: list[int] def fetch_from_db_threaded( queries: list[tuple[str, dict | None]], @@ -200,6 +200,7 @@ def fetch_data(query: str, params: dict | None, index: int) -> tuple[list[Legacy query_count = len(queries) processed_count = 0 + processed_indexes: list[int] = [] max_threads = max(1, min(10, max_threads)) with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor: @@ -216,8 +217,14 @@ def fetch_data(query: str, params: dict | None, index: int) -> tuple[list[Legacy error_data[index] = error processed_count += 1 + processed_indexes.append(index) if progress_callback: - progress_callback({"processed": processed_count, "errors": len(error_data), "total": query_count}) + progress_callback({ + "processed": processed_count, + "errors": len(error_data), + "total": query_count, + "indexes": processed_indexes, + }) LOG.debug(f"Processed {processed_count} of {query_count} threaded queries") # Flatten nested lists diff --git a/testgen/common/date_service.py b/testgen/common/date_service.py index 41e3412..fc3ae5f 100644 --- a/testgen/common/date_service.py +++ b/testgen/common/date_service.py @@ -1,28 +1,10 @@ -from datetime import UTC, datetime, timedelta +from datetime import UTC, datetime import pandas as pd -def get_today_as_string(): - return datetime.utcnow().strftime("%Y-%m-%d") - - -def get_now_as_string(): - return datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") - - -def parse_now(value: str) -> datetime: - return datetime.strptime(value, "%Y-%m-%d %H:%M:%S").replace(tzinfo=UTC) - - -def get_now_as_string_with_offset(minutes_offset): - ret = datetime.utcnow() - ret = ret + timedelta(minutes=minutes_offset) - return ret.strftime("%Y-%m-%d %H:%M:%S") - - def get_now_as_iso_timestamp(): - return as_iso_timestamp(datetime.utcnow()) + return as_iso_timestamp(datetime.now(UTC)) def as_iso_timestamp(date: datetime) -> str | None: @@ -47,14 +29,6 @@ def accommodate_dataframe_to_timezone(df, streamlit_session, time_columns=None): df[time_column] = df[time_column].dt.strftime("%Y-%m-%d %H:%M:%S") -def create_timezoned_column_in_dataframe(streamlit_session, df, new_column_name, existing_column_name): - if new_column_name and existing_column_name and "browser_timezone" in streamlit_session: - timezone = streamlit_session["browser_timezone"] - df[new_column_name] = ( - df[existing_column_name].dt.tz_localize("UTC").dt.tz_convert(timezone).dt.strftime("%Y-%m-%d %H:%M:%S") - ) - - def get_timezoned_timestamp(streamlit_session, value, dateformat="%b %-d, %-I:%M %p"): ret = None if value and "browser_timezone" in streamlit_session: @@ -67,22 +41,5 @@ def get_timezoned_timestamp(streamlit_session, value, dateformat="%b %-d, %-I:%M def get_timezoned_now(streamlit_session): - value = datetime.utcnow() + value = datetime.now(UTC) return get_timezoned_timestamp(streamlit_session, value) - - -def 
get_formatted_duration(duration: str | None) -> str: - if not duration: - return "--" - - hour, minute, second = duration.split(":") - formatted = "" - if int(hour): - formatted += f"{int(hour)!s}h" - if int(minute): - formatted += f" {int(minute)!s}m" - if int(second): - formatted += f" {int(second)!s}s" - - formatted = formatted.strip() or "< 1s" - return formatted diff --git a/testgen/common/get_pipeline_parms.py b/testgen/common/get_pipeline_parms.py index e5ced2f..1f10b36 100644 --- a/testgen/common/get_pipeline_parms.py +++ b/testgen/common/get_pipeline_parms.py @@ -14,18 +14,6 @@ class TestGenerationParams(BaseParams): profiling_as_of_date: str -class TestExecutionParams(BaseParams): - test_suite_id: str - table_groups_id: str - table_group_schema: str - profiling_table_set: str - profiling_include_mask: str - profiling_exclude_mask: str - sql_flavor: str - max_threads: int - max_query_chars: int - - def get_test_generation_params(table_group_id: str, test_suite: str) -> TestGenerationParams: results = fetch_dict_from_db( read_template_sql_file("parms_test_gen.sql", "parms"), @@ -34,13 +22,3 @@ def get_test_generation_params(table_group_id: str, test_suite: str) -> TestGene if not results: raise ValueError("Connection parameters not found for test generation.") return TestGenerationParams(results[0]) - - -def get_test_execution_params(project_code: str, test_suite: str) -> TestExecutionParams: - results = fetch_dict_from_db( - read_template_sql_file("parms_test_execution.sql", "parms"), - {"PROJECT_CODE": project_code, "TEST_SUITE": test_suite} - ) - if not results: - raise ValueError("Connection parameters not found for test execution.") - return TestExecutionParams(results[0]) diff --git a/testgen/common/models/entity.py b/testgen/common/models/entity.py index 8545b3d..2fe3ac9 100644 --- a/testgen/common/models/entity.py +++ b/testgen/common/models/entity.py @@ -138,6 +138,10 @@ def clear_cache(cls) -> None: @classmethod def columns(cls) -> list[str]: return list(cls.__annotations__.keys()) + + def refresh(self) -> None: + db_session = get_current_session() + db_session.refresh(self) def save(self) -> None: is_new = self.id is None diff --git a/testgen/common/models/test_definition.py b/testgen/common/models/test_definition.py index b193dff..8422301 100644 --- a/testgen/common/models/test_definition.py +++ b/testgen/common/models/test_definition.py @@ -6,10 +6,8 @@ import streamlit as st from sqlalchemy import ( - BigInteger, Column, ForeignKey, - Identity, String, Text, TypeDecorator, @@ -147,14 +145,12 @@ class TestType(Entity): class TestDefinition(Entity): __tablename__ = "test_definitions" - id: UUID = Column(postgresql.UUID(as_uuid=True), server_default=text("gen_random_uuid()")) - cat_test_id: int = Column(BigInteger, Identity(), primary_key=True) + id: UUID = Column(postgresql.UUID(as_uuid=True), server_default=text("gen_random_uuid()"), primary_key=True) table_groups_id: UUID = Column(postgresql.UUID(as_uuid=True)) profile_run_id: UUID = Column(postgresql.UUID(as_uuid=True)) test_type: str = Column(String) test_suite_id: UUID = Column(postgresql.UUID(as_uuid=True), ForeignKey("test_suites.id"), nullable=False) test_description: str = Column(NullIfEmptyString) - test_action: str = Column(String) schema_name: str = Column(String) table_name: str = Column(NullIfEmptyString) column_name: str = Column(NullIfEmptyString) @@ -203,12 +199,10 @@ class TestDefinition(Entity): _minimal_columns = TestDefinitionMinimal.__annotations__.keys() _update_exclude_columns = ( id, - 
cat_test_id, table_groups_id, profile_run_id, test_type, test_suite_id, - test_action, schema_name, test_mode, watch_level, @@ -272,6 +266,7 @@ def set_status_attribute( ) UPDATE test_definitions SET {status_type} = :value + {", test_definition_status = NULL" if status_type == "test_active" and value else ""} FROM test_definitions td INNER JOIN selected ON (td.id = selected.id::UUID) WHERE td.id = test_definitions.id; @@ -331,7 +326,6 @@ def copy( target_table_name: str | None = None, target_column_name: str | None = None, ) -> None: - id_columns = (cls.id, cls.cat_test_id) modified_columns = [cls.table_groups_id, cls.profile_run_id, cls.test_suite_id] select_columns = [ @@ -352,7 +346,7 @@ def copy( select_columns.append(literal(target_column_name).label("column_name")) other_columns = [ - column for column in cls.__table__.columns if column not in modified_columns and column not in id_columns + column for column in cls.__table__.columns if column not in modified_columns and column != cls.id ] select_columns.extend(other_columns) diff --git a/testgen/common/models/test_run.py b/testgen/common/models/test_run.py index ed1a01f..4c2d464 100644 --- a/testgen/common/models/test_run.py +++ b/testgen/common/models/test_run.py @@ -1,12 +1,13 @@ from collections.abc import Iterable from dataclasses import dataclass from datetime import UTC, datetime -from typing import Literal, NamedTuple -from uuid import UUID +from typing import Literal, NamedTuple, TypedDict +from uuid import UUID, uuid4 import streamlit as st from sqlalchemy import BigInteger, Column, Float, ForeignKey, Integer, String, Text, desc, func, select, text, update from sqlalchemy.dialects import postgresql +from sqlalchemy.orm.attributes import flag_modified from sqlalchemy.sql.expression import case from testgen.common.models import get_current_session @@ -15,6 +16,15 @@ from testgen.utils import is_uuid4 TestRunStatus = Literal["Running", "Complete", "Error", "Cancelled"] +ProgressKey = Literal["data_chars", "validation", "QUERY", "CAT", "METADATA"] +ProgressStatus = Literal["Pending", "Running", "Completed", "Warning"] + +class ProgressStep(TypedDict): + key: ProgressKey + status: ProgressStatus + label: str + detail: str + error: str @dataclass @@ -37,6 +47,7 @@ class TestRunSummary(EntityMinimal): table_groups_name: str test_suite: str status: TestRunStatus + progress: list[ProgressStep] process_id: int log_message: str test_ct: int @@ -57,13 +68,13 @@ class LatestTestRun(NamedTuple): class TestRun(Entity): __tablename__ = "test_runs" - id: UUID = Column(postgresql.UUID(as_uuid=True), primary_key=True, nullable=False) + id: UUID = Column(postgresql.UUID(as_uuid=True), primary_key=True, nullable=False, default=uuid4) test_suite_id: UUID = Column(postgresql.UUID(as_uuid=True), ForeignKey("test_suites.id"), nullable=False) test_starttime: datetime = Column(postgresql.TIMESTAMP) test_endtime: datetime = Column(postgresql.TIMESTAMP) status: TestRunStatus = Column(String, default="Running") + progress: list[ProgressStep] = Column(postgresql.JSONB, default=[]) log_message: str = Column(Text) - duration: str = Column(String) test_ct: int = Column(Integer) passed_ct: int = Column(Integer) failed_ct: int = Column(Integer) @@ -187,6 +198,7 @@ def select_summary( table_groups.table_groups_name, test_suites.test_suite, test_runs.status, + test_runs.progress, test_runs.process_id, test_runs.log_message, test_runs.test_ct, @@ -243,12 +255,6 @@ def update_status(cls, run_id: str | UUID, status: TestRunStatus) -> None: @classmethod def 
cascade_delete(cls, ids: list[str]) -> None: query = """ - DELETE FROM working_agg_cat_results - WHERE test_run_id IN :test_run_ids; - - DELETE FROM working_agg_cat_tests - WHERE test_run_id IN :test_run_ids; - DELETE FROM test_results WHERE test_run_id IN :test_run_ids; """ @@ -263,5 +269,24 @@ def clear_cache(cls) -> bool: cls.get_minimal.clear() cls.select_summary.clear() - def save(self) -> None: - raise NotImplementedError + def init_progress(self) -> None: + self._progress = { + "data_chars": {"label": "Refreshing data catalog"}, + "validation": {"label": "Validating test definitions"}, + "QUERY": {"label": "Running query tests"}, + "CAT": {"label": "Running aggregated tests"}, + # TODO: TURN ON WHEN ADDING METADATA TESTS + # "METADATA": {"label": "Running metadata tests"}, + } + for key in self._progress: + self._progress[key].update({"key": key, "status": "Pending"}) + + def set_progress(self, key: ProgressKey, status: ProgressStatus, detail: str | None = None, error: str | None = None) -> None: + self._progress[key]["status"] = status + if detail: + self._progress[key]["detail"] = detail + if error: + self._progress[key]["error"] = error + + self.progress = list(self._progress.values()) + flag_modified(self, "progress") diff --git a/testgen/common/models/test_suite.py b/testgen/common/models/test_suite.py index 368147a..a0703a1 100644 --- a/testgen/common/models/test_suite.py +++ b/testgen/common/models/test_suite.py @@ -56,10 +56,8 @@ class TestSuite(Entity): connection_id: int = Column(BigInteger, ForeignKey("connections.connection_id")) table_groups_id: UUID = Column(postgresql.UUID(as_uuid=True)) test_suite_description: str = Column(NullIfEmptyString) - test_action: str = Column(String) severity: str = Column(NullIfEmptyString) export_to_observability: bool = Column(YNString, default="Y") - test_suite_schema: str = Column(NullIfEmptyString) component_key: str = Column(NullIfEmptyString) component_type: str = Column(NullIfEmptyString) component_name: str = Column(NullIfEmptyString) @@ -216,18 +214,6 @@ def is_in_use(cls, ids: list[str]) -> bool: @classmethod def cascade_delete(cls, ids: list[str]) -> None: query = """ - DELETE FROM working_agg_cat_results - WHERE test_run_id IN ( - SELECT id FROM test_runs - WHERE test_suite_id IN :test_suite_ids - ); - - DELETE FROM working_agg_cat_tests - WHERE test_run_id IN ( - SELECT id FROM test_runs - WHERE test_suite_id IN :test_suite_ids - ); - DELETE FROM test_runs WHERE test_suite_id IN :test_suite_ids; diff --git a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql index 8892e51..4e62476 100644 --- a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql +++ b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql @@ -6,10 +6,6 @@ SET SEARCH_PATH TO {SCHEMA_NAME}; -- | This script should only be run for new schema -- no drops -- ============================================================================== -CREATE SEQUENCE test_definitions_cat_test_id_seq; - -CREATE SEQUENCE profile_results_dk_id_seq; - CREATE TABLE stg_secondary_profile_updates ( project_code VARCHAR(30), schema_name VARCHAR(50), @@ -156,10 +152,8 @@ CREATE TABLE test_suites ( REFERENCES connections, table_groups_id UUID, test_suite_description VARCHAR(1000), - test_action VARCHAR(100), severity VARCHAR(10), export_to_observability VARCHAR(5) DEFAULT 'Y', - test_suite_schema VARCHAR(100), component_key VARCHAR(100), component_type VARCHAR(100), component_name 
VARCHAR(100), @@ -174,16 +168,14 @@ ALTER TABLE table_groups ADD CONSTRAINT table_groups_test_suites_monitor_test_su FOREIGN KEY (monitor_test_suite_id) REFERENCES test_suites ON DELETE SET NULL; CREATE TABLE test_definitions ( - id UUID DEFAULT gen_random_uuid(), - cat_test_id BIGINT GENERATED BY DEFAULT AS IDENTITY - CONSTRAINT test_definitions_cat_test_id_pk + id UUID DEFAULT gen_random_uuid() + CONSTRAINT test_definitions_id_pk PRIMARY KEY, table_groups_id UUID, profile_run_id UUID, test_type VARCHAR(200), test_suite_id UUID NOT NULL, test_description VARCHAR(1000), - test_action VARCHAR(100), schema_name VARCHAR(100), table_name VARCHAR(100), column_name VARCHAR(500), @@ -227,15 +219,10 @@ CREATE TABLE test_definitions ( FOREIGN KEY (test_suite_id) REFERENCES test_suites ); -ALTER SEQUENCE test_definitions_cat_test_id_seq OWNED BY test_definitions.cat_test_id; - CREATE TABLE profile_results ( id UUID DEFAULT gen_random_uuid() CONSTRAINT profile_results_id_pk PRIMARY KEY, - dk_id BIGINT GENERATED ALWAYS AS IDENTITY, --- CONSTRAINT profile_results_dk_id_pk --- PRIMARY KEY, project_code VARCHAR(30), connection_id BIGINT CONSTRAINT profile_results_connections_connection_id_fk @@ -311,9 +298,6 @@ CREATE TABLE profile_results ( query_error VARCHAR(2000) ); -ALTER SEQUENCE profile_results_dk_id_seq OWNED BY profile_results.dk_id; - - CREATE TABLE profile_anomaly_types ( id VARCHAR(10) NOT NULL CONSTRAINT pk_anomaly_types_id @@ -513,8 +497,8 @@ CREATE TABLE test_runs ( test_starttime TIMESTAMP, test_endtime TIMESTAMP, status VARCHAR(100) DEFAULT 'Running', + progress JSONB, log_message TEXT, - duration VARCHAR(50), test_ct INTEGER, passed_ct INTEGER, failed_ct INTEGER, @@ -543,8 +527,6 @@ CREATE TABLE test_results ( test_definition_id UUID, auto_gen BOOLEAN, test_time TIMESTAMP, - starttime TIMESTAMP, - endtime TIMESTAMP, schema_name VARCHAR(100), table_name VARCHAR(100), column_names VARCHAR(500), @@ -557,11 +539,7 @@ CREATE TABLE test_results ( result_signal VARCHAR(1000), result_measure VARCHAR(1000), threshold_value VARCHAR(1000), - result_error_data VARCHAR(4000), - test_action VARCHAR(100), disposition VARCHAR(20), - subset_condition VARCHAR(500), - result_query VARCHAR(4000), test_description VARCHAR(1000), test_run_id UUID NOT NULL, table_groups_id UUID, @@ -572,38 +550,6 @@ CREATE TABLE test_results ( FOREIGN KEY (test_suite_id) REFERENCES test_suites ); -CREATE TABLE working_agg_cat_tests ( - test_run_id UUID NOT NULL, - schema_name VARCHAR(200) NOT NULL, - table_name VARCHAR(200) NOT NULL, - cat_sequence INTEGER NOT NULL, - test_count INTEGER, - test_time TIMESTAMP, - start_time TIMESTAMP, - end_time TIMESTAMP, - column_names TEXT, - test_types TEXT, - test_definition_ids TEXT, - test_actions TEXT, - test_descriptions TEXT, - test_parms TEXT, - test_measures TEXT, - test_conditions TEXT, - CONSTRAINT working_agg_cat_tests_trid_sn_tn_cs - PRIMARY KEY (test_run_id, schema_name, table_name, cat_sequence) -); - -CREATE TABLE working_agg_cat_results ( - test_run_id UUID NOT NULL, - schema_name VARCHAR(200) NOT NULL, - table_name VARCHAR(200) NOT NULL, - cat_sequence INTEGER NOT NULL, - measure_results TEXT, - test_results TEXT, - CONSTRAINT working_agg_cat_results_tri_sn_tn_cs - PRIMARY KEY (test_run_id, schema_name, table_name, cat_sequence) -); - CREATE TABLE cat_test_conditions ( id VARCHAR, test_type VARCHAR(200) NOT NULL @@ -730,10 +676,6 @@ CREATE TABLE IF NOT EXISTS score_definition_results_breakdown ( CREATE UNIQUE INDEX table_groups_name_unique ON table_groups(project_code, 
table_groups_name); --- Index working table - ORIGINAL -CREATE INDEX working_agg_cat_tests_test_run_id_index - ON working_agg_cat_tests(test_run_id); - -- Index Connections CREATE UNIQUE INDEX uix_con_id ON connections(id); diff --git a/testgen/template/dbsetup/060_create_standard_views.sql b/testgen/template/dbsetup/060_create_standard_views.sql index d5aac62..536edce 100644 --- a/testgen/template/dbsetup/060_create_standard_views.sql +++ b/testgen/template/dbsetup/060_create_standard_views.sql @@ -61,15 +61,13 @@ SELECT p.project_name, r.result_code as passed_ct, (1 - COALESCE(r.result_code, 0))::INTEGER as exception_ct, CASE - WHEN result_status = 'Warning' - AND result_message NOT ILIKE 'Inactivated%' THEN 1 + WHEN result_status = 'Warning' THEN 1 END::INTEGER as warning_ct, CASE - WHEN result_status = 'Failed' - AND result_message NOT ILIKE 'Inactivated%' THEN 1 + WHEN result_status = 'Failed' THEN 1 END::INTEGER as failed_ct, CASE - WHEN result_message ILIKE 'Inactivated%' THEN 1 + WHEN result_status = 'Error' THEN 1 END as execution_error_ct, p.project_code, r.table_groups_id, @@ -112,7 +110,6 @@ CREATE VIEW v_queued_observability_results SELECT p.project_name, cn.sql_flavor as component_tool, - ts.test_suite_schema as schema, cn.connection_name, cn.project_db, diff --git a/testgen/template/dbsetup/075_grant_role_rights.sql b/testgen/template/dbsetup/075_grant_role_rights.sql index f8fb631..97a54b4 100644 --- a/testgen/template/dbsetup/075_grant_role_rights.sql +++ b/testgen/template/dbsetup/075_grant_role_rights.sql @@ -23,8 +23,6 @@ GRANT SELECT, INSERT, DELETE, UPDATE ON {SCHEMA_NAME}.stg_secondary_profile_updates, {SCHEMA_NAME}.stg_data_chars_updates, {SCHEMA_NAME}.test_runs, - {SCHEMA_NAME}.working_agg_cat_results, - {SCHEMA_NAME}.working_agg_cat_tests, {SCHEMA_NAME}.functional_test_results, {SCHEMA_NAME}.connections, {SCHEMA_NAME}.table_groups, diff --git a/testgen/template/dbupgrade/0158_incremental_upgrade.sql b/testgen/template/dbupgrade/0158_incremental_upgrade.sql new file mode 100644 index 0000000..46d0119 --- /dev/null +++ b/testgen/template/dbupgrade/0158_incremental_upgrade.sql @@ -0,0 +1,35 @@ +SET SEARCH_PATH TO {SCHEMA_NAME}; + +DROP VIEW IF EXISTS v_latest_profile_results CASCADE; +DROP VIEW IF EXISTS v_queued_observability_results CASCADE; + +DROP SEQUENCE profile_results_dk_id_seq; +DROP SEQUENCE test_definitions_cat_test_id_seq; + +DROP TABLE working_agg_cat_tests; +DROP TABLE working_agg_cat_results; + +ALTER TABLE profile_results + DROP COLUMN dk_id; + +ALTER TABLE test_suites + DROP COLUMN test_action, + DROP COLUMN test_suite_schema; + +ALTER TABLE test_definitions + DROP CONSTRAINT test_definitions_cat_test_id_pk, + DROP COLUMN cat_test_id, + DROP COLUMN test_action, + ADD CONSTRAINT test_definitions_id_pk PRIMARY KEY (id); + +ALTER TABLE test_runs + DROP COLUMN duration, + ADD COLUMN progress JSONB; + +ALTER TABLE test_results + DROP COLUMN starttime, + DROP COLUMN endtime, + DROP COLUMN test_action, + DROP COLUMN subset_condition, + DROP COLUMN result_error_data, + DROP COLUMN result_query; diff --git a/testgen/template/exec_cat_tests/ex_cat_build_agg_table_tests.sql b/testgen/template/exec_cat_tests/ex_cat_build_agg_table_tests.sql deleted file mode 100644 index b6268c5..0000000 --- a/testgen/template/exec_cat_tests/ex_cat_build_agg_table_tests.sql +++ /dev/null @@ -1,148 +0,0 @@ --- Create one record per CAT query: all test sets against one table, split over max chars -INSERT INTO working_agg_cat_tests - (test_run_id, - schema_name, table_name, 
cat_sequence, test_count, test_time, - column_names, test_types, test_definition_ids, - test_actions, test_descriptions, - test_parms, test_measures, test_conditions) - --- Column types from latest profile_results -WITH column_types AS ( - SELECT pr.table_groups_id, - pr.connection_id, - pr.schema_name, - pr.table_name, - pr.column_name, - pr.column_type - FROM profile_results pr - INNER JOIN ( - SELECT table_groups_id, - connection_id, - schema_name, - table_name, - column_name, - MAX(run_date) AS max_run_date - FROM profile_results - GROUP BY table_groups_id, connection_id, schema_name, table_name, column_name - ) latest - ON pr.table_groups_id = latest.table_groups_id - AND pr.schema_name = latest.schema_name - AND pr.table_name = latest.table_name - AND pr.column_name = latest.column_name - AND pr.run_date = latest.max_run_date -), - --- Test details from each test type -test_detail AS ( - SELECT t.test_suite_id, - '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, - '{RUN_DATE}'::TIMESTAMP as test_time, - t.column_name, t.test_type, t.id::VARCHAR as test_definition_id, - t.test_action, t.test_description, - - SUBSTRING( - CASE WHEN t.baseline_ct > '' THEN ', Baseline_Ct=' || t.baseline_ct ELSE '' END - || CASE WHEN t.baseline_unique_ct > '' THEN ', Baseline_Unique_Ct=' || t.baseline_unique_ct ELSE '' END - || CASE WHEN t.baseline_value > '' THEN ', Baseline_Value=' || t.baseline_value ELSE '' END - || CASE WHEN t.baseline_value_ct > '' THEN ', Baseline_Value_Ct=' || t.baseline_value_ct ELSE '' END - || CASE WHEN t.baseline_sum > '' THEN ', Baseline_Sum=' || t.baseline_sum ELSE '' END - || CASE WHEN t.baseline_avg > '' THEN ', Baseline_Avg=' || t.baseline_avg ELSE '' END - || CASE WHEN t.baseline_sd > '' THEN ', Baseline_SD=' || t.baseline_sd ELSE '' END - || CASE WHEN t.threshold_value > '' THEN ', Threshold_Value=' || t.threshold_value ELSE '' END, - 3, 999) || ' ' - as parms, - - -- Standard Measure start - 'CAST(' || - -- Nested parm replacements - part of query, not Python parms - REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE( - c.measure, - '{COLUMN_NAME}', '{QUOTE}' || COALESCE(t.column_name, '') || '{QUOTE}'), - '{COLUMN_TYPE}', COALESCE(ct.column_type, '')), - '{BASELINE_CT}', COALESCE(t.baseline_ct, '')), - '{BASELINE_UNIQUE_CT}', COALESCE(t.baseline_unique_ct, '')), - '{BASELINE_VALUE}', COALESCE(t.baseline_value, '') ), - '{BASELINE_VALUE_CT}', COALESCE(t.baseline_value_ct, '') ), - '{BASELINE_SUM}', COALESCE(t.baseline_sum, '') ), - '{BASELINE_AVG}', COALESCE(t.baseline_avg, '') ), - '{BASELINE_SD}', COALESCE(t.baseline_sd, '') ), - '{CUSTOM_QUERY}', COALESCE(t.custom_query, '')), - '{THRESHOLD_VALUE}', COALESCE(t.threshold_value, '')) - -- Standard measure end with pipe delimiter - || ' AS {VARCHAR_TYPE}) {CONCAT_OPERATOR} ''|'' ' as measure, - - -- Standard CASE for condition starts - 'CASE WHEN ' || - -- Nested parm replacements - standard - REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE( - c.measure || c.test_operator || c.test_condition, - '{COLUMN_NAME}', '{QUOTE}' || COALESCE(t.column_name, '') || '{QUOTE}'), - '{COLUMN_TYPE}', COALESCE(ct.column_type, '')), - '{BASELINE_CT}', COALESCE(t.baseline_ct, '')), - '{BASELINE_UNIQUE_CT}', COALESCE(t.baseline_unique_ct, '')), - '{BASELINE_VALUE}', COALESCE(t.baseline_value, '') ), - '{BASELINE_VALUE_CT}', COALESCE(t.baseline_value_ct, '') ), - '{BASELINE_SUM}', COALESCE(t.baseline_sum, '') ), - '{BASELINE_AVG}', 
COALESCE(t.baseline_avg, '') ), - '{BASELINE_SD}', COALESCE(t.baseline_sd, '') ), - '{CUSTOM_QUERY}', COALESCE(t.custom_query, '')), - '{THRESHOLD_VALUE}', COALESCE(t.threshold_value, '')) - -- Standard case ends - || ' THEN ''0,'' ELSE ''1,'' END' as condition - FROM test_definitions t - INNER JOIN cat_test_conditions c - ON (t.test_type = c.test_type - AND '{SQL_FLAVOR}' = c.sql_flavor) - INNER JOIN test_suites s - ON t.test_suite_id = s.id - LEFT JOIN column_types ct - ON s.table_groups_id = ct.table_groups_id - AND t.schema_name = ct.schema_name - AND t.table_name = ct.table_name - AND t.column_name = ct.column_name - WHERE t.test_suite_id = '{TEST_SUITE_ID}' - AND t.schema_name = '{SCHEMA_NAME}' - AND t.table_name = '{TABLE_NAME}' - AND COALESCE(t.test_active, 'Y') = 'Y' - ), - -test_detail_split AS ( - SELECT test_suite_id, schema_name, table_name, test_time, - column_name, test_type, test_definition_id, test_action, test_description, - parms, measure, condition, - SUM(LENGTH(condition)) OVER (PARTITION BY t.schema_name, t.table_name - ORDER BY t.column_name ROWS UNBOUNDED PRECEDING ) as run_total_chars, - FLOOR( SUM(LENGTH(condition)) OVER (PARTITION BY t.schema_name, t.table_name - ORDER BY t.column_name ROWS UNBOUNDED PRECEDING ) - / {MAX_QUERY_CHARS} ) + 1 as query_split_no - FROM test_detail t -) - -SELECT '{TEST_RUN_ID}' as test_run_id, - d.schema_name, d.table_name, - d.query_split_no as cat_sequence, - COUNT(*) as test_count, - '{RUN_DATE}'::TIMESTAMP as test_time, - STRING_AGG(COALESCE(d.column_name, 'N/A'), '~|~' ORDER BY d.column_name) as column_names, - STRING_AGG(d.test_type, ',' ORDER BY d.column_name) as test_types, - STRING_AGG(d.test_definition_id, ',' ORDER BY d.column_name) as test_definition_ids, - -- Pipe delimiter below, because commas may be embedded - STRING_AGG(d.test_action, '|' ORDER BY d.column_name) as test_actions, - STRING_AGG(d.test_description, '|' ORDER BY d.column_name) as test_descriptions, - - -- Consolidated Parms - STRING_AGG( d.parms, '|' ORDER BY d.column_name) as parms, - - -- Consolidated Measures - -- Encode Null as text to decode when freeing kittens - STRING_AGG( 'COALESCE(' || d.measure || ',''' || '' || '|'')', - -- Use ++ as STRING_AGG delimiter -- replace with + later - '++' ORDER BY d.column_name) as measures, - - -- Consolidated CASE statements - STRING_AGG( d.condition, - -- Use ++ as STRING_AGG delimiter -- replace with + later - '++' ORDER BY d.column_name) as conditions - - FROM test_detail_split d -GROUP BY d.test_suite_id, d.schema_name, d.table_name, test_time, d.query_split_no; diff --git a/testgen/template/exec_cat_tests/ex_cat_get_distinct_tables.sql b/testgen/template/exec_cat_tests/ex_cat_get_distinct_tables.sql deleted file mode 100644 index b5c7961..0000000 --- a/testgen/template/exec_cat_tests/ex_cat_get_distinct_tables.sql +++ /dev/null @@ -1,11 +0,0 @@ -SELECT DISTINCT schema_name, - table_name - FROM test_definitions td - INNER JOIN test_types tt - ON td.test_type = tt.test_type - INNER JOIN table_groups tg - ON (td.table_groups_id = tg.id) - INNER JOIN connections c - ON (tg.connection_id = c.connection_id) - WHERE td.test_suite_id = :TEST_SUITE_ID - AND tt.run_type = 'CAT'; diff --git a/testgen/template/exec_cat_tests/ex_cat_results_parse.sql b/testgen/template/exec_cat_tests/ex_cat_results_parse.sql deleted file mode 100644 index 74f5dce..0000000 --- a/testgen/template/exec_cat_tests/ex_cat_results_parse.sql +++ /dev/null @@ -1,68 +0,0 @@ --- Parses aggregated results and inserts into test_results table -WITH 
seq_digit AS ( - SELECT 0 as d UNION ALL - SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL - SELECT 4 UNION ALL SELECT 5 UNION ALL SELECT 6 UNION ALL - SELECT 7 UNION ALL SELECT 8 UNION ALL SELECT 9 ), - seq_table_raw AS ( - SELECT CAST(a.d + (10 * b.d) + (100 * c.d) + (1000 * d.d) as INT) as nbr - FROM seq_digit a CROSS JOIN seq_digit b CROSS JOIN seq_digit c CROSS JOIN seq_digit d - ORDER BY nbr LIMIT 1000), - seq_table AS ( - SELECT nbr FROM seq_table_raw WHERE nbr > 0), - raw_results AS ( - SELECT t.test_run_id, t.schema_name, t.table_name, t.cat_sequence, t.test_count, - t.test_time, t.start_time, t.end_time, t.column_names, t.test_types, t.test_definition_ids, - t.test_actions, t.test_descriptions, - t.test_parms, t.test_measures, t.test_conditions, - r.measure_results, r.test_results - FROM working_agg_cat_tests t - INNER JOIN working_agg_cat_results r - ON (t.test_run_id = r.test_run_id - AND t.schema_name = r.schema_name - AND t.table_name = r.table_name - AND t.cat_sequence = r.cat_sequence) - WHERE t.test_run_id = :TEST_RUN_ID - AND t.column_names > '' - ), - parsed_results AS ( - SELECT t.schema_name, - t.table_name, - t.test_time, - t.start_time, - t.end_time, - nbr AS test_number, - SPLIT_PART(t.test_actions, '|,', s.nbr) AS test_action, - SPLIT_PART(t.test_descriptions, '|', s.nbr) AS test_description, - SPLIT_PART(t.column_names, '~|~', s.nbr) AS column_name, - SPLIT_PART(t.test_types, ',', s.nbr) AS test_type, - SPLIT_PART(t.test_definition_ids, ',', s.nbr) AS test_definition_id, - SPLIT_PART(t.test_parms, '|', s.nbr) AS test_parms, - SPLIT_PART(t.test_measures, '++', s.nbr) AS measure, - TRIM(SPLIT_PART(t.test_conditions, '++', s.nbr)) AS condition, - -- Restore encoded null value - NULLIF(SPLIT_PART(t.measure_results, '|', s.nbr), '') AS measure_result, - SPLIT_PART(t.test_results, ',', s.nbr) AS test_result - FROM raw_results t - CROSS JOIN seq_table s - ) -INSERT INTO test_results - (test_run_id, test_type, test_definition_id, test_suite_id, - test_time, starttime, endtime, schema_name, table_name, column_names, - skip_errors, input_parameters, result_code, - result_measure, test_action, subset_condition, result_query, test_description) -SELECT :TEST_RUN_ID as test_run_id, - r.test_type, r.test_definition_id::UUID, :TEST_SUITE_ID, r.test_time, r.start_time, r.end_time, - r.schema_name, r.table_name, r.column_name, - 0 as skip_errors, - r.test_parms as input_parameters, - r.test_result::INT as result_code, - r.measure_result as result_measure, - r.test_action, NULL as subset_condition, - 'SELECT ' || LEFT(REPLACE(r.condition, '{RUN_' || 'DATE}', :RUN_DATE), LENGTH(REPLACE(r.condition, '{RUN_' || 'DATE}', :RUN_DATE - )) - LENGTH(' THEN ''0,'' ELSE ''1,'' END')) || ' THEN 0 ELSE 1 END' - || ' FROM ' || r.schema_name || '.' 
|| r.table_name as result_query, - COALESCE(r.test_description, c.test_description) as test_description - FROM parsed_results r - INNER JOIN test_types c - ON r.test_type = c.test_type; diff --git a/testgen/template/exec_cat_tests/ex_cat_retrieve_agg_test_parms.sql b/testgen/template/exec_cat_tests/ex_cat_retrieve_agg_test_parms.sql deleted file mode 100644 index 7632fdb..0000000 --- a/testgen/template/exec_cat_tests/ex_cat_retrieve_agg_test_parms.sql +++ /dev/null @@ -1,8 +0,0 @@ -SELECT schema_name, - table_name, - cat_sequence, - -- Replace list delimiters with concat operator - REPLACE(test_measures, '++', :CONCAT_OPERATOR) as test_measures, - REPLACE(test_conditions, '++', :CONCAT_OPERATOR) as test_conditions - FROM working_agg_cat_tests - WHERE test_run_id = :TEST_RUN_ID; diff --git a/testgen/template/exec_cat_tests/ex_cat_test_query.sql b/testgen/template/exec_cat_tests/ex_cat_test_query.sql deleted file mode 100644 index 3013780..0000000 --- a/testgen/template/exec_cat_tests/ex_cat_test_query.sql +++ /dev/null @@ -1,7 +0,0 @@ -SELECT '{TEST_RUN_ID}' as test_run_id, - '{SCHEMA_NAME}' as schema_name, - '{TABLE_NAME}' as table_name, - '{CAT_SEQUENCE}' as cat_sequence, - {TEST_MEASURES} as measure_results, - {TEST_CONDITIONS} as test_results - FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} diff --git a/testgen/template/execution/disable_invalid_test_definitions.sql b/testgen/template/execution/disable_invalid_test_definitions.sql new file mode 100644 index 0000000..fd3616e --- /dev/null +++ b/testgen/template/execution/disable_invalid_test_definitions.sql @@ -0,0 +1,6 @@ +UPDATE test_definitions td +SET test_active = 'N', + test_definition_status = LEFT('Deactivated ' || :RUN_DATE || ': ' || tr.result_message, 200) +FROM test_results tr +WHERE td.id = tr.test_definition_id + AND tr.test_run_id = :TEST_RUN_ID; diff --git a/testgen/template/execution/ex_get_tests_non_cat.sql b/testgen/template/execution/ex_get_tests_non_cat.sql deleted file mode 100644 index 69672e1..0000000 --- a/testgen/template/execution/ex_get_tests_non_cat.sql +++ /dev/null @@ -1,51 +0,0 @@ -SELECT tt.test_type, - td.id::VARCHAR AS test_definition_id, - COALESCE(td.test_description, tt.test_description) AS test_description, - COALESCE(td.test_action, ts.test_action, '') AS test_action, - schema_name, - table_name, - column_name, - cast(coalesce(skip_errors, 0) as varchar(50)) as skip_errors, - coalesce(baseline_ct, '') as baseline_ct, - coalesce(baseline_unique_ct, '') as baseline_unique_ct, - coalesce(baseline_value, '') as baseline_value, - coalesce(baseline_value_ct, '') as baseline_value_ct, - coalesce(threshold_value, '') as threshold_value, - coalesce(baseline_sum, '') as baseline_sum, - coalesce(baseline_avg, '') as baseline_avg, - coalesce(baseline_sd, '') as baseline_sd, - coalesce(lower_tolerance, '') as lower_tolerance, - coalesce(upper_tolerance, '') as upper_tolerance, - case - when nullif(subset_condition, '') is null then '1=1' - else subset_condition end as subset_condition, - coalesce(groupby_names, '') as groupby_names, - case - when having_condition is null then '' - else concat('HAVING ', having_condition) end as having_condition, - coalesce(window_date_column, '') as window_date_column, - cast(coalesce(window_days, '0') as varchar(50)) as window_days, - coalesce(match_schema_name, '') as match_schema_name, - coalesce(match_table_name, '') as match_table_name, - coalesce(match_column_names, '') as match_column_names, - case - when nullif(match_subset_condition, '') is null then '1=1' 
- else match_subset_condition end as match_subset_condition, - coalesce(match_groupby_names, '') as match_groupby_names, - case - when match_having_condition is null then '' - else concat('HAVING ', match_having_condition) - END as match_having_condition, - coalesce(custom_query, '') as custom_query, - coalesce(tm.template_name, '') as template_name -FROM test_definitions td - INNER JOIN test_suites ts - ON (td.test_suite_id = ts.id) - INNER JOIN test_types tt - ON (td.test_type = tt.test_type) - LEFT JOIN test_templates tm - ON (td.test_type = tm.test_type - AND :SQL_FLAVOR = tm.sql_flavor) -WHERE td.test_suite_id = :TEST_SUITE_ID - AND tt.run_type = 'QUERY' - AND td.test_active = 'Y'; diff --git a/testgen/template/execution/ex_update_test_suite.sql b/testgen/template/execution/ex_update_test_suite.sql deleted file mode 100644 index 7250559..0000000 --- a/testgen/template/execution/ex_update_test_suite.sql +++ /dev/null @@ -1,13 +0,0 @@ -WITH last_run - AS (SELECT test_suite_id, MAX(test_starttime) as max_starttime - FROM test_runs - WHERE test_suite_id = :TEST_SUITE_ID - AND status = 'Complete' - GROUP BY test_suite_id) -UPDATE test_suites - SET last_complete_test_run_id = r.id - FROM test_runs r -INNER JOIN last_run l - ON (r.test_suite_id = l.test_suite_id - AND r.test_starttime = l.max_starttime) - WHERE test_suites.id = r.test_suite_id; \ No newline at end of file diff --git a/testgen/template/execution/get_active_test_definitions.sql b/testgen/template/execution/get_active_test_definitions.sql new file mode 100644 index 0000000..f59b670 --- /dev/null +++ b/testgen/template/execution/get_active_test_definitions.sql @@ -0,0 +1,46 @@ +SELECT td.id, + td.test_type, + schema_name, + table_name, + column_name, + skip_errors, + baseline_ct, + baseline_unique_ct, + baseline_value, + baseline_value_ct, + threshold_value, + baseline_sum, + baseline_avg, + baseline_sd, + lower_tolerance, + upper_tolerance, + subset_condition, + groupby_names, + having_condition, + window_date_column, + window_days, + match_schema_name, + match_table_name, + match_column_names, + match_subset_condition, + match_groupby_names, + match_having_condition, + custom_query, + tt.run_type, + tt.test_scope, + tm.template_name, + c.measure, + c.test_operator, + c.test_condition +FROM test_definitions td + LEFT JOIN test_types tt ON (td.test_type = tt.test_type) + LEFT JOIN test_templates tm ON ( + td.test_type = tm.test_type + AND :SQL_FLAVOR = tm.sql_flavor + ) + LEFT JOIN cat_test_conditions c ON ( + td.test_type = c.test_type + AND :SQL_FLAVOR = c.sql_flavor + ) +WHERE td.test_suite_id = :TEST_SUITE_ID + AND td.test_active = 'Y'; \ No newline at end of file diff --git a/testgen/template/execution/ex_update_history_threshold_last_n.sql b/testgen/template/execution/update_historic_thresholds.sql similarity index 95% rename from testgen/template/execution/ex_update_history_threshold_last_n.sql rename to testgen/template/execution/update_historic_thresholds.sql index b8b9d53..51d4340 100644 --- a/testgen/template/execution/ex_update_history_threshold_last_n.sql +++ b/testgen/template/execution/update_historic_thresholds.sql @@ -16,7 +16,7 @@ WITH stats AS ( ORDER BY tr.test_time DESC LIMIT CASE WHEN d.history_calculation = 'Value' THEN 1 ELSE d.history_lookback END ) AS r ON TRUE - WHERE d.test_suite_id = '{TEST_SUITE_ID}' + WHERE d.test_suite_id = :TEST_SUITE_ID AND d.test_active = 'Y' AND d.history_lookback IS NOT NULL GROUP BY d.id, d.history_calculation, d.history_lookback diff --git 
a/testgen/template/execution/ex_finalize_test_run_results.sql b/testgen/template/execution/update_test_results.sql similarity index 100% rename from testgen/template/execution/ex_finalize_test_run_results.sql rename to testgen/template/execution/update_test_results.sql diff --git a/testgen/template/execution/ex_update_test_record_in_testrun_table.sql b/testgen/template/execution/update_test_run_stats.sql similarity index 55% rename from testgen/template/execution/ex_update_test_record_in_testrun_table.sql rename to testgen/template/execution/update_test_run_stats.sql index 5313715..15dab13 100644 --- a/testgen/template/execution/ex_update_test_record_in_testrun_table.sql +++ b/testgen/template/execution/update_test_run_stats.sql @@ -1,22 +1,18 @@ WITH stats AS ( SELECT r.id as test_run_id, - COALESCE(COUNT(tr.id) , 0) AS test_ct, - SUM(result_code) AS passed_ct, - COALESCE(SUM(CASE WHEN tr.result_status = 'Failed' THEN 1 END), 0) AS failed_ct, - COALESCE(SUM(CASE WHEN tr.result_status = 'Warning' THEN 1 END), 0) AS warning_ct, - COALESCE(SUM(CASE WHEN tr.result_status = 'Log' THEN 1 END), 0) AS log_ct, - COALESCE(SUM(CASE WHEN tr.result_message ILIKE 'ERROR%' THEN 1 ELSE 0 END), 0) AS error_ct + COALESCE(COUNT(tr.id), 0) AS test_ct, + SUM(result_code) AS passed_ct, + COALESCE(SUM(CASE WHEN tr.result_status = 'Failed' THEN 1 END), 0) AS failed_ct, + COALESCE(SUM(CASE WHEN tr.result_status = 'Warning' THEN 1 END), 0) AS warning_ct, + COALESCE(SUM(CASE WHEN tr.result_status = 'Log' THEN 1 END), 0) AS log_ct, + COALESCE(SUM(CASE WHEN tr.result_status = 'Error' THEN 1 ELSE 0 END), 0) AS error_ct FROM test_runs r INNER JOIN test_results tr ON r.id = tr.test_run_id WHERE r.id = :TEST_RUN_ID GROUP BY r.id ) UPDATE test_runs - SET status = CASE WHEN length(:EXCEPTION_MESSAGE) = 0 then 'Complete' else 'Error' end, - test_endtime = :NOW_TIMESTAMP, - log_message = :EXCEPTION_MESSAGE, - duration = TO_CHAR(:NOW_TIMESTAMP - r.test_starttime, 'HH24:MI:SS'), - test_ct = s.test_ct, + SET test_ct = s.test_ct, passed_ct = s.passed_ct, failed_ct = s.failed_ct, warning_ct = s.warning_ct, diff --git a/testgen/template/flavors/bigquery/exec_query_tests/ex_data_match_bigquery.sql b/testgen/template/flavors/bigquery/exec_query_tests/ex_data_match_bigquery.sql index 03ccee3..374de51 100644 --- a/testgen/template/flavors/bigquery/exec_query_tests/ex_data_match_bigquery.sql +++ b/testgen/template/flavors/bigquery/exec_query_tests/ex_data_match_bigquery.sql @@ -3,8 +3,6 @@ SELECT '{TEST_TYPE}' AS test_type, '{TEST_SUITE_ID}' AS test_suite_id, '{TEST_RUN_ID}' AS test_run_id, '{RUN_DATE}' AS test_time, - '{START_TIME}' AS starttime, - CURRENT_TIMESTAMP AS endtime, '{SCHEMA_NAME}' AS schema_name, '{TABLE_NAME}' AS table_name, '{COLUMN_NAME_NO_QUOTES}' AS column_names, @@ -26,9 +24,7 @@ SELECT '{TEST_TYPE}' AS test_type, ) ELSE 'No errors found.' 
END AS result_message, - COUNT(*) AS result_measure, - '{SUBSET_DISPLAY}' AS subset_condition, - NULL AS result_query + COUNT(*) AS result_measure FROM ( SELECT {COLUMN_NAME_NO_QUOTES} FROM `{SCHEMA_NAME}.{TABLE_NAME}` diff --git a/testgen/template/flavors/bigquery/exec_query_tests/ex_relative_entropy_bigquery.sql b/testgen/template/flavors/bigquery/exec_query_tests/ex_relative_entropy_bigquery.sql index 0aee6ea..780538e 100644 --- a/testgen/template/flavors/bigquery/exec_query_tests/ex_relative_entropy_bigquery.sql +++ b/testgen/template/flavors/bigquery/exec_query_tests/ex_relative_entropy_bigquery.sql @@ -32,8 +32,6 @@ SELECT '{TEST_TYPE}' AS test_type, '{TEST_SUITE_ID}' AS test_suite_id, '{TEST_RUN_ID}' AS test_run_id, '{RUN_DATE}' AS test_time, - '{START_TIME}' AS starttime, - CURRENT_TIMESTAMP AS endtime, '{SCHEMA_NAME}' AS schema_name, '{TABLE_NAME}' AS table_name, '{COLUMN_NAME_NO_QUOTES}' AS column_names, @@ -44,9 +42,7 @@ SELECT '{TEST_TYPE}' AS test_type, NULL as result_signal, CASE WHEN js_divergence > {THRESHOLD_VALUE} THEN 0 ELSE 1 END AS result_code, CONCAT('Divergence Level: ', CAST(js_divergence AS STRING), ', Threshold: {THRESHOLD_VALUE}.') AS result_message, - js_divergence AS result_measure, - '{SUBSET_DISPLAY}' AS subset_condition, - NULL AS result_query + js_divergence AS result_measure FROM ( SELECT 0.5 * ABS(SUM(new_pct * LN(new_pct/avg_pct)/LN(2))) + 0.5 * ABS(SUM(old_pct * LN(old_pct/avg_pct)/LN(2))) AS js_divergence diff --git a/testgen/template/flavors/bigquery/exec_query_tests/ex_table_changed_bigquery.sql b/testgen/template/flavors/bigquery/exec_query_tests/ex_table_changed_bigquery.sql index 70d97b3..87365dc 100644 --- a/testgen/template/flavors/bigquery/exec_query_tests/ex_table_changed_bigquery.sql +++ b/testgen/template/flavors/bigquery/exec_query_tests/ex_table_changed_bigquery.sql @@ -3,8 +3,6 @@ SELECT '{TEST_TYPE}' AS test_type, '{TEST_SUITE_ID}' AS test_suite_id, '{TEST_RUN_ID}' AS test_run_id, '{RUN_DATE}' AS test_time, - '{START_TIME}' AS starttime, - CURRENT_TIMESTAMP AS endtime, '{SCHEMA_NAME}' AS schema_name, '{TABLE_NAME}' AS table_name, '{COLUMN_NAME_NO_QUOTES}' AS column_names, @@ -20,9 +18,7 @@ SELECT '{TEST_TYPE}' AS test_type, END AS result_message, CASE WHEN fingerprint = '{BASELINE_VALUE}' THEN 0 ELSE 1 - END AS result_measure, - '{SUBSET_DISPLAY}' AS subset_condition, - NULL AS result_query + END AS result_measure FROM ( SELECT {CUSTOM_QUERY} AS fingerprint FROM `{SCHEMA_NAME}.{TABLE_NAME}` diff --git a/testgen/template/flavors/bigquery/exec_query_tests/ex_window_match_no_drops_bigquery.sql b/testgen/template/flavors/bigquery/exec_query_tests/ex_window_match_no_drops_bigquery.sql index 5ba04cf..4e47eaf 100644 --- a/testgen/template/flavors/bigquery/exec_query_tests/ex_window_match_no_drops_bigquery.sql +++ b/testgen/template/flavors/bigquery/exec_query_tests/ex_window_match_no_drops_bigquery.sql @@ -4,8 +4,6 @@ SELECT '{TEST_SUITE_ID}' AS test_suite_id, '{TEST_RUN_ID}' AS test_run_id, '{RUN_DATE}' AS test_time, - '{START_TIME}' AS starttime, - CURRENT_TIMESTAMP AS endtime, '{SCHEMA_NAME}' AS schema_name, '{TABLE_NAME}' AS table_name, '{COLUMN_NAME_NO_QUOTES}' AS column_names, @@ -25,9 +23,7 @@ SELECT ) ELSE 'No errors found.' 
END AS result_message, - COUNT(*) AS result_measure, - '{SUBSET_DISPLAY}' AS subset_condition, - NULL AS result_query + COUNT(*) AS result_measure FROM ( SELECT {COLUMN_NAME_NO_QUOTES} FROM `{SCHEMA_NAME}.{TABLE_NAME}` diff --git a/testgen/template/flavors/bigquery/exec_query_tests/ex_window_match_same_bigquery.sql b/testgen/template/flavors/bigquery/exec_query_tests/ex_window_match_same_bigquery.sql index c16c158..9b05197 100644 --- a/testgen/template/flavors/bigquery/exec_query_tests/ex_window_match_same_bigquery.sql +++ b/testgen/template/flavors/bigquery/exec_query_tests/ex_window_match_same_bigquery.sql @@ -3,8 +3,6 @@ SELECT '{TEST_TYPE}' AS test_type, '{TEST_SUITE_ID}' AS test_suite_id, '{TEST_RUN_ID}' AS test_run_id, '{RUN_DATE}' AS test_time, - '{START_TIME}' AS starttime, - CURRENT_TIMESTAMP AS endtime, '{SCHEMA_NAME}' AS schema_name, '{TABLE_NAME}' AS table_name, '{COLUMN_NAME_NO_QUOTES}' AS column_names, @@ -26,13 +24,11 @@ SELECT '{TEST_TYPE}' AS test_type, ) ELSE 'No errors found.' END AS result_message, - COUNT(*) AS result_measure, - '{SUBSET_DISPLAY}' AS subset_condition, - NULL AS result_query + COUNT(*) AS result_measure FROM ( -- Values in the prior timeframe but not in the latest ( - SELECT 'Prior Timeframe' AS missing_from, {COLUMN_NAME} + SELECT 'Prior Timeframe' AS missing_from, {COLUMN_NAME_NO_QUOTES} FROM `{SCHEMA_NAME}.{TABLE_NAME}` WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= DATE_ADD( @@ -40,7 +36,7 @@ FROM ( INTERVAL -{WINDOW_DAYS} DAY ) EXCEPT DISTINCT - SELECT 'Prior Timeframe' AS missing_from, {COLUMN_NAME} + SELECT 'Prior Timeframe' AS missing_from, {COLUMN_NAME_NO_QUOTES} FROM `{SCHEMA_NAME}.{TABLE_NAME}` WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= DATE_ADD( @@ -55,7 +51,7 @@ FROM ( UNION ALL -- Values in the latest timeframe but not in the prior ( - SELECT 'Latest Timeframe' AS missing_from, {COLUMN_NAME} + SELECT 'Latest Timeframe' AS missing_from, {COLUMN_NAME_NO_QUOTES} FROM `{SCHEMA_NAME}.{TABLE_NAME}` WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= DATE_ADD( @@ -67,7 +63,7 @@ FROM ( INTERVAL -{WINDOW_DAYS} DAY ) EXCEPT DISTINCT - SELECT 'Latest Timeframe' AS missing_from, {COLUMN_NAME} + SELECT 'Latest Timeframe' AS missing_from, {COLUMN_NAME_NO_QUOTES} FROM `{SCHEMA_NAME}.{TABLE_NAME}` WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= DATE_ADD( diff --git a/testgen/template/flavors/bigquery/validate_tests/ex_get_project_column_list.sql b/testgen/template/flavors/bigquery/validate_tests/ex_get_project_column_list.sql deleted file mode 100644 index 8a465da..0000000 --- a/testgen/template/flavors/bigquery/validate_tests/ex_get_project_column_list.sql +++ /dev/null @@ -1,3 +0,0 @@ -select concat(concat(concat(table_schema, '.'), concat(table_name, '.')), column_name) as columns -from `{DATA_SCHEMA}.INFORMATION_SCHEMA.COLUMNS` -where table_schema in ({TEST_SCHEMAS}); diff --git a/testgen/template/flavors/bigquery/validate_tests/get_target_identifiers.sql b/testgen/template/flavors/bigquery/validate_tests/get_target_identifiers.sql new file mode 100644 index 0000000..a05b333 --- /dev/null +++ b/testgen/template/flavors/bigquery/validate_tests/get_target_identifiers.sql @@ -0,0 +1,5 @@ +SELECT table_schema AS schema_name, + table_name, + column_name +FROM `{DATA_SCHEMA}.INFORMATION_SCHEMA.COLUMNS` +WHERE table_schema IN ({TEST_SCHEMAS}); diff --git a/testgen/template/flavors/databricks/exec_query_tests/ex_window_match_no_drops_databricks.sql 
b/testgen/template/flavors/databricks/exec_query_tests/ex_window_match_no_drops_databricks.sql index 6e5184d..fc354f4 100644 --- a/testgen/template/flavors/databricks/exec_query_tests/ex_window_match_no_drops_databricks.sql +++ b/testgen/template/flavors/databricks/exec_query_tests/ex_window_match_no_drops_databricks.sql @@ -3,8 +3,6 @@ SELECT '{TEST_TYPE}' as test_type, '{TEST_SUITE_ID}' as test_suite_id, '{TEST_RUN_ID}' as test_run_id, '{RUN_DATE}' as test_time, - '{START_TIME}' as starttime, - CURRENT_TIMESTAMP as endtime, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME_NO_QUOTES}' as column_names, @@ -27,9 +25,7 @@ SELECT '{TEST_TYPE}' as test_type, ) ELSE 'No errors found.' END AS result_message, - COUNT(*) as result_measure, - '{SUBSET_DISPLAY}' as subset_condition, - NULL as result_query + COUNT(*) as result_measure FROM ( SELECT {COLUMN_NAME_NO_QUOTES} FROM `{SCHEMA_NAME}`.`{TABLE_NAME}` diff --git a/testgen/template/flavors/databricks/exec_query_tests/ex_window_match_same_databricks.sql b/testgen/template/flavors/databricks/exec_query_tests/ex_window_match_same_databricks.sql index 7a078dc..a30768b 100644 --- a/testgen/template/flavors/databricks/exec_query_tests/ex_window_match_same_databricks.sql +++ b/testgen/template/flavors/databricks/exec_query_tests/ex_window_match_same_databricks.sql @@ -3,8 +3,6 @@ SELECT '{TEST_TYPE}' as test_type, '{TEST_SUITE_ID}' as test_suite_id, '{TEST_RUN_ID}' as test_run_id, '{RUN_DATE}' as test_time, - '{START_TIME}' as starttime, - CURRENT_TIMESTAMP as endtime, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME_NO_QUOTES}' as column_names, @@ -27,9 +25,7 @@ SELECT '{TEST_TYPE}' as test_type, ) ELSE 'No errors found.' END AS result_message, - COUNT(*) as result_measure, - '{SUBSET_DISPLAY}' as subset_condition, - NULL as result_query + COUNT(*) as result_measure FROM ( ( SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_no_drops_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_no_drops_generic.sql index 098da4d..7e8d3ff 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_no_drops_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_no_drops_generic.sql @@ -3,8 +3,6 @@ SELECT '{TEST_TYPE}' as test_type, '{TEST_SUITE_ID}' as test_suite_id, '{TEST_RUN_ID}' as test_run_id, '{RUN_DATE}' as test_time, - '{START_TIME}' as starttime, - CURRENT_TIMESTAMP as endtime, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME_NO_QUOTES}' as column_names, @@ -27,9 +25,7 @@ SELECT '{TEST_TYPE}' as test_type, ) ELSE 'No errors found.' 
END AS result_message, - COUNT(*) as result_measure, - '{SUBSET_DISPLAY}' as subset_condition, - NULL as result_query + COUNT(*) as result_measure FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL FROM ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_percent_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_percent_generic.sql index fe60101..accad51 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_percent_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_percent_generic.sql @@ -3,8 +3,6 @@ SELECT '{TEST_TYPE}' as test_type, '{TEST_SUITE_ID}' as test_suite_id, '{TEST_RUN_ID}' as test_run_id, '{RUN_DATE}' as test_time, - '{START_TIME}' as starttime, - CURRENT_TIMESTAMP as endtime, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME_NO_QUOTES}' as column_names, @@ -27,9 +25,7 @@ SELECT '{TEST_TYPE}' as test_type, ) ELSE 'No errors found.' END AS result_message, - COUNT(*) as result_measure, - '{SUBSET_DISPLAY}' as subset_condition, - NULL as result_query + COUNT(*) as result_measure FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL FROM ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_range_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_range_generic.sql index 8984570..e183241 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_range_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_range_generic.sql @@ -3,8 +3,6 @@ SELECT '{TEST_TYPE}' as test_type, '{TEST_SUITE_ID}' as test_suite_id, '{TEST_RUN_ID}' as test_run_id, '{RUN_DATE}' as test_time, - '{START_TIME}' as starttime, - CURRENT_TIMESTAMP as endtime, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME_NO_QUOTES}' as column_names, @@ -27,9 +25,7 @@ SELECT '{TEST_TYPE}' as test_type, ) ELSE 'No errors found.' END AS result_message, - COUNT(*) as result_measure, - '{SUBSET_DISPLAY}' as subset_condition, - NULL as result_query + COUNT(*) as result_measure FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL FROM ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_same_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_same_generic.sql index 3fb69cc..e5dbfbf 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_same_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_same_generic.sql @@ -3,8 +3,6 @@ SELECT '{TEST_TYPE}' as test_type, '{TEST_SUITE_ID}' as test_suite_id, '{TEST_RUN_ID}' as test_run_id, '{RUN_DATE}' as test_time, - '{START_TIME}' as starttime, - CURRENT_TIMESTAMP as endtime, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME_NO_QUOTES}' as column_names, @@ -27,9 +25,7 @@ SELECT '{TEST_TYPE}' as test_type, ) ELSE 'No errors found.' 
END AS result_message, - COUNT(*) as result_measure, - '{SUBSET_DISPLAY}' as subset_condition, - NULL as result_query + COUNT(*) as result_measure FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL FROM ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_custom_query_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_custom_query_generic.sql index 19d0c51..0d17c0f 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_custom_query_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_custom_query_generic.sql @@ -3,8 +3,6 @@ SELECT '{TEST_TYPE}' as test_type, '{TEST_SUITE_ID}' as test_suite_id, '{TEST_RUN_ID}' as test_run_id, '{RUN_DATE}' as test_time, - '{START_TIME}' as starttime, - CURRENT_TIMESTAMP as endtime, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, CASE @@ -31,9 +29,7 @@ SELECT '{TEST_TYPE}' as test_type, ) ELSE 'No errors found.' END AS result_message, - COUNT(*) as result_measure, - NULL as subset_condition, - NULL as result_query + COUNT(*) as result_measure FROM ( {CUSTOM_QUERY} ) TEST; diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_data_match_2way_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_data_match_2way_generic.sql index 838ea5c..52dd918 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_data_match_2way_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_data_match_2way_generic.sql @@ -3,8 +3,6 @@ SELECT '{TEST_TYPE}' as test_type, '{TEST_SUITE_ID}' as test_suite_id, '{TEST_RUN_ID}' as test_run_id, '{RUN_DATE}' as test_time, - '{START_TIME}' as starttime, - CURRENT_TIMESTAMP as endtime, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME_NO_QUOTES}' as column_names, @@ -27,9 +25,7 @@ SELECT '{TEST_TYPE}' as test_type, ) ELSE 'No errors found.' END AS result_message, - COUNT(*) as result_measure, - '{SUBSET_DISPLAY}' as subset_condition, - NULL as result_query + COUNT(*) as result_measure FROM ( ( SELECT {GROUPBY_NAMES} FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_data_match_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_data_match_generic.sql index 0c0c0b1..f7758fa 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_data_match_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_data_match_generic.sql @@ -3,8 +3,6 @@ SELECT '{TEST_TYPE}' as test_type, '{TEST_SUITE_ID}' as test_suite_id, '{TEST_RUN_ID}' as test_run_id, '{RUN_DATE}' as test_time, - '{START_TIME}' as starttime, - CURRENT_TIMESTAMP as endtime, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME_NO_QUOTES}' as column_names, @@ -27,9 +25,7 @@ SELECT '{TEST_TYPE}' as test_type, ) ELSE 'No errors found.' 
END AS result_message, - COUNT(*) as result_measure, - '{SUBSET_DISPLAY}' as subset_condition, - NULL as result_query + COUNT(*) as result_measure FROM ( SELECT {COLUMN_NAME_NO_QUOTES} FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} WHERE {SUBSET_CONDITION} diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_dupe_rows_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_dupe_rows_generic.sql index 6113710..b194bde 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_dupe_rows_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_dupe_rows_generic.sql @@ -3,8 +3,6 @@ SELECT '{TEST_TYPE}' as test_type, '{TEST_SUITE_ID}' as test_suite_id, '{TEST_RUN_ID}' as test_run_id, '{RUN_DATE}' as test_time, - '{START_TIME}' as starttime, - CURRENT_TIMESTAMP as endtime, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME_NO_QUOTES}' as column_names, @@ -27,9 +25,7 @@ SELECT '{TEST_TYPE}' as test_type, ) ELSE 'No errors found.' END AS result_message, - COALESCE(SUM(record_ct), 0) as result_measure, - '{SUBSET_DISPLAY}' as subset_condition, - NULL as result_query + COALESCE(SUM(record_ct), 0) as result_measure FROM ( SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} WHERE {SUBSET_CONDITION} diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_prior_match_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_prior_match_generic.sql deleted file mode 100644 index fb71734..0000000 --- a/testgen/template/flavors/generic/exec_query_tests/ex_prior_match_generic.sql +++ /dev/null @@ -1,38 +0,0 @@ -SELECT '{TEST_TYPE}' as test_type, - '{TEST_DEFINITION_ID}' as test_definition_id, - '{TEST_SUITE_ID}' as test_suite_id, - '{RUN_DATE}' as test_time, '{START_TIME}' as starttime,CURRENT_TIMESTAMP as endtime, - '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME}' as column_names, - {SKIP_ERRORS} as skip_errors, - 'schema_name = {SCHEMA_NAME}, prior_schema = {MATCH_SCHEMA_NAME}, table_name = {TABLE_NAME}, column_name = {COLUMN_NAME}, subset_condition = {SUBSET_CONDITION}, mode = {MODE}' - as input_parameters, - NULL as result_signal, - CASE WHEN COUNT(*) > COALESCE(skip_errors, 0) THEN 0 ELSE 1 END as result_code, - CONCAT( - CONCAT( 'Mismatched measures: ', CAST( COALESCE(COUNT(*), 0) AS {VARCHAR_TYPE}) ), - CONCAT( ', Threshold: ', - CONCAT( CAST(COALESCE(skip_errors, 0) AS {VARCHAR_TYPE}), '.') - ) - ) AS result_message, - COUNT(*) as result_measure, - '{TEST_ACTION}' as test_action, - '{SUBSET_CONDITION}' as subset_condition, - NULL as result_query, - '{TEST_DESCRIPTION}' as test_description - FROM ( - ( SELECT {COLUMN_NAME} - FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} - WHERE {SUBSET_CONDITION} - EXCEPT - SELECT {COLUMN_NAME} - FROM {QUOTE}{MATCH_SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} - WHERE {SUBSET_CONDITION} ) - UNION -( SELECT {COLUMN_NAME} - FROM {QUOTE}{MATCH_SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} - WHERE {SUBSET_CONDITION} - EXCEPT - SELECT {COLUMN_NAME} - FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} - WHERE {SUBSET_CONDITION} ) -); diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_relative_entropy_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_relative_entropy_generic.sql index 84be731..6f30c53 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_relative_entropy_generic.sql +++ 
b/testgen/template/flavors/generic/exec_query_tests/ex_relative_entropy_generic.sql @@ -30,8 +30,6 @@ SELECT '{TEST_TYPE}' as test_type, '{TEST_SUITE_ID}' as test_suite_id, '{TEST_RUN_ID}' as test_run_id, '{RUN_DATE}' as test_time, - '{START_TIME}' as starttime, - CURRENT_TIMESTAMP as endtime, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME_NO_QUOTES}' as column_names, @@ -44,9 +42,7 @@ SELECT '{TEST_TYPE}' as test_type, CONCAT('Divergence Level: ', CONCAT(CAST(js_divergence AS {VARCHAR_TYPE}), ', Threshold: {THRESHOLD_VALUE}.')) as result_message, - js_divergence as result_measure, - '{SUBSET_DISPLAY}' as subset_condition, - NULL as result_query + js_divergence as result_measure FROM ( SELECT 0.5 * ABS(SUM(new_pct * LN(new_pct/avg_pct)/LN(2))) + 0.5 * ABS(SUM(old_pct * LN(old_pct/avg_pct)/LN(2))) as js_divergence diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_table_changed_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_table_changed_generic.sql index bf573f7..672f19d 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_table_changed_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_table_changed_generic.sql @@ -3,8 +3,6 @@ SELECT '{TEST_TYPE}' as test_type, '{TEST_SUITE_ID}' as test_suite_id, '{TEST_RUN_ID}' as test_run_id, '{RUN_DATE}' as test_time, - '{START_TIME}' as starttime, - CURRENT_TIMESTAMP as endtime, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME_NO_QUOTES}' as column_names, @@ -24,9 +22,7 @@ SELECT '{TEST_TYPE}' as test_type, WHEN fingerprint = '{BASELINE_VALUE}' THEN 0 ELSE 1 - END as result_measure, - '{SUBSET_DISPLAY}' as subset_condition, - NULL as result_query + END as result_measure FROM ( SELECT {CUSTOM_QUERY} as fingerprint FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} WHERE {SUBSET_CONDITION} diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_window_match_no_drops_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_window_match_no_drops_generic.sql index 81d0784..7ece651 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_window_match_no_drops_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_window_match_no_drops_generic.sql @@ -3,8 +3,6 @@ SELECT '{TEST_TYPE}' as test_type, '{TEST_SUITE_ID}' as test_suite_id, '{TEST_RUN_ID}' as test_run_id, '{RUN_DATE}' as test_time, - '{START_TIME}' as starttime, - CURRENT_TIMESTAMP as endtime, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME_NO_QUOTES}' as column_names, @@ -27,9 +25,7 @@ SELECT '{TEST_TYPE}' as test_type, ) ELSE 'No errors found.' 
END AS result_message, - COUNT(*) as result_measure, - '{SUBSET_DISPLAY}' as subset_condition, - NULL as result_query + COUNT(*) as result_measure FROM ( SELECT {COLUMN_NAME_NO_QUOTES} FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_window_match_same_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_window_match_same_generic.sql index 3bb2e84..9b463d7 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_window_match_same_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_window_match_same_generic.sql @@ -3,8 +3,6 @@ SELECT '{TEST_TYPE}' as test_type, '{TEST_SUITE_ID}' as test_suite_id, '{TEST_RUN_ID}' as test_run_id, '{RUN_DATE}' as test_time, - '{START_TIME}' as starttime, - CURRENT_TIMESTAMP as endtime, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME_NO_QUOTES}' as column_names, @@ -27,9 +25,7 @@ SELECT '{TEST_TYPE}' as test_type, ) ELSE 'No errors found.' END AS result_message, - COUNT(*) as result_measure, - '{SUBSET_DISPLAY}' as subset_condition, - NULL as result_query + COUNT(*) as result_measure FROM ( ( SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} diff --git a/testgen/template/flavors/generic/validate_tests/ex_get_project_column_list.sql b/testgen/template/flavors/generic/validate_tests/ex_get_project_column_list.sql deleted file mode 100644 index eacffa6..0000000 --- a/testgen/template/flavors/generic/validate_tests/ex_get_project_column_list.sql +++ /dev/null @@ -1,3 +0,0 @@ -select concat(concat(concat(table_schema, '.'), concat(table_name, '.')), column_name) as columns -from information_schema.columns -where table_schema in ({TEST_SCHEMAS}); diff --git a/testgen/template/flavors/generic/validate_tests/get_target_identifiers.sql b/testgen/template/flavors/generic/validate_tests/get_target_identifiers.sql new file mode 100644 index 0000000..dba356d --- /dev/null +++ b/testgen/template/flavors/generic/validate_tests/get_target_identifiers.sql @@ -0,0 +1,5 @@ +SELECT table_schema AS schema_name, + table_name, + column_name +FROM information_schema.columns +WHERE table_schema IN ({TEST_SCHEMAS}); diff --git a/testgen/template/flavors/mssql/exec_query_tests/ex_relative_entropy_mssql.sql b/testgen/template/flavors/mssql/exec_query_tests/ex_relative_entropy_mssql.sql index 7b26cba..4ec91d2 100644 --- a/testgen/template/flavors/mssql/exec_query_tests/ex_relative_entropy_mssql.sql +++ b/testgen/template/flavors/mssql/exec_query_tests/ex_relative_entropy_mssql.sql @@ -30,8 +30,6 @@ SELECT '{TEST_TYPE}' as test_type, '{TEST_SUITE_ID}' as test_suite_id, '{TEST_RUN_ID}' as test_run_id, '{RUN_DATE}' as test_time, - '{START_TIME}' as starttime, - CURRENT_TIMESTAMP as endtime, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME_NO_QUOTES}' as column_names, @@ -44,9 +42,7 @@ SELECT '{TEST_TYPE}' as test_type, CONCAT('Divergence Level: ', CONCAT(CAST(js_divergence AS VARCHAR), ', Threshold: {THRESHOLD_VALUE}.')) as result_message, - js_divergence as result_measure, - '{SUBSET_DISPLAY}' as subset_condition, - NULL as result_query + js_divergence as result_measure FROM ( SELECT 0.5 * ABS(SUM(new_pct * LOG(new_pct/avg_pct)/LOG(2))) + 0.5 * ABS(SUM(old_pct * LOG(old_pct/avg_pct)/LOG(2))) as js_divergence diff --git a/testgen/template/flavors/mssql/exec_query_tests/ex_table_changed_mssql.sql b/testgen/template/flavors/mssql/exec_query_tests/ex_table_changed_mssql.sql index 978a46d..b448fe8 100644 --- 
a/testgen/template/flavors/mssql/exec_query_tests/ex_table_changed_mssql.sql +++ b/testgen/template/flavors/mssql/exec_query_tests/ex_table_changed_mssql.sql @@ -3,8 +3,6 @@ SELECT '{TEST_TYPE}' as test_type, '{TEST_SUITE_ID}' as test_suite_id, '{TEST_RUN_ID}' as test_run_id, '{RUN_DATE}' as test_time, - '{START_TIME}' as starttime, - CURRENT_TIMESTAMP as endtime, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME_NO_QUOTES}' as column_names, @@ -24,9 +22,7 @@ SELECT '{TEST_TYPE}' as test_type, WHEN fingerprint = '{BASELINE_VALUE}' THEN 0 ELSE 1 - END as result_measure, - '{SUBSET_DISPLAY}' as subset_condition, - NULL as result_query + END as result_measure FROM ( SELECT {CUSTOM_QUERY} as fingerprint FROM "{SCHEMA_NAME}"."{TABLE_NAME}" WITH (NOLOCK) WHERE {SUBSET_CONDITION} diff --git a/testgen/template/flavors/postgresql/exec_query_tests/ex_window_match_no_drops_postgresql.sql b/testgen/template/flavors/postgresql/exec_query_tests/ex_window_match_no_drops_postgresql.sql index 31b99ee..6088cd6 100644 --- a/testgen/template/flavors/postgresql/exec_query_tests/ex_window_match_no_drops_postgresql.sql +++ b/testgen/template/flavors/postgresql/exec_query_tests/ex_window_match_no_drops_postgresql.sql @@ -3,8 +3,6 @@ SELECT '{TEST_TYPE}' as test_type, '{TEST_SUITE_ID}' as test_suite_id, '{TEST_RUN_ID}' as test_run_id, '{RUN_DATE}' as test_time, - '{START_TIME}' as starttime, - CURRENT_TIMESTAMP as endtime, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME_NO_QUOTES}' as column_names, @@ -27,9 +25,7 @@ SELECT '{TEST_TYPE}' as test_type, ) ELSE 'No errors found.' END AS result_message, - COUNT(*) as result_measure, - '{SUBSET_DISPLAY}' as subset_condition, - NULL as result_query + COUNT(*) as result_measure FROM ( SELECT {COLUMN_NAME_NO_QUOTES} FROM "{SCHEMA_NAME}"."{TABLE_NAME}" diff --git a/testgen/template/flavors/postgresql/exec_query_tests/ex_window_match_same_postgresql.sql b/testgen/template/flavors/postgresql/exec_query_tests/ex_window_match_same_postgresql.sql index eda6d93..4cf4faf 100644 --- a/testgen/template/flavors/postgresql/exec_query_tests/ex_window_match_same_postgresql.sql +++ b/testgen/template/flavors/postgresql/exec_query_tests/ex_window_match_same_postgresql.sql @@ -3,8 +3,6 @@ SELECT '{TEST_TYPE}' as test_type, '{TEST_SUITE_ID}' as test_suite_id, '{TEST_RUN_ID}' as test_run_id, '{RUN_DATE}' as test_time, - '{START_TIME}' as starttime, - CURRENT_TIMESTAMP as endtime, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME_NO_QUOTES}' as column_names, @@ -27,9 +25,7 @@ SELECT '{TEST_TYPE}' as test_type, ) ELSE 'No errors found.' 
END AS result_message, - COUNT(*) as result_measure, - '{SUBSET_DISPLAY}' as subset_condition, - NULL as result_query + COUNT(*) as result_measure FROM ( ( SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} diff --git a/testgen/template/flavors/redshift_spectrum/validate_tests/ex_get_project_column_list.sql b/testgen/template/flavors/redshift_spectrum/validate_tests/ex_get_project_column_list.sql deleted file mode 100644 index 83cc609..0000000 --- a/testgen/template/flavors/redshift_spectrum/validate_tests/ex_get_project_column_list.sql +++ /dev/null @@ -1,3 +0,0 @@ -select concat(concat(concat(schemaname, '.'), concat(tablename, '.')), columnname) as columns -from svv_external_columns -where schemaname in ({TEST_SCHEMAS}); diff --git a/testgen/template/flavors/redshift_spectrum/validate_tests/get_target_identifiers.sql b/testgen/template/flavors/redshift_spectrum/validate_tests/get_target_identifiers.sql new file mode 100644 index 0000000..ebd4ca8 --- /dev/null +++ b/testgen/template/flavors/redshift_spectrum/validate_tests/get_target_identifiers.sql @@ -0,0 +1,5 @@ +SELECT schemaname AS schema_name, + tablename AS table_name, + columnname AS column_name +FROM svv_external_columns +WHERE schemaname IN ({TEST_SCHEMAS}); diff --git a/testgen/template/get_entities/get_test_info.sql b/testgen/template/get_entities/get_test_info.sql index b941cc2..142ddc6 100644 --- a/testgen/template/get_entities/get_test_info.sql +++ b/testgen/template/get_entities/get_test_info.sql @@ -4,7 +4,6 @@ Alternative: project-code, connection-id Optional: last_auto_run_date (==test-gen-run-id==), schema-name, table-name, column-name*/ SELECT ts.project_code as project_key, - td.cat_test_id, ts.test_suite as test_suite_key, td.test_type, COALESCE(td.test_description, tt.test_description) as test_description, diff --git a/testgen/template/get_entities/get_test_suite.sql b/testgen/template/get_entities/get_test_suite.sql index b602768..fdbd963 100644 --- a/testgen/template/get_entities/get_test_suite.sql +++ b/testgen/template/get_entities/get_test_suite.sql @@ -4,8 +4,6 @@ SELECT test_suite as test_suite_key, connection_id, test_suite_description, - test_action as default_test_action, - test_suite_schema, component_key, component_type FROM test_suites diff --git a/testgen/template/observability/get_test_results.sql b/testgen/template/observability/get_test_results.sql index 85ab567..077f872 100644 --- a/testgen/template/observability/get_test_results.sql +++ b/testgen/template/observability/get_test_results.sql @@ -1,7 +1,6 @@ SELECT project_name, component_tool, - "schema", connection_name, project_db, sample_min_count, diff --git a/testgen/template/parms/parms_test_execution.sql b/testgen/template/parms/parms_test_execution.sql deleted file mode 100644 index f81b0c2..0000000 --- a/testgen/template/parms/parms_test_execution.sql +++ /dev/null @@ -1,14 +0,0 @@ -SELECT ts.project_code, - ts.id::VARCHAR as test_suite_id, - ts.table_groups_id::VARCHAR, - tg.table_group_schema, - CASE - WHEN tg.profiling_table_set ILIKE '''%''' THEN tg.profiling_table_set - ELSE fn_format_csv_quotes(tg.profiling_table_set) - END as profiling_table_set, - tg.profiling_include_mask, - tg.profiling_exclude_mask - FROM test_suites ts - JOIN table_groups tg ON (ts.table_groups_id = tg.id) - WHERE ts.project_code = :PROJECT_CODE - AND ts.test_suite = :TEST_SUITE; diff --git a/testgen/template/quick_start/initial_data_seeding.sql b/testgen/template/quick_start/initial_data_seeding.sql index 6d9e76e..f47a161 100644 --- 
a/testgen/template/quick_start/initial_data_seeding.sql +++ b/testgen/template/quick_start/initial_data_seeding.sql @@ -31,9 +31,10 @@ SELECT '0ea85e17-acbe-47fe-8394-9970725ad37d'::UUID as id, 15000 as profile_sample_min_count; INSERT INTO test_suites - (project_code, test_suite, connection_id, table_groups_id, test_suite_description, + (id, project_code, test_suite, connection_id, table_groups_id, test_suite_description, export_to_observability, component_key, component_type) -SELECT '{PROJECT_CODE}' as project_code, +SELECT '9df7489d-92b3-49f9-95ca-512160d7896f'::UUID as id, + '{PROJECT_CODE}' as project_code, '{TEST_SUITE}' as test_suite, 1 as connection_id, '0ea85e17-acbe-47fe-8394-9970725ad37d'::UUID as table_groups_id, diff --git a/testgen/template/execution/ex_calc_prevalence_test_results.sql b/testgen/template/rollup_scores/calc_prevalence_test_results.sql similarity index 96% rename from testgen/template/execution/ex_calc_prevalence_test_results.sql rename to testgen/template/rollup_scores/calc_prevalence_test_results.sql index 95e09b4..88fdb6f 100644 --- a/testgen/template/execution/ex_calc_prevalence_test_results.sql +++ b/testgen/template/rollup_scores/calc_prevalence_test_results.sql @@ -14,7 +14,7 @@ UPDATE test_results INNER JOIN data_table_chars tc ON (r.table_groups_id = tc.table_groups_id AND r.table_name ILIKE tc.table_name) - WHERE r.test_run_id = '{TEST_RUN_ID}'::UUID + WHERE r.test_run_id = '{RUN_ID}'::UUID AND test_results.id = r.id; -- PROFILED COLUMN TESTS: Update to calculated prevalence for all fails/warnings - result_code = 0 @@ -51,7 +51,7 @@ WITH result_calc LEFT JOIN data_table_chars tc ON (r.table_groups_id = tc.table_groups_id AND r.table_name ILIKE tc.table_name) - WHERE r.test_run_id = '{TEST_RUN_ID}'::UUID + WHERE r.test_run_id = '{RUN_ID}'::UUID AND result_code = 0 AND r.result_measure IS NOT NULL AND tt.test_scope = 'column' @@ -79,7 +79,7 @@ WITH result_calc INNER JOIN data_table_chars tc ON (r.table_groups_id = tc.table_groups_id AND r.table_name ILIKE tc.table_name) - WHERE r.test_run_id = '{TEST_RUN_ID}'::UUID + WHERE r.test_run_id = '{RUN_ID}'::UUID AND result_code = 0 AND r.result_measure IS NOT NULL AND tt.test_scope <> 'column' diff --git a/testgen/template/validate_tests/ex_disable_tests_test_definitions.sql b/testgen/template/validate_tests/ex_disable_tests_test_definitions.sql deleted file mode 100644 index 6747843..0000000 --- a/testgen/template/validate_tests/ex_disable_tests_test_definitions.sql +++ /dev/null @@ -1,4 +0,0 @@ -UPDATE test_definitions - SET test_active = 'N' - WHERE test_suite_id = :TEST_SUITE_ID - AND test_active = 'D'; diff --git a/testgen/template/validate_tests/ex_flag_tests_test_definitions.sql b/testgen/template/validate_tests/ex_flag_tests_test_definitions.sql deleted file mode 100644 index 5d0b5a5..0000000 --- a/testgen/template/validate_tests/ex_flag_tests_test_definitions.sql +++ /dev/null @@ -1,7 +0,0 @@ -/* -Mark Test inactive for Missing columns/tables with update status -*/ -UPDATE test_definitions -SET test_active = :FLAG, - test_definition_status = LEFT('Inactivated ' || :RUN_DATE || ': ' || CONCAT_WS('; ', substring(test_definition_status from 34), :MESSAGE), 200) -WHERE cat_test_id IN :CAT_TEST_IDS; diff --git a/testgen/template/validate_tests/ex_get_test_column_list_tg.sql b/testgen/template/validate_tests/ex_get_test_column_list_tg.sql deleted file mode 100644 index f7a1474..0000000 --- a/testgen/template/validate_tests/ex_get_test_column_list_tg.sql +++ /dev/null @@ -1,98 +0,0 @@ - SELECT schema_name 
|| '.' || table_name || '.' || column_name AS columns, - ARRAY_AGG(cat_test_id) as test_id_array - FROM ( - -- FROM: column_name - column scope (single column) - SELECT cat_test_id, - schema_name AS schema_name, - table_name AS table_name, - column_name - FROM test_definitions d - INNER JOIN test_types t - ON d.test_type = t.test_type - WHERE test_suite_id = :TEST_SUITE_ID - AND COALESCE(test_active, 'Y') = 'Y' - AND t.test_scope = 'column' - UNION - -- FROM: column_name - referential scope (could be multiple columns) - SELECT cat_test_id, - schema_name AS schema_name, - table_name AS table_name, - TRIM(TRIM(UNNEST(ARRAY_REMOVE( - REGEXP_SPLIT_TO_ARRAY(column_name, ',(?=(?:[^"]*"[^"]*")*[^"]*$)'), - '' )), ' '), '{QUOTE}') as column_name - FROM test_definitions d - INNER JOIN test_types t - ON d.test_type = t.test_type - WHERE test_suite_id = :TEST_SUITE_ID - AND COALESCE(test_active, 'Y') = 'Y' - AND t.test_scope = 'referential' - AND t.test_type NOT LIKE 'Aggregate_%' - UNION - -- FROM: groupby_names - SELECT cat_test_id, - schema_name AS schema_name, - table_name AS table_name, - TRIM(TRIM(UNNEST(ARRAY_REMOVE( - REGEXP_SPLIT_TO_ARRAY(groupby_names, ',(?=(?:[^"]*"[^"]*")*[^"]*$)'), - '' )), ' '), '{QUOTE}') AS column_name - FROM test_definitions d - INNER JOIN test_types t - ON d.test_type = t.test_type - WHERE test_suite_id = :TEST_SUITE_ID - AND COALESCE(test_active, 'Y') = 'Y' - AND t.test_scope IN ('column', 'referential', 'table') - UNION - -- FROM: window_date_column (referential) - SELECT cat_test_id, - schema_name AS schema_name, - table_name AS table_name, - TRIM(TRIM(UNNEST(ARRAY_REMOVE( - REGEXP_SPLIT_TO_ARRAY(window_date_column, ',(?=(?:[^"]*"[^"]*")*[^"]*$)'), - '' )), ' '), '{QUOTE}') as column_name - FROM test_definitions d - INNER JOIN test_types t - ON d.test_type = t.test_type - WHERE test_suite_id = :TEST_SUITE_ID - AND COALESCE(test_active, 'Y') = 'Y' - AND t.test_scope = 'referential' - UNION - -- FROM: match_column_names (referential) - SELECT cat_test_id, - match_schema_name AS schema_name, - match_table_name AS table_name, - TRIM(TRIM(UNNEST(ARRAY_REMOVE( - REGEXP_SPLIT_TO_ARRAY(match_column_names, ',(?=(?:[^"]*"[^"]*")*[^"]*$)'), - '' )), ' '), '{QUOTE}') as column_name - FROM test_definitions d - INNER JOIN test_types t - ON d.test_type = t.test_type - WHERE test_suite_id = :TEST_SUITE_ID - AND COALESCE(test_active, 'Y') = 'Y' - AND t.test_scope = 'referential' - AND t.test_type NOT LIKE 'Aggregate_%' - UNION - -- FROM: match_groupby_names (referential) - SELECT cat_test_id, - match_schema_name AS schema_name, - match_table_name AS table_name, - TRIM(TRIM(UNNEST(ARRAY_REMOVE( - REGEXP_SPLIT_TO_ARRAY(match_groupby_names, ',(?=(?:[^"]*"[^"]*")*[^"]*$)'), - '' )), ' '), '{QUOTE}') as column_name - FROM test_definitions d - INNER JOIN test_types t - ON d.test_type = t.test_type - WHERE test_suite_id = :TEST_SUITE_ID - AND COALESCE(test_active, 'Y') = 'Y' - AND t.test_scope = 'referential' - UNION - SELECT cat_test_id, - schema_name AS schema_name, - table_name AS table_name, - '' AS column_name - FROM test_definitions d - INNER JOIN test_types t - ON d.test_type = t.test_type - WHERE test_suite_id = :TEST_SUITE_ID - AND COALESCE(test_active, 'Y') = 'Y' - AND t.test_scope = 'table' ) cols -GROUP BY columns; diff --git a/testgen/template/validate_tests/ex_prep_flag_tests_test_definitions.sql b/testgen/template/validate_tests/ex_prep_flag_tests_test_definitions.sql deleted file mode 100644 index d436a3c..0000000 --- 
a/testgen/template/validate_tests/ex_prep_flag_tests_test_definitions.sql +++ /dev/null @@ -1,6 +0,0 @@ -/* -Clean the test definition status before it's set with missing tables / columns information -*/ -UPDATE test_definitions -SET test_definition_status = NULL -WHERE cat_test_id IN :CAT_TEST_IDS; diff --git a/testgen/template/validate_tests/ex_write_test_val_errors.sql b/testgen/template/validate_tests/ex_write_test_val_errors.sql deleted file mode 100644 index 318d76b..0000000 --- a/testgen/template/validate_tests/ex_write_test_val_errors.sql +++ /dev/null @@ -1,30 +0,0 @@ -INSERT INTO test_results - ( test_suite_id, - test_type, - test_definition_id, - schema_name, - table_name, - column_names, - test_time, - test_run_id, - input_parameters, - result_code, - result_status, - result_message, - result_measure ) - SELECT :TEST_SUITE_ID, - test_type, - id, - schema_name, - table_name, - column_name, - :RUN_DATE as test_time, - :TEST_RUN_ID as test_run_id, - NULL as input_parameters, - NULL as result_code, - 'Error' as result_status, - test_definition_status AS result_message, - NULL as result_measure - FROM test_definitions - WHERE test_active = 'D' - AND test_suite_id = :TEST_SUITE_ID; diff --git a/testgen/ui/components/frontend/css/shared.css b/testgen/ui/components/frontend/css/shared.css index 2de2aba..7665ae4 100644 --- a/testgen/ui/components/frontend/css/shared.css +++ b/testgen/ui/components/frontend/css/shared.css @@ -216,7 +216,7 @@ body { .text-code { font-family:'Courier New', Courier, monospace; line-height: 1.5; - white-space: pre; + white-space: pre-wrap; } /* */ diff --git a/testgen/ui/components/frontend/js/pages/profiling_runs.js b/testgen/ui/components/frontend/js/pages/profiling_runs.js index 3258192..e5fcaab 100644 --- a/testgen/ui/components/frontend/js/pages/profiling_runs.js +++ b/testgen/ui/components/frontend/js/pages/profiling_runs.js @@ -222,7 +222,7 @@ const Toolbar = ( /** @type boolean */ userCanEdit, ) => { return div( - { class: 'flex-row fx-align-flex-end fx-justify-space-between mb-4 fx-gap-4' }, + { class: 'flex-row fx-align-flex-end fx-justify-space-between mb-4 fx-gap-4 fx-flex-wrap' }, () => Select({ label: 'Table Group', value: getValue(props.table_group_options)?.find((op) => op.selected)?.value ?? 
null, diff --git a/testgen/ui/components/frontend/js/pages/test_runs.js b/testgen/ui/components/frontend/js/pages/test_runs.js index 05cb59e..d8aa643 100644 --- a/testgen/ui/components/frontend/js/pages/test_runs.js +++ b/testgen/ui/components/frontend/js/pages/test_runs.js @@ -2,6 +2,13 @@ * @import { ProjectSummary } from '../types.js'; * @import { SelectOption } from '../components/select.js'; * + * @typedef ProgressStep + * @type {object} + * @property {'data_chars'|'validation'|'QUERY'|'CAT'|'METADATA'} key + * @property {'Pending'|'Running'|'Completed'|'Warning'} status + * @property {string} label + * @property {string} detail + * * @typedef TestRun * @type {object} * @property {string} test_run_id @@ -10,6 +17,7 @@ * @property {string} table_groups_name * @property {string} test_suite * @property {'Running'|'Complete'|'Error'|'Cancelled'} status + * @property {ProgressStep[]} progress * @property {string} log_message * @property {string} process_id * @property {number} test_ct @@ -34,7 +42,7 @@ * @property {Permissions} permissions */ import van from '../van.min.js'; -import { Tooltip } from '../components/tooltip.js'; +import { withTooltip } from '../components/tooltip.js'; import { SummaryBar } from '../components/summary_bar.js'; import { Link } from '../components/link.js'; import { Button } from '../components/button.js'; @@ -45,10 +53,19 @@ import { Checkbox } from '../components/checkbox.js'; import { Select } from '../components/select.js'; import { Paginator } from '../components/paginator.js'; import { EMPTY_STATE_MESSAGE, EmptyState } from '../components/empty_state.js'; +import { Icon } from '../components/icon.js'; const { div, i, span, strong } = van.tags; const PAGE_SIZE = 100; const SCROLL_CONTAINER = window.top.document.querySelector('.stMain'); +const REFRESH_INTERVAL = 15000 // 15 seconds + +const progressStatusIcons = { + Pending: { color: 'grey', icon: 'more_horiz', size: 22 }, + Running: { color: 'blue', icon: 'autoplay', size: 18 }, + Completed: { color: 'green', icon: 'check', size: 24 }, + Warning: { color: 'orange', icon: 'warning', size: 20 }, +}; const TestRuns = (/** @type Properties */ props) => { loadStylesheet('testRuns', stylesheet); @@ -63,7 +80,18 @@ const TestRuns = (/** @type Properties */ props) => { pageIndex.val = 0; return getValue(props.test_runs); }); - const paginatedRuns = van.derive(() => testRuns.val.slice(PAGE_SIZE * pageIndex.val, PAGE_SIZE * (pageIndex.val + 1))); + let refreshIntervalId = null; + + const paginatedRuns = van.derive(() => { + const paginated = testRuns.val.slice(PAGE_SIZE * pageIndex.val, PAGE_SIZE * (pageIndex.val + 1)); + const hasActiveRuns = paginated.some(({ status }) => status === 'Running'); + if (!refreshIntervalId && hasActiveRuns) { + refreshIntervalId = setInterval(() => emitEvent('RefreshData', {}), REFRESH_INTERVAL); + } else if (refreshIntervalId && !hasActiveRuns) { + clearInterval(refreshIntervalId); + } + return paginated; + }); const selectedRuns = {}; const initializeSelectedStates = (items) => { @@ -188,7 +216,7 @@ const Toolbar = ( /** @type boolean */ userCanEdit, ) => { return div( - { class: 'flex-row fx-align-flex-end fx-justify-space-between mb-4 fx-gap-4' }, + { class: 'flex-row fx-align-flex-end fx-justify-space-between mb-4 fx-gap-4 fx-flex-wrap' }, div( { class: 'flex-row fx-gap-4' }, () => Select({ @@ -251,6 +279,8 @@ const TestRunItem = ( /** @type boolean */ selected, /** @type boolean */ userCanEdit, ) => { + const runningStep = item.progress?.find((item) => item.status === 
'Running'); + return div( { class: 'table-row flex-row' }, userCanEdit @@ -277,20 +307,37 @@ const TestRunItem = ( ), ), div( - { class: 'flex-row', style: `flex: ${columns[2]}` }, + { style: `flex: ${columns[2]}` }, div( + { class: 'flex-row' }, TestRunStatus(item), - div( + item.status === 'Running' && item.process_id && userCanEdit ? Button({ + type: 'stroked', + label: 'Cancel', + style: 'width: 64px; height: 28px; color: var(--purple); margin-left: 12px;', + onclick: () => emitEvent('RunCanceled', { payload: item }), + }) : null, + ), + item.test_endtime + ? div( { class: 'text-caption mt-1' }, formatDuration(item.test_starttime, item.test_endtime), + ) + : div( + { class: 'text-caption mt-1' }, + runningStep + ? [ + div( + runningStep.label, + withTooltip( + Icon({ style: 'font-size: 18px; margin-left: 4px; vertical-align: middle;' }, 'info'), + { text: ProgressTooltip(item) }, + ), + ), + div(runningStep.detail), + ] + : '--', ), - ), - item.status === 'Running' && item.process_id && userCanEdit ? Button({ - type: 'stroked', - label: 'Cancel Run', - style: 'width: auto; height: 32px; color: var(--purple); margin-left: 16px;', - onclick: () => emitEvent('RunCanceled', { payload: item }), - }) : null, ), div( { class: 'pr-3', style: `flex: ${columns[3]}` }, @@ -314,9 +361,9 @@ const TestRunItem = ( : '--', ), ); -} +}; -function TestRunStatus(/** @type TestRun */ item) { +const TestRunStatus = (/** @type TestRun */ item) => { const attributeMap = { Running: { label: 'Running', color: 'blue' }, Complete: { label: 'Completed', color: '' }, @@ -324,27 +371,48 @@ function TestRunStatus(/** @type TestRun */ item) { Cancelled: { label: 'Canceled', color: 'purple' }, }; const attributes = attributeMap[item.status] || { label: 'Unknown', color: 'grey' }; + const hasProgressError = item.progress?.some(({error}) => !!error); return span( { class: 'flex-row', style: `color: var(--${attributes.color});`, }, attributes.label, - () => { - const tooltipError = van.state(false); - return item.status === 'Error' && item.log_message ? i( - { - class: 'material-symbols-rounded text-secondary ml-1', - style: 'position: relative; font-size: 16px;', - onmouseenter: () => tooltipError.val = true, - onmouseleave: () => tooltipError.val = false, - }, - 'info', - Tooltip({ text: item.log_message, show: tooltipError }), - ) : null; - }, + item.status === 'Complete' && hasProgressError + ? withTooltip( + Icon({ style: 'font-size: 18px; margin-left: 4px; vertical-align: middle; color: var(--orange);' }, 'warning' ), + { text: ProgressTooltip(item) }, + ) + : null, + item.status === 'Error' && item.log_message + ? withTooltip( + Icon({ style: 'font-size: 18px; margin-left: 4px;' }, 'info'), + { text: item.log_message, width: 250, style: 'word-break: break-word;' }, + ) + : null, ); -} +}; + +const ProgressTooltip = (/** @type ProfilingRun */ item) => { + return div( + { class: 'flex-column fx-gap-1' }, + item.progress?.map(step => { + const stepIcon = progressStatusIcons[step.status]; + return div( + { class: 'flex-row fx-gap-1' }, + Icon( + { style: `font-size: ${stepIcon.size}px; color: var(--${stepIcon.color}); min-width: 24px;` }, + stepIcon.icon, + ), + div( + { class: 'flex-column fx-align-flex-start text-left' }, + span(`${step.label}${step.detail ? 
(': ' + step.detail) : ''}`),
+                    span({ style: 'font-size: 12px; opacity: 0.6; margin-top: 2px; white-space: pre-wrap;' }, step.error),
+                ),
+            );
+        }),
+    );
+};
 
 const ConditionalEmptyState = (
     /** @type ProjectSummary */ projectSummary,
diff --git a/testgen/ui/queries/source_data_queries.py b/testgen/ui/queries/source_data_queries.py
index 45afd5d..467297c 100644
--- a/testgen/ui/queries/source_data_queries.py
+++ b/testgen/ui/queries/source_data_queries.py
@@ -5,7 +5,7 @@
 import pandas as pd
 import streamlit as st
 
-from testgen.common.clean_sql import ConcatColumnList
+from testgen.common.clean_sql import concat_columns
 from testgen.common.database.database_service import get_flavor_service, replace_params
 from testgen.common.models.connection import Connection, SQLFlavor
 from testgen.common.models.test_definition import TestDefinition
@@ -111,7 +111,7 @@ def get_test_issue_source_query(issue_data: dict) -> str:
     params = {
         "TARGET_SCHEMA": issue_data["schema_name"],
         "TABLE_NAME": issue_data["table_name"],
-        "COLUMN_NAME": issue_data["column_names"],
+        "COLUMN_NAME": issue_data["column_names"],  # Don't quote this - queries already have quotes
         "COLUMN_TYPE": issue_data["column_type"],
         "TEST_DATE": str(issue_data["test_date"]),
         "CUSTOM_QUERY": test_definition.custom_query,
@@ -124,18 +124,18 @@
         "THRESHOLD_VALUE": test_definition.threshold_value,
         "SUBSET_CONDITION": test_definition.subset_condition or "1=1",
         "GROUPBY_NAMES": test_definition.groupby_names,
-        "HAVING_CONDITION": test_definition.having_condition,
+        "HAVING_CONDITION": f"HAVING {test_definition.having_condition}" if test_definition.having_condition else "",
         "MATCH_SCHEMA_NAME": test_definition.match_schema_name,
         "MATCH_TABLE_NAME": test_definition.match_table_name,
         "MATCH_COLUMN_NAMES": test_definition.match_column_names,
         "MATCH_SUBSET_CONDITION": test_definition.match_subset_condition or "1=1",
         "MATCH_GROUPBY_NAMES": test_definition.match_groupby_names,
-        "MATCH_HAVING_CONDITION": test_definition.match_having_condition,
+        "MATCH_HAVING_CONDITION": f"HAVING {test_definition.match_having_condition}" if test_definition.match_having_condition else "",
         "COLUMN_NAME_NO_QUOTES": issue_data["column_names"],
         "WINDOW_DATE_COLUMN": test_definition.window_date_column,
         "WINDOW_DAYS": test_definition.window_days,
-        "CONCAT_COLUMNS": ConcatColumnList(issue_data["column_names"], ""),
-        "CONCAT_MATCH_GROUPBY": ConcatColumnList(test_definition.match_groupby_names, ""),
+        "CONCAT_COLUMNS": concat_columns(issue_data["column_names"], ""),
+        "CONCAT_MATCH_GROUPBY": concat_columns(test_definition.match_groupby_names, ""),
     }
 
     lookup_query = replace_params(lookup_data.lookup_query, params)
diff --git a/testgen/ui/queries/table_group_queries.py b/testgen/ui/queries/table_group_queries.py
index 58bd282..8cc32c7 100644
--- a/testgen/ui/queries/table_group_queries.py
+++ b/testgen/ui/queries/table_group_queries.py
@@ -1,4 +1,5 @@
 from collections.abc import Callable
+from datetime import UTC, datetime
 from typing import TypedDict
 from uuid import UUID
 
@@ -6,7 +7,6 @@
 from testgen.commands.queries.refresh_data_chars_query import ColumnChars, RefreshDataCharsSQL
 from testgen.commands.run_refresh_data_chars import write_data_chars
-from testgen.common import date_service
 from testgen.common.models.connection import Connection
 from testgen.common.models.table_group import TableGroup
 from testgen.ui.services.database_service import fetch_from_target_db
@@ -60,7 +60,7 @@ def get_table_group_preview(
     def 
save_data_chars(table_group_id: UUID) -> None: # Unsaved table groups will not have an ID, so we have to update it after saving sql_generator.table_group.id = table_group_id - write_data_chars(data_chars, sql_generator, date_service.get_now_as_string()) + write_data_chars(data_chars, sql_generator, datetime.now(UTC)) if verify_table_access: tables_preview = table_group_preview["tables"] diff --git a/testgen/ui/queries/test_result_queries.py b/testgen/ui/queries/test_result_queries.py index f11abea..806ec03 100644 --- a/testgen/ui/queries/test_result_queries.py +++ b/testgen/ui/queries/test_result_queries.py @@ -43,19 +43,16 @@ def get_test_results( r.result_code as passed_ct, (1 - r.result_code)::INTEGER as exception_ct, CASE - WHEN result_status = 'Warning' - AND result_message NOT ILIKE 'Inactivated%%' THEN 1 + WHEN result_status = 'Warning' THEN 1 END::INTEGER as warning_ct, CASE - WHEN result_status = 'Failed' - AND result_message NOT ILIKE 'Inactivated%%' THEN 1 + WHEN result_status = 'Failed' THEN 1 END::INTEGER as failed_ct, CASE - WHEN result_status = 'Log' - AND result_message NOT ILIKE 'Inactivated%%' THEN 1 + WHEN result_status = 'Log' THEN 1 END::INTEGER as log_ct, CASE - WHEN result_message ILIKE 'Inactivated%%' THEN 1 + WHEN result_status = 'Error' THEN 1 END as execution_error_ct, p.project_code, r.table_groups_id::VARCHAR, r.id::VARCHAR as test_result_id, r.test_run_id::VARCHAR, diff --git a/testgen/ui/views/dialogs/run_tests_dialog.py b/testgen/ui/views/dialogs/run_tests_dialog.py index 808451d..c01bd04 100644 --- a/testgen/ui/views/dialogs/run_tests_dialog.py +++ b/testgen/ui/views/dialogs/run_tests_dialog.py @@ -2,7 +2,7 @@ import streamlit as st -from testgen.commands.run_execute_tests import run_execution_steps_in_background +from testgen.commands.run_test_execution import run_test_execution_in_background from testgen.common.models import with_database_session from testgen.common.models.test_suite import TestSuite, TestSuiteMinimal from testgen.ui.components import widgets as testgen @@ -42,7 +42,7 @@ def run_tests_dialog(project_code: str, test_suite: TestSuiteMinimal | None = No if testgen.expander_toggle(expand_label="Show CLI command", key="run_tests_dialog:keys:show-cli"): st.code( - f"testgen run-tests --project-key {project_code} --test-suite-key '{test_suite_name}'", + f"testgen run-tests --test-suite-id {test_suite_id}", language="shellSession" ) @@ -60,7 +60,7 @@ def run_tests_dialog(project_code: str, test_suite: TestSuiteMinimal | None = No status_container.info("Starting test run ...") try: - run_execution_steps_in_background(project_code, test_suite_name) + run_test_execution_in_background(test_suite_id) except Exception as e: status_container.error(f"Test run encountered errors: {e!s}.") diff --git a/testgen/ui/views/hygiene_issues.py b/testgen/ui/views/hygiene_issues.py index 7c6d3b1..8eb1f53 100644 --- a/testgen/ui/views/hygiene_issues.py +++ b/testgen/ui/views/hygiene_issues.py @@ -571,7 +571,7 @@ def source_data_dialog(selected_row): st.markdown("#### SQL Query") query = get_hygiene_issue_source_query(selected_row) if query: - st.code(query, language="sql", height=100) + st.code(query, language="sql", wrap_lines=True, height=100) with st.spinner("Retrieving source data..."): bad_data_status, bad_data_msg, _, df_bad = get_hygiene_issue_source_data(selected_row, limit=500) diff --git a/testgen/ui/views/test_results.py b/testgen/ui/views/test_results.py index e88fc85..9bfbb65 100644 --- a/testgen/ui/views/test_results.py +++ 
b/testgen/ui/views/test_results.py @@ -741,7 +741,7 @@ def render_binary_chart(data: pd.DataFrame, **params: dict) -> None: history["test_start"] = history["test_date"].apply(datetime.fromisoformat) history["test_end"] = history["test_start"].apply(lambda start: start + timedelta(seconds=60)) history["formatted_test_date"] = history["test_date"].apply(lambda date_str: datetime.fromisoformat(date_str).strftime("%I:%M:%S %p, %d/%m/%Y")) - history["result_measure_with_status"] = history.apply(lambda row: f"{legend_labels[str(int(row['result_measure']))]} ({row['result_status']})", axis=1) + history["result_measure_with_status"] = history.apply(lambda row: f"{legend_labels[str(int(row['result_measure'])) if not pd.isnull(row['result_measure']) else "0"]} ({row['result_status']})", axis=1) fig = px.timeline( history, @@ -814,7 +814,7 @@ def source_data_dialog(selected_row): else: query = get_test_issue_source_query(selected_row) if query: - st.code(query, language="sql", height=100) + st.code(query, language="sql", wrap_lines=True, height=100) with st.spinner("Retrieving source data..."): if selected_row["test_type"] == "CUSTOM": diff --git a/testgen/ui/views/test_suites.py b/testgen/ui/views/test_suites.py index 31a75f0..0345c42 100644 --- a/testgen/ui/views/test_suites.py +++ b/testgen/ui/views/test_suites.py @@ -105,14 +105,12 @@ def show_test_suite(mode, project_code, table_groups: Iterable[TableGroupMinimal connection_id = selected_test_suite.connection_id if mode == "edit" else None table_groups_id = selected_test_suite.table_groups_id if mode == "edit" else None test_suite_description = empty_if_null(selected_test_suite.test_suite_description) if mode == "edit" else "" - test_action = empty_if_null(selected_test_suite.test_action) if mode == "edit" else "" try: severity_index = severity_options.index(selected_test_suite.severity) if mode == "edit" else 0 except ValueError: severity_index = 0 export_to_observability = selected_test_suite.export_to_observability if mode == "edit" else False dq_score_exclude = selected_test_suite.dq_score_exclude if mode == "edit" else False - test_suite_schema = empty_if_null(selected_test_suite.test_suite_schema) if mode == "edit" else "" component_key = empty_if_null(selected_test_suite.component_key) if mode == "edit" else "" component_type = empty_if_null(selected_test_suite.component_type) if mode == "edit" else "dataset" component_name = empty_if_null(selected_test_suite.component_name) if mode == "edit" else "" @@ -140,7 +138,6 @@ def show_test_suite(mode, project_code, table_groups: Iterable[TableGroupMinimal "test_suite_description": left_column.text_input( label="Test Suite Description", max_chars=40, value=test_suite_description ), - "test_action": test_action, "severity": right_column.selectbox( label="Severity", options=severity_options, @@ -148,7 +145,6 @@ def show_test_suite(mode, project_code, table_groups: Iterable[TableGroupMinimal index=severity_index, help="Overrides the default severity in 'Test Definition' and/or 'Test Run'.", ), - "test_suite_schema": test_suite_schema, "export_to_observability": left_column.checkbox( "Export to Observability", value=export_to_observability, From 598f6d40022ae131606b57bb538b64d8775d2ad9 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Tue, 11 Nov 2025 02:36:07 -0500 Subject: [PATCH 10/28] misc: self-review fixes --- testgen/__main__.py | 1 + .../commands/queries/execute_tests_query.py | 22 +++++++++++-------- testgen/commands/run_test_execution.py | 15 +++++++------ 
testgen/commands/run_test_validation.py | 8 +++---- testgen/common/models/test_definition.py | 8 ++++--- .../030_initialize_new_schema_structure.sql | 2 +- .../dbupgrade/0158_incremental_upgrade.sql | 2 ++ .../disable_invalid_test_definitions.sql | 5 +++-- .../components/frontend/js/pages/test_runs.js | 2 +- 9 files changed, 38 insertions(+), 27 deletions(-) diff --git a/testgen/__main__.py b/testgen/__main__.py index 8217727..e4871f0 100644 --- a/testgen/__main__.py +++ b/testgen/__main__.py @@ -182,6 +182,7 @@ def run_test_generation(configuration: Configuration, table_group_id: str, test_ required=False, default=settings.DEFAULT_TEST_SUITE_KEY, ) +@with_database_session def run_tests(test_suite_id: str | None = None, project_key: str | None = None, test_suite_key: str | None = None): click.echo(f"run-tests for suite: {test_suite_id or test_suite_key}") # For backward compatibility diff --git a/testgen/commands/queries/execute_tests_query.py b/testgen/commands/queries/execute_tests_query.py index b794971..6caa156 100644 --- a/testgen/commands/queries/execute_tests_query.py +++ b/testgen/commands/queries/execute_tests_query.py @@ -1,7 +1,7 @@ import dataclasses from collections.abc import Iterable from datetime import datetime -from typing import Literal, TypedDict +from typing import TypedDict from uuid import UUID from testgen.common import read_template_sql_file @@ -9,10 +9,10 @@ from testgen.common.database.database_service import get_flavor_service, replace_params from testgen.common.models.connection import Connection from testgen.common.models.table_group import TableGroup +from testgen.common.models.test_definition import TestRunType, TestScope from testgen.common.models.test_run import TestRun from testgen.common.read_file import replace_templated_functions -TestRunType = Literal["QUERY", "CAT", "METADATA"] @dataclasses.dataclass class InputParameters: @@ -48,7 +48,7 @@ class TestExecutionDef(InputParameters): skip_errors: int custom_query: str run_type: TestRunType - test_scope: Literal["column", "referential", "table", "custom"] + test_scope: TestScope template_name: str measure: str test_operator: str @@ -201,7 +201,7 @@ def get_test_errors(self, test_defs: list[TestExecutionDef]) -> list[list[UUID | self._get_input_parameters(td), None, # No result_code on errors "Error", - ". ".join(td.errors)[:1000], + ". 
".join(td.errors), None, # No result_measure on errors ] for td in test_defs if td.errors ] @@ -252,6 +252,9 @@ def aggregate_cat_tests( aggregate_test_defs: list[list[TestExecutionDef]] = [] def add_query(test_defs: list[TestExecutionDef]) -> str: + if not test_defs: + return + query = ( f"SELECT {len(aggregate_queries)} AS query_index, " f"{concat_operator.join([td.measure_expression for td in test_defs])} AS result_measures, " @@ -282,16 +285,17 @@ def add_query(test_defs: list[TestExecutionDef]) -> str: current_test_defs = [] for td in test_defs: - current_chars += len(td.measure_expression) + len(td.condition_expression) + 2 * len(concat_operator) + td_chars = len(td.measure_expression) + len(td.condition_expression) + 2 * len(concat_operator) # Add new query if current query will become bigger than character limit - if current_chars > max_query_chars: + if (current_chars + td_chars) > max_query_chars: add_query(current_test_defs) current_chars = 0 current_test_defs = [] + + current_chars += td_chars current_test_defs.append(td) - if current_test_defs: - add_query(current_test_defs) + add_query(current_test_defs) return aggregate_queries, aggregate_test_defs @@ -300,7 +304,7 @@ def get_cat_test_results( aggregate_results: list[AggregateResult], aggregate_test_defs: list[list[TestExecutionDef]], ) -> list[list[UUID | str | datetime | int | None]]: - test_results: list[list[TestExecutionDef]] = [] + test_results: list[list[UUID | str | datetime | int | None]] = [] for result in aggregate_results: test_defs = aggregate_test_defs[result["query_index"]] result_measures = result["result_measures"].split("|") diff --git a/testgen/commands/run_test_execution.py b/testgen/commands/run_test_execution.py index 3c6c0af..3262c9c 100644 --- a/testgen/commands/run_test_execution.py +++ b/testgen/commands/run_test_execution.py @@ -3,6 +3,7 @@ import threading from datetime import UTC, datetime, timedelta from functools import partial +from typing import Literal from uuid import UUID import testgen.common.process_service as process_service @@ -83,9 +84,9 @@ def run_test_execution(test_suite_id: str | UUID, username: str | None = None, r LOG.info("Retrieving active test definitions in test suite") test_defs = fetch_dict_from_db(*sql_generator.get_active_test_definitions()) test_defs = [TestExecutionDef(**item) for item in test_defs] - LOG.info(f"Active test definitions: {len(test_defs)}") if test_defs: + LOG.info(f"Active test definitions: {len(test_defs)}") test_run.set_progress("validation", "Running") test_run.save() @@ -126,6 +127,7 @@ def run_test_execution(test_suite_id: str | UUID, username: str | None = None, r LOG.info("Updating test results and test run") test_run.save() execute_db_queries(sql_generator.update_test_results()) + # Refresh needed because previous query updates the test run too test_run.refresh() except Exception as e: LOG.exception("Test execution encountered an error.") @@ -162,13 +164,11 @@ def run_test_execution(test_suite_id: str | UUID, username: str | None = None, r """ -def _run_tests(sql_generator: TestExecutionSQL, run_type: TestRunType, test_defs: list[TestExecutionDef]) -> None: +def _run_tests(sql_generator: TestExecutionSQL, run_type: Literal["QUERY", "METADATA"], test_defs: list[TestExecutionDef]) -> None: test_run = sql_generator.test_run test_run.set_progress(run_type, "Running") test_run.save() - LOG.info(f"Running {run_type} tests: {len(test_defs)}") - def update_test_progress(progress: ThreadedProgress) -> None: test_run.set_progress( run_type, @@ -180,6 
+180,7 @@ def update_test_progress(progress: ThreadedProgress) -> None: ) test_run.save() + LOG.info(f"Running {run_type} tests: {len(test_defs)}") test_results, result_columns, error_data = fetch_from_db_threaded( [sql_generator.run_query_test(td) for td in test_defs], use_target_db=run_type != "METADATA", @@ -187,8 +188,8 @@ def update_test_progress(progress: ThreadedProgress) -> None: progress_callback=update_test_progress, ) - LOG.info(f"Writing {run_type} test results") if test_results: + LOG.info(f"Writing {run_type} test results") write_to_app_db(test_results, result_columns, sql_generator.test_results_table) if error_count := len(error_data): @@ -217,7 +218,6 @@ def _run_cat_tests(sql_generator: TestExecutionSQL, test_defs: list[TestExecutio total_count = len(test_defs) LOG.info(f"Aggregating CAT tests: {total_count}") aggregate_queries, aggregate_test_defs = sql_generator.aggregate_cat_tests(test_defs) - LOG.info(f"Running aggregated CAT test queries: {len(aggregate_queries)}") def update_aggegate_progress(progress: ThreadedProgress) -> None: processed_count = sum(len(aggregate_test_defs[index]) for index in progress["indexes"]) @@ -231,6 +231,7 @@ def update_aggegate_progress(progress: ThreadedProgress) -> None: ) test_run.save() + LOG.info(f"Running aggregated CAT test queries: {len(aggregate_queries)}") aggregate_results, _, aggregate_errors = fetch_from_db_threaded( aggregate_queries, use_target_db=True, @@ -252,7 +253,6 @@ def update_aggegate_progress(progress: ThreadedProgress) -> None: single_queries, single_test_defs = sql_generator.aggregate_cat_tests(error_test_defs, single=True) - LOG.info(f"Rerunning errored CAT tests singly: {len(single_test_defs)}") test_run.set_progress( "CAT", "Running", @@ -271,6 +271,7 @@ def update_single_progress(progress: ThreadedProgress) -> None: ) test_run.save() + LOG.info(f"Rerunning errored CAT tests singly: {len(single_test_defs)}") single_results, _, single_errors = fetch_from_db_threaded( single_queries, use_target_db=True, diff --git a/testgen/commands/run_test_validation.py b/testgen/commands/run_test_validation.py index 3d0d0af..55fb618 100644 --- a/testgen/commands/run_test_validation.py +++ b/testgen/commands/run_test_validation.py @@ -39,8 +39,8 @@ def add_test_error(test_ids: list[UUID], error: str) -> None: test_defs_by_id[test_id].errors.append(error) for td in test_defs: - # No validation needed for custom query - if td.test_type == "CUSTOM": + # No validation needed for custom query or table group tests + if td.test_type == "CUSTOM" or td.test_scope == "tablegroup": continue if td.schema_name and td.table_name and (td.column_name or td.test_scope in ["table", "custom"]): @@ -87,9 +87,9 @@ def add_test_error(test_ids: list[UUID], error: str) -> None: for identifier, test_ids in identifiers_to_check.items(): table = (identifier[0], identifier[1]) if table not in target_tables: - add_test_error(test_ids, f"Missing table: {".".join(table)}") + add_test_error(test_ids, f"Missing table: {'.'.join(table)}") elif identifier[2] and identifier not in target_columns: - add_test_error(test_ids, f"Missing column: {".".join(identifier)}") + add_test_error(test_ids, f"Missing column: {'.'.join(identifier)}") error_results = sql_generator.get_test_errors(test_defs_by_id.values()) if error_results: diff --git a/testgen/common/models/test_definition.py b/testgen/common/models/test_definition.py index 8422301..195121b 100644 --- a/testgen/common/models/test_definition.py +++ b/testgen/common/models/test_definition.py @@ -27,6 +27,8 @@ from 
testgen.common.models.entity import ENTITY_HASH_FUNCS, Entity, EntityMinimal from testgen.utils import is_uuid4 +TestRunType = Literal["QUERY", "CAT", "METADATA"] +TestScope = Literal["column", "referential", "table", "tablegroup", "custom"] TestRunStatus = Literal["Running", "Complete", "Error", "Cancelled"] @@ -82,7 +84,7 @@ class TestDefinitionSummary(EntityMinimal): default_parm_prompts: str default_parm_help: str default_severity: str - test_scope: str + test_scope: TestScope usage_notes: str @@ -133,8 +135,8 @@ class TestType(Entity): default_parm_prompts: str = Column(Text) default_parm_help: str = Column(Text) default_severity: str = Column(String) - run_type: str = Column(String) - test_scope: str = Column(String) + run_type: TestRunType = Column(String) + test_scope: TestScope = Column(String) dq_dimension: str = Column(String) health_dimension: str = Column(String) threshold_description: str = Column(String) diff --git a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql index 4e62476..dbf27fc 100644 --- a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql +++ b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql @@ -535,7 +535,7 @@ CREATE TABLE test_results ( result_code INTEGER, severity VARCHAR(10), result_status VARCHAR(10), - result_message VARCHAR(1000), + result_message VARCHAR, result_signal VARCHAR(1000), result_measure VARCHAR(1000), threshold_value VARCHAR(1000), diff --git a/testgen/template/dbupgrade/0158_incremental_upgrade.sql b/testgen/template/dbupgrade/0158_incremental_upgrade.sql index 46d0119..b2493c7 100644 --- a/testgen/template/dbupgrade/0158_incremental_upgrade.sql +++ b/testgen/template/dbupgrade/0158_incremental_upgrade.sql @@ -2,6 +2,7 @@ SET SEARCH_PATH TO {SCHEMA_NAME}; DROP VIEW IF EXISTS v_latest_profile_results CASCADE; DROP VIEW IF EXISTS v_queued_observability_results CASCADE; +DROP VIEW IF EXISTS v_test_results CASCADE; DROP SEQUENCE profile_results_dk_id_seq; DROP SEQUENCE test_definitions_cat_test_id_seq; @@ -27,6 +28,7 @@ ALTER TABLE test_runs ADD COLUMN progress JSONB; ALTER TABLE test_results + ALTER COLUMN result_message TYPE VARCHAR, DROP COLUMN starttime, DROP COLUMN endtime, DROP COLUMN test_action, diff --git a/testgen/template/execution/disable_invalid_test_definitions.sql b/testgen/template/execution/disable_invalid_test_definitions.sql index fd3616e..37ed5b1 100644 --- a/testgen/template/execution/disable_invalid_test_definitions.sql +++ b/testgen/template/execution/disable_invalid_test_definitions.sql @@ -1,6 +1,7 @@ UPDATE test_definitions td SET test_active = 'N', - test_definition_status = LEFT('Deactivated ' || :RUN_DATE || ': ' || tr.result_message, 200) + test_definition_status = LEFT('Deactivated ' || :RUN_DATE || '.' 
|| SUBSTRING(tr.result_message, 13), 200) FROM test_results tr WHERE td.id = tr.test_definition_id - AND tr.test_run_id = :TEST_RUN_ID; + AND tr.test_run_id = :TEST_RUN_ID + AND tr.result_status = 'Error'; diff --git a/testgen/ui/components/frontend/js/pages/test_runs.js b/testgen/ui/components/frontend/js/pages/test_runs.js index d8aa643..1904f52 100644 --- a/testgen/ui/components/frontend/js/pages/test_runs.js +++ b/testgen/ui/components/frontend/js/pages/test_runs.js @@ -393,7 +393,7 @@ const TestRunStatus = (/** @type TestRun */ item) => { ); }; -const ProgressTooltip = (/** @type ProfilingRun */ item) => { +const ProgressTooltip = (/** @type TestRun */ item) => { return div( { class: 'flex-column fx-gap-1' }, item.progress?.map(step => { From bebfae560d000c4ebfd02db5bef4ccf1cc7af5af Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Tue, 11 Nov 2025 03:07:46 -0500 Subject: [PATCH 11/28] fix(schedules): migrate test run schedules to use id --- testgen/commands/run_test_execution.py | 2 +- testgen/common/models/scheduler.py | 6 +++--- testgen/common/models/table_group.py | 2 +- testgen/common/models/test_suite.py | 7 ++----- testgen/template/dbupgrade/0158_incremental_upgrade.sql | 7 +++++++ testgen/ui/views/test_runs.py | 6 +++--- 6 files changed, 17 insertions(+), 13 deletions(-) diff --git a/testgen/commands/run_test_execution.py b/testgen/commands/run_test_execution.py index 3262c9c..0a0614f 100644 --- a/testgen/commands/run_test_execution.py +++ b/testgen/commands/run_test_execution.py @@ -8,7 +8,7 @@ import testgen.common.process_service as process_service from testgen import settings -from testgen.commands.queries.execute_tests_query import TestExecutionDef, TestExecutionSQL, TestRunType +from testgen.commands.queries.execute_tests_query import TestExecutionDef, TestExecutionSQL from testgen.commands.queries.rollup_scores_query import RollupScoresSQL from testgen.commands.run_refresh_score_cards_results import run_refresh_score_cards_results from testgen.common import ( diff --git a/testgen/common/models/scheduler.py b/testgen/common/models/scheduler.py index 3cd9cb7..da3f707 100644 --- a/testgen/common/models/scheduler.py +++ b/testgen/common/models/scheduler.py @@ -4,7 +4,7 @@ from uuid import UUID, uuid4 from cron_converter import Cron -from sqlalchemy import Boolean, Column, String, delete, func, select, update +from sqlalchemy import Boolean, Column, String, cast, delete, func, select, update from sqlalchemy.dialects import postgresql from sqlalchemy.orm import InstrumentedAttribute @@ -33,10 +33,10 @@ class JobSchedule(Base): def select_where(cls, *clauses, order_by: str | InstrumentedAttribute | None = None) -> Iterable[Self]: test_definitions_count = ( select(cls.id) - .join(TestSuite, TestSuite.test_suite == cls.kwargs["test_suite_key"].astext) + .join(TestSuite, TestSuite.id == cast(cls.kwargs["test_suite_id"].astext, postgresql.UUID)) .join(TestDefinition, TestDefinition.test_suite_id == TestSuite.id) .where(cls.key == RUN_TESTS_JOB_KEY, cls.active == True) - .group_by(cls.id, TestSuite.test_suite) + .group_by(cls.id, TestSuite.id) .having(func.count(TestDefinition.id) > 0) .subquery() ) diff --git a/testgen/common/models/table_group.py b/testgen/common/models/table_group.py index 2520c83..ae24af0 100644 --- a/testgen/common/models/table_group.py +++ b/testgen/common/models/table_group.py @@ -372,7 +372,7 @@ def save( cron_expr="0 * * * *", cron_tz=monitor_schedule_timezone, args=[], - kwargs={"project_key": self.project_code, "test_suite_key": test_suite.test_suite}, 
+ kwargs={"test_suite_id": test_suite.id}, ) db_session.add(schedule_job) diff --git a/testgen/common/models/test_suite.py b/testgen/common/models/test_suite.py index a0703a1..ce7d601 100644 --- a/testgen/common/models/test_suite.py +++ b/testgen/common/models/test_suite.py @@ -223,11 +223,8 @@ def cascade_delete(cls, ids: list[str]) -> None: DELETE FROM test_definitions WHERE test_suite_id IN :test_suite_ids; - DELETE FROM job_schedules js - USING test_suites ts - WHERE js.kwargs->>'project_key' = ts.project_code - AND js.kwargs->>'test_suite_key' = ts.test_suite - AND ts.id IN :test_suite_ids; + DELETE FROM job_schedules + WHERE (kwargs->>'test_suite_id')::UUID IN :test_suite_ids; """ db_session = get_current_session() db_session.execute(text(query), {"test_suite_ids": tuple(ids)}) diff --git a/testgen/template/dbupgrade/0158_incremental_upgrade.sql b/testgen/template/dbupgrade/0158_incremental_upgrade.sql index b2493c7..e33a6b2 100644 --- a/testgen/template/dbupgrade/0158_incremental_upgrade.sql +++ b/testgen/template/dbupgrade/0158_incremental_upgrade.sql @@ -35,3 +35,10 @@ ALTER TABLE test_results DROP COLUMN subset_condition, DROP COLUMN result_error_data, DROP COLUMN result_query; + +UPDATE job_schedules + SET kwargs = jsonb_build_object('test_suite_id', test_suites.id) +FROM test_suites +WHERE job_schedules.key = 'run-tests' + AND job_schedules.kwargs->>'project_key' = test_suites.project_code + AND job_schedules.kwargs->>'test_suite_key' = test_suites.test_suite; diff --git a/testgen/ui/views/test_runs.py b/testgen/ui/views/test_runs.py index 33cf379..765a2f6 100644 --- a/testgen/ui/views/test_runs.py +++ b/testgen/ui/views/test_runs.py @@ -118,16 +118,16 @@ def init(self) -> None: self.test_suites = TestSuite.select_minimal_where(TestSuite.project_code == self.project_code) def get_arg_value(self, job): - return job.kwargs["test_suite_key"] + return next(item.test_suite for item in self.test_suites if str(item.id) == job.kwargs["test_suite_id"]) def get_arg_value_options(self) -> list[dict[str, str]]: return [ - {"value": test_suite.test_suite, "label": test_suite.test_suite} + {"value": str(test_suite.id), "label": test_suite.test_suite} for test_suite in self.test_suites ] def get_job_arguments(self, arg_value: str) -> tuple[list[typing.Any], dict[str, typing.Any]]: - return [], {"project_key": self.project_code, "test_suite_key": arg_value} + return [], {"test_suite_id": str(arg_value)} def on_cancel_run(test_run: dict) -> None: From 94526a09eee2445266ecee48eb1d91265ac803cc Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Fri, 7 Nov 2025 01:36:26 -0500 Subject: [PATCH 12/28] fix(test-generation): handle pipes and newlines in LOV Match test --- .../test_types_LOV_Match.yaml | 106 +++++++++++++++++- 1 file changed, 105 insertions(+), 1 deletion(-) diff --git a/testgen/template/dbsetup_test_types/test_types_LOV_Match.yaml b/testgen/template/dbsetup_test_types/test_types_LOV_Match.yaml index 6656776..6d80ebc 100644 --- a/testgen/template/dbsetup_test_types/test_types_LOV_Match.yaml +++ b/testgen/template/dbsetup_test_types/test_types_LOV_Match.yaml @@ -18,7 +18,111 @@ test_types: column_name_help: null default_parm_columns: baseline_value,threshold_value default_parm_values: |- - '(' || SUBSTRING( CASE WHEN SPLIT_PART(top_freq_values, '|' , 2) > '' THEN ',''' || TRIM( REPLACE ( SPLIT_PART(top_freq_values, '|' , 2), '''' , '''''' ) ) || '''' ELSE '' END || CASE WHEN SPLIT_PART(top_freq_values, '|' , 4) > '' THEN ',''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, '|' , 4), '''' , 
'''''' )) || '''' ELSE '' END || CASE WHEN SPLIT_PART(top_freq_values, '|' , 6) > '' THEN ',''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, '|' , 6), '''' , '''''' )) || '''' ELSE '' END || CASE WHEN SPLIT_PART(top_freq_values, '|' , 8) > '' THEN ',''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, '|' , 8), '''' , '''''' )) || '''' ELSE '' END || CASE WHEN SPLIT_PART(top_freq_values, '|' , 10) > '' THEN ',''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, '|' , 10), '''' , '''''' )) || '''' ELSE '' END || CASE WHEN SPLIT_PART(top_freq_values, '|' , 12) > '' THEN ',''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, '|' , 12), '''' , '''''' )) || '''' ELSE '' END || CASE WHEN SPLIT_PART(top_freq_values, '|' , 14) > '' THEN ',''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, '|' , 14), '''' , '''''' )) || '''' ELSE '' END || CASE WHEN SPLIT_PART(top_freq_values, '|' , 16) > '' THEN ',''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, '|' , 16), '''' , '''''' )) || '''' ELSE '' END || CASE WHEN SPLIT_PART(top_freq_values, '|' , 18) > '' THEN ',''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, '|' , 18), '''' , '''''' )) || '''' ELSE '' END || CASE WHEN SPLIT_PART(top_freq_values, '|' , 20) > '' THEN ',''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, '|' , 20), '''' , '''''' )) || '''' ELSE '' END, 2, 999) || ')',0 + '(' || SUBSTRING( + CASE + WHEN SPLIT_PART(top_freq_values, E'\n', 1) > '' THEN ',''' || TRIM( + REPLACE ( + SPLIT_PART(SPLIT_PART(top_freq_values, E'\n', 1), ' | ', 1), + '''', + '''''' + ), + '| ' + ) || '''' + ELSE '' + END || CASE + WHEN SPLIT_PART(top_freq_values, E'\n', 2) > '' THEN ',''' || TRIM( + REPLACE ( + SPLIT_PART(SPLIT_PART(top_freq_values, E'\n', 2), ' | ', 1), + '''', + '''''' + ), + '| ' + ) || '''' + ELSE '' + END || CASE + WHEN SPLIT_PART(top_freq_values, E'\n', 3) > '' THEN ',''' || TRIM( + REPLACE ( + SPLIT_PART(SPLIT_PART(top_freq_values, E'\n', 3), ' | ', 1), + '''', + '''''' + ), + '| ' + ) || '''' + ELSE '' + END || CASE + WHEN SPLIT_PART(top_freq_values, E'\n', 4) > '' THEN ',''' || TRIM( + REPLACE ( + SPLIT_PART(SPLIT_PART(top_freq_values, E'\n', 4), ' | ', 1), + '''', + '''''' + ), + '| ' + ) || '''' + ELSE '' + END || CASE + WHEN SPLIT_PART(top_freq_values, E'\n', 5) > '' THEN ',''' || TRIM( + REPLACE ( + SPLIT_PART(SPLIT_PART(top_freq_values, E'\n', 5), ' | ', 1), + '''', + '''''' + ), + '| ' + ) || '''' + ELSE '' + END || CASE + WHEN SPLIT_PART(top_freq_values, E'\n', 6) > '' THEN ',''' || TRIM( + REPLACE ( + SPLIT_PART(SPLIT_PART(top_freq_values, E'\n', 6), ' | ', 1), + '''', + '''''' + ), + '| ' + ) || '''' + ELSE '' + END || CASE + WHEN SPLIT_PART(top_freq_values, E'\n', 7) > '' THEN ',''' || TRIM( + REPLACE ( + SPLIT_PART(SPLIT_PART(top_freq_values, E'\n', 7), ' | ', 1), + '''', + '''''' + ), + '| ' + ) || '''' + ELSE '' + END || CASE + WHEN SPLIT_PART(top_freq_values, E'\n', 8) > '' THEN ',''' || TRIM( + REPLACE ( + SPLIT_PART(SPLIT_PART(top_freq_values, E'\n', 8), ' | ', 1), + '''', + '''''' + ), + '| ' + ) || '''' + ELSE '' + END || CASE + WHEN SPLIT_PART(top_freq_values, E'\n', 9) > '' THEN ',''' || TRIM( + REPLACE ( + SPLIT_PART(SPLIT_PART(top_freq_values, E'\n', 9), ' | ', 1), + '''', + '''''' + ), + '| ' + ) || '''' + ELSE '' + END || CASE + WHEN SPLIT_PART(top_freq_values, E'\n', 10) > '' THEN ',''' || TRIM( + REPLACE ( + SPLIT_PART(SPLIT_PART(top_freq_values, E'\n', 10), ' | ', 1), + '''', + '''''' + ), + '| ' + ) || '''' + ELSE '' + END, + 2, + 999 + ) || ')',0 default_parm_prompts: |- List of Expected Values,Threshold Error Count default_parm_help: 
null From 1c1096e1f2ef80947801ff0b216ed81949101005 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Mon, 10 Nov 2025 12:01:21 -0500 Subject: [PATCH 13/28] fix(lookup): add templated limit counts to lookup queries --- ..._anomaly_types_Boolean_Value_Mismatch.yaml | 15 +++++----- ...anomaly_types_Char_Column_Date_Values.yaml | 16 +++++------ ...omaly_types_Char_Column_Number_Values.yaml | 16 +++++------ ...anomaly_types_Column_Pattern_Mismatch.yaml | 20 ++++++------- ...anomaly_types_Delimited_Data_Embedded.yaml | 14 +++++----- ...ile_anomaly_types_Inconsistent_Casing.yaml | 28 +++++++++---------- ...rofile_anomaly_types_Invalid_Zip3_USA.yaml | 14 +++++----- ...profile_anomaly_types_Invalid_Zip_USA.yaml | 14 +++++----- .../profile_anomaly_types_Leading_Spaces.yaml | 12 ++++---- ...le_anomaly_types_Multiple_Types_Major.yaml | 15 +++++----- ...le_anomaly_types_Multiple_Types_Minor.yaml | 15 +++++----- .../profile_anomaly_types_No_Values.yaml | 15 +++++----- ..._anomaly_types_Non_Alpha_Name_Address.yaml | 14 +++++----- ...anomaly_types_Non_Alpha_Prefixed_Name.yaml | 14 +++++----- ...file_anomaly_types_Non_Printing_Chars.yaml | 12 ++++---- ...ile_anomaly_types_Non_Standard_Blanks.yaml | 15 +++++----- ...le_anomaly_types_Potential_Duplicates.yaml | 14 +++++----- .../profile_anomaly_types_Potential_PII.yaml | 14 +++++----- .../profile_anomaly_types_Quoted_Values.yaml | 14 +++++----- ...nomaly_types_Small_Divergent_Value_Ct.yaml | 12 ++++---- ..._anomaly_types_Small_Missing_Value_Ct.yaml | 12 ++++---- ..._anomaly_types_Small_Numeric_Value_Ct.yaml | 16 +++++------ ...maly_types_Standardized_Value_Matches.yaml | 14 +++++----- .../profile_anomaly_types_Suggested_Type.yaml | 14 +++++----- ..._anomaly_types_Table_Pattern_Mismatch.yaml | 15 +++++----- ...ofile_anomaly_types_Unexpected_Emails.yaml | 14 +++++----- ...le_anomaly_types_Unexpected_US_States.yaml | 14 +++++----- ...le_anomaly_types_Unlikely_Date_Values.yaml | 14 +++++----- ...le_anomaly_types_Variant_Coded_Values.yaml | 15 +++++----- .../test_types_Aggregate_Balance.yaml | 20 ++++++++----- .../test_types_Aggregate_Balance_Percent.yaml | 20 ++++++++----- .../test_types_Aggregate_Balance_Range.yaml | 20 ++++++++----- .../test_types_Aggregate_Minimum.yaml | 20 ++++++++----- .../test_types_Alpha_Trunc.yaml | 14 +++++----- .../test_types_Combo_Match.yaml | 20 ++++++++----- .../test_types_Condition_Flag.yaml | 14 +++++----- .../test_types_Constant.yaml | 14 +++++----- .../test_types_Daily_Record_Ct.yaml | 14 +++++----- .../test_types_Dec_Trunc.yaml | 14 +++++----- .../test_types_Distinct_Date_Ct.yaml | 14 +++++----- .../test_types_Distinct_Value_Ct.yaml | 14 +++++----- .../test_types_Distribution_Shift.yaml | 12 ++++++-- .../test_types_Dupe_Rows.yaml | 10 +++++-- .../test_types_Email_Format.yaml | 14 +++++----- .../test_types_Future_Date.yaml | 14 +++++----- .../test_types_Future_Date_1Y.yaml | 14 +++++----- .../test_types_LOV_All.yaml | 14 +++++----- .../test_types_LOV_Match.yaml | 14 +++++----- .../test_types_Min_Date.yaml | 14 +++++----- .../test_types_Min_Val.yaml | 14 +++++----- .../test_types_Missing_Pct.yaml | 14 +++++----- .../test_types_Monthly_Rec_Ct.yaml | 15 +++++----- .../test_types_Pattern_Match.yaml | 15 +++++----- .../test_types_Recency.yaml | 15 +++++----- .../test_types_Required.yaml | 14 +++++----- .../test_types_Street_Addr_Pattern.yaml | 14 +++++----- .../test_types_Timeframe_Combo_Gain.yaml | 9 ++++-- .../test_types_Timeframe_Combo_Match.yaml | 18 ++++++++++-- .../test_types_US_State.yaml | 14 +++++----- 
.../dbsetup_test_types/test_types_Unique.yaml | 14 +++++----- .../test_types_Unique_Pct.yaml | 14 +++++----- .../test_types_Valid_Characters.yaml | 14 +++++----- .../test_types_Valid_US_Zip.yaml | 14 +++++----- .../test_types_Valid_US_Zip3.yaml | 14 +++++----- .../test_types_Weekly_Rec_Ct.yaml | 15 +++++----- testgen/ui/queries/source_data_queries.py | 21 ++++++++++---- 66 files changed, 531 insertions(+), 452 deletions(-) diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Boolean_Value_Mismatch.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Boolean_Value_Mismatch.yaml index fae0ec4..fc3bd2e 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Boolean_Value_Mismatch.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Boolean_Value_Mismatch.yaml @@ -32,7 +32,8 @@ profile_anomaly_types: SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` - ORDER BY COUNT(*) DESC; + ORDER BY COUNT(*) DESC + LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1287' test_id: '1015' @@ -40,7 +41,7 @@ profile_anomaly_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY count DESC; + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1129' test_id: '1015' @@ -48,7 +49,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; + SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; error_type: Profile Anomaly - id: '1072' test_id: '1015' @@ -56,7 +57,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1047' test_id: '1015' @@ -64,7 +65,7 @@ profile_anomaly_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1447' test_id: '1015' @@ -72,7 +73,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1186' test_id: '1015' @@ -80,5 +81,5 @@ profile_anomaly_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY 
"{COLUMN_NAME}" ORDER BY COUNT(*) DESC; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Date_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Date_Values.yaml index a7371d2..d769024 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Date_Values.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Date_Values.yaml @@ -34,7 +34,7 @@ profile_anomaly_types: WHERE SAFE_CAST(CAST(`{COLUMN_NAME}` AS STRING) AS DATE) IS NOT NULL GROUP BY `{COLUMN_NAME}` ORDER BY count DESC - LIMIT 10 + LIMIT {LIMIT_2} ) UNION ALL ( @@ -43,7 +43,7 @@ profile_anomaly_types: WHERE SAFE_CAST(CAST(`{COLUMN_NAME}` AS STRING) AS DATE) IS NULL GROUP BY `{COLUMN_NAME}` ORDER BY count DESC - LIMIT 10 + LIMIT {LIMIT_2} ) ORDER BY data_type, count DESC; error_type: Profile Anomaly @@ -53,7 +53,7 @@ profile_anomaly_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT A.* FROM (SELECT DISTINCT 'Date' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE <%IS_DATE;`{COLUMN_NAME}`%> = 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 10) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Date' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE <%IS_DATE;`{COLUMN_NAME}`%> != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC) AS B ORDER BY data_type, count DESC LIMIT 10; + SELECT A.* FROM (SELECT DISTINCT 'Date' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE <%IS_DATE;`{COLUMN_NAME}`%> = 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT {LIMIT_2}) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Date' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE <%IS_DATE;`{COLUMN_NAME}`%> != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT {LIMIT_2}) AS B ORDER BY data_type, count DESC; error_type: Profile Anomaly - id: '1126' test_id: '1012' @@ -61,7 +61,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT A.* FROM ( SELECT DISTINCT TOP 10 'Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP 10 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC; + SELECT A.* FROM ( SELECT DISTINCT TOP {LIMIT_2} 'Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP {LIMIT_2} 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC; error_type: Profile Anomaly - id: '1069' test_id: '1012' @@ -69,7 +69,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT A.* FROM ( SELECT DISTINCT 'Date' as 
data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10) AS B ORDER BY data_type, count DESC; + SELECT A.* FROM ( SELECT DISTINCT 'Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS B ORDER BY data_type, count DESC; error_type: Profile Anomaly - id: '1044' test_id: '1012' @@ -77,7 +77,7 @@ profile_anomaly_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT A.* FROM ( SELECT TOP 10 DISTINCT 'Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC; + SELECT A.* FROM ( SELECT DISTINCT 'Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS B ORDER BY data_type, count DESC; error_type: Profile Anomaly - id: '1444' test_id: '1012' @@ -85,7 +85,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT A.* FROM ( SELECT TOP 10 DISTINCT 'Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC; + SELECT A.* FROM ( SELECT DISTINCT 'Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS B ORDER BY data_type, count DESC; error_type: Profile Anomaly - id: '1183' test_id: '1012' @@ -93,5 +93,5 @@ profile_anomaly_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT A.* FROM (SELECT DISTINCT TOP 10 'Date' as data_type, "{COLUMN_NAME}", 
COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 10 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS B ORDER BY data_type, count DESC; + SELECT A.* FROM (SELECT DISTINCT 'Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS B ORDER BY data_type, count DESC; error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Number_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Number_Values.yaml index 12cccad..9c600ba 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Number_Values.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Number_Values.yaml @@ -34,7 +34,7 @@ profile_anomaly_types: WHERE SAFE_CAST(CAST(`{COLUMN_NAME}` AS STRING) AS FLOAT64) IS NOT NULL GROUP BY `{COLUMN_NAME}` ORDER BY count DESC - LIMIT 10 + LIMIT {LIMIT_2} ) UNION ALL ( @@ -43,7 +43,7 @@ profile_anomaly_types: WHERE SAFE_CAST(CAST(`{COLUMN_NAME}` AS STRING) AS FLOAT64) IS NULL GROUP BY `{COLUMN_NAME}` ORDER BY count DESC - LIMIT 10 + LIMIT {LIMIT_2} ) ORDER BY data_type, count DESC; error_type: Profile Anomaly @@ -53,7 +53,7 @@ profile_anomaly_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT A.* FROM (SELECT DISTINCT 'Numeric' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE <%IS_NUM;`{COLUMN_NAME}`%> = 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 10) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Numeric' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE <%IS_NUM;`{COLUMN_NAME}`%> != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC) AS B ORDER BY data_type, count DESC LIMIT 10; + SELECT A.* FROM (SELECT DISTINCT 'Numeric' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE <%IS_NUM;`{COLUMN_NAME}`%> = 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT {LIMIT_2}) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Numeric' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE <%IS_NUM;`{COLUMN_NAME}`%> != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT {LIMIT_2}) AS B ORDER BY data_type, count DESC; error_type: Profile Anomaly - id: '1125' test_id: '1011' @@ -61,7 +61,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT A.* FROM ( SELECT DISTINCT TOP 10 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP 10 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY 
count DESC ) AS B ORDER BY data_type, count DESC; + SELECT A.* FROM ( SELECT DISTINCT TOP {LIMIT_2} 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP {LIMIT_2} 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC; error_type: Profile Anomaly - id: '1068' test_id: '1011' @@ -69,7 +69,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT A.* FROM ( SELECT DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS B ORDER BY data_type, count DESC; + SELECT A.* FROM ( SELECT DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS B ORDER BY data_type, count DESC; error_type: Profile Anomaly - id: '1043' test_id: '1011' @@ -77,7 +77,7 @@ profile_anomaly_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT A.* FROM ( SELECT TOP 10 DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC; + SELECT A.* FROM ( SELECT DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS B ORDER BY data_type, count DESC; error_type: Profile Anomaly - id: '1443' test_id: '1011' @@ -85,7 +85,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT A.* FROM ( SELECT TOP 10 DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS 
B ORDER BY data_type, count DESC; + SELECT A.* FROM ( SELECT DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS B ORDER BY data_type, count DESC; error_type: Profile Anomaly - id: '1182' test_id: '1011' @@ -93,5 +93,5 @@ profile_anomaly_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT A.* FROM (SELECT DISTINCT TOP 10 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 10 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS B ORDER BY data_type, count DESC; + SELECT A.* FROM (SELECT DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS B ORDER BY data_type, count DESC; error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Column_Pattern_Mismatch.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Column_Pattern_Mismatch.yaml index 987d9f0..7bdd0df 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Column_Pattern_Mismatch.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Column_Pattern_Mismatch.yaml @@ -42,7 +42,7 @@ profile_anomaly_types: WHERE REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(CAST(`{COLUMN_NAME}` AS STRING), r'[a-z]', 'a'), r'[A-Z]', 'A'), r'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC - LIMIT 5 + LIMIT {LIMIT_4} ) UNION ALL ( @@ -52,7 +52,7 @@ profile_anomaly_types: WHERE REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(CAST(`{COLUMN_NAME}` AS STRING), r'[a-z]', 'a'), r'[A-Z]', 'A'), r'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC - LIMIT 5 + LIMIT {LIMIT_4} ) UNION ALL ( @@ -62,7 +62,7 @@ profile_anomaly_types: WHERE REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(CAST(`{COLUMN_NAME}` AS STRING), r'[a-z]', 'a'), r'[A-Z]', 'A'), r'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC - LIMIT 5 + LIMIT {LIMIT_4} ) UNION ALL ( @@ -72,7 +72,7 @@ profile_anomaly_types: WHERE REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(CAST(`{COLUMN_NAME}` AS STRING), r'[a-z]', 'a'), r'[A-Z]', 'A'), r'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC - LIMIT 5 + LIMIT {LIMIT_4} ) ORDER BY top_pattern DESC, count DESC; error_type: Profile Anomaly @@ -82,7 +82,7 @@ profile_anomaly_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT A.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, 
COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT 5) A UNION ALL SELECT B.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT 5) B UNION ALL SELECT C.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT 5) C UNION ALL SELECT D.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT 5) D ORDER BY top_pattern DESC, count DESC; + SELECT A.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT {LIMIT_4}) A UNION ALL SELECT B.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT {LIMIT_4}) B UNION ALL SELECT C.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT {LIMIT_4}) C UNION ALL SELECT D.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT {LIMIT_4}) D ORDER BY top_pattern DESC, count DESC; error_type: Profile Anomaly - id: '1121' test_id: '1007' @@ -90,7 +90,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - WITH cte AS ( SELECT TRIM(value) AS top_pattern, 
ROW_NUMBER() OVER (ORDER BY CHARINDEX('| '+ TRIM(value) + ' |', '| ' + '{DETAIL_EXPRESSION}' + ' |' ) ASC) as row_num FROM STRING_SPLIT('{DETAIL_EXPRESSION}', '|') ) SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte c WHERE c.row_num = 4 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte c WHERE c.row_num = 6 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte c WHERE c.row_num = 8 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte c WHERE c.row_num = 10 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" ORDER BY top_pattern DESC, count DESC; + WITH cte AS ( SELECT TRIM(value) AS top_pattern, ROW_NUMBER() OVER (ORDER BY CHARINDEX('| '+ TRIM(value) + ' |', '| ' + '{DETAIL_EXPRESSION}' + ' |' ) ASC) as row_num FROM STRING_SPLIT('{DETAIL_EXPRESSION}', '|') ) SELECT DISTINCT TOP {LIMIT_4} c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte c WHERE c.row_num = 4 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP {LIMIT_4} c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte c WHERE c.row_num = 6 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP {LIMIT_4} c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte c WHERE c.row_num = 8 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP {LIMIT_4} c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte c WHERE c.row_num = 10 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 
'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" ORDER BY top_pattern DESC, count DESC; error_type: Profile Anomaly - id: '1064' test_id: '1007' @@ -98,7 +98,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT A.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a', 'g'), '[A-Z]', 'A', 'g'), '[0-9]', 'N', 'g') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5 ) A UNION ALL SELECT B.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a', 'g'), '[A-Z]', 'A', 'g'), '[0-9]', 'N', 'g') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5 ) B UNION ALL SELECT C.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a', 'g'), '[A-Z]', 'A', 'g'), '[0-9]', 'N', 'g') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5 ) C UNION ALL SELECT D.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a', 'g'), '[A-Z]', 'A', 'g'), '[0-9]', 'N', 'g') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5) D ORDER BY top_pattern DESC, count DESC; + SELECT A.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a', 'g'), '[A-Z]', 'A', 'g'), '[0-9]', 'N', 'g') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) A UNION ALL SELECT B.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a', 'g'), '[A-Z]', 'A', 'g'), '[0-9]', 'N', 'g') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) B UNION ALL SELECT C.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a', 'g'), '[A-Z]', 'A', 'g'), '[0-9]', 'N', 'g') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) C UNION ALL SELECT D.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 10)) AS top_pattern) b WHERE 
REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a', 'g'), '[A-Z]', 'A', 'g'), '[0-9]', 'N', 'g') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) D ORDER BY top_pattern DESC, count DESC; error_type: Profile Anomaly - id: '1039' test_id: '1007' @@ -106,7 +106,7 @@ profile_anomaly_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT A.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) A UNION ALL SELECT B.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) B UNION ALL SELECT C.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) C UNION ALL SELECT D.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) D ORDER BY top_pattern DESC, count DESC; + SELECT A.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) A UNION ALL SELECT B.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) B UNION ALL SELECT C.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) C UNION ALL SELECT D.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( 
REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) D ORDER BY top_pattern DESC, count DESC; error_type: Profile Anomaly - id: '1439' test_id: '1007' @@ -114,7 +114,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT A.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) A UNION ALL SELECT B.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) B UNION ALL SELECT C.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) C UNION ALL SELECT D.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) D ORDER BY top_pattern DESC, count DESC; + SELECT A.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) A UNION ALL SELECT B.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) B UNION ALL SELECT C.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) C UNION ALL SELECT D.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 
'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) D ORDER BY top_pattern DESC, count DESC; error_type: Profile Anomaly - id: '1178' test_id: '1007' @@ -122,5 +122,5 @@ profile_anomaly_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT A.* FROM (SELECT DISTINCT TOP 5 b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC) A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 5 b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC) B UNION ALL SELECT C.* FROM (SELECT DISTINCT TOP 5 b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC) C UNION ALL SELECT D.* FROM (SELECT DISTINCT TOP 5 b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC) D ORDER BY top_pattern DESC, count DESC; + SELECT A.* FROM (SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) A UNION ALL SELECT B.* FROM (SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) B UNION ALL SELECT C.* FROM (SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) C UNION ALL SELECT D.* FROM (SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( 
"{COLUMN_NAME}"::VARCHAR, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) D ORDER BY top_pattern DESC, count DESC; error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Delimited_Data_Embedded.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Delimited_Data_Embedded.yaml index aea55e6..f2a2ade 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Delimited_Data_Embedded.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Delimited_Data_Embedded.yaml @@ -28,7 +28,7 @@ profile_anomaly_types: AND NOT REGEXP_CONTAINS(CAST(`{COLUMN_NAME}` AS STRING), r'.*\s(and|but|or|yet)\s.*') GROUP BY `{COLUMN_NAME}` ORDER BY COUNT(*) DESC - LIMIT 500; + LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1297' test_id: '1025' @@ -36,7 +36,7 @@ profile_anomaly_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE REGEXP_LIKE(`{COLUMN_NAME}`::STRING, '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$') AND NOT REGEXP_LIKE(`{COLUMN_NAME}`::STRING, '.*\\s(and|but|or|yet)\\s.*') GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 500; + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE REGEXP_LIKE(`{COLUMN_NAME}`::STRING, '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$') AND NOT REGEXP_LIKE(`{COLUMN_NAME}`::STRING, '.*\\s(and|but|or|yet)\\s.*') GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1139' test_id: '1025' @@ -44,7 +44,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE ( "{COLUMN_NAME}" LIKE '%,%,%,%' OR "{COLUMN_NAME}" LIKE '%|%|%|%' OR "{COLUMN_NAME}" LIKE '%^%^%^%' OR "{COLUMN_NAME}" LIKE '%' + CHAR(9) + '%' + CHAR(9) + '%' + CHAR(9) + '%' ) AND NOT ( "{COLUMN_NAME}" LIKE '% and %' OR "{COLUMN_NAME}" LIKE '% but %' OR "{COLUMN_NAME}" LIKE '% or %' OR "{COLUMN_NAME}" LIKE '% yet %' ) AND ISNULL(CAST(LEN("{COLUMN_NAME}") - LEN(REPLACE("{COLUMN_NAME}", ',', '')) as FLOAT) / CAST(NULLIF(LEN("{COLUMN_NAME}") - LEN(REPLACE("{COLUMN_NAME}", ' ', '')), 0) as FLOAT), 1) > 0.6 GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; + SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE ( "{COLUMN_NAME}" LIKE '%,%,%,%' OR "{COLUMN_NAME}" LIKE '%|%|%|%' OR "{COLUMN_NAME}" LIKE '%^%^%^%' OR "{COLUMN_NAME}" LIKE '%' + CHAR(9) + '%' + CHAR(9) + '%' + CHAR(9) + '%' ) AND NOT ( "{COLUMN_NAME}" LIKE '% and %' OR "{COLUMN_NAME}" LIKE '% but %' OR "{COLUMN_NAME}" LIKE '% or %' OR "{COLUMN_NAME}" LIKE '% yet %' ) AND ISNULL(CAST(LEN("{COLUMN_NAME}") - LEN(REPLACE("{COLUMN_NAME}", ',', '')) as FLOAT) / CAST(NULLIF(LEN("{COLUMN_NAME}") - LEN(REPLACE("{COLUMN_NAME}", ' ', '')), 0) as FLOAT), 1) > 0.6 GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; error_type: Profile Anomaly - id: '1082' test_id: '1025' @@ -52,7 +52,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" ~ '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$' AND "{COLUMN_NAME}" !~ '\s(and|but|or|yet)\s' GROUP BY 
"{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" ~ '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$' AND "{COLUMN_NAME}" !~ '\s(and|but|or|yet)\s' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1057' test_id: '1025' @@ -60,7 +60,7 @@ profile_anomaly_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" ~ '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$' AND "{COLUMN_NAME}" !~ '\\s(and|but|or|yet)\\s' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" ~ '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$' AND "{COLUMN_NAME}" !~ '\\s(and|but|or|yet)\\s' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1457' test_id: '1025' @@ -68,7 +68,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" ~ '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$' AND "{COLUMN_NAME}" !~ '\\s(and|but|or|yet)\\s' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" ~ '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$' AND "{COLUMN_NAME}" !~ '\\s(and|but|or|yet)\\s' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1196' test_id: '1025' @@ -76,5 +76,5 @@ profile_anomaly_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$') AND NOT REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, '.*\\s(and|but|or|yet)\\s.*') GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$') AND NOT REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, '.*\\s(and|but|or|yet)\\s.*') GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Inconsistent_Casing.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Inconsistent_Casing.yaml index b09f870..6443d84 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Inconsistent_Casing.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Inconsistent_Casing.yaml @@ -28,7 +28,7 @@ profile_anomaly_types: FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE UPPER(CAST(`{COLUMN_NAME}` AS STRING)) = CAST(`{COLUMN_NAME}` AS STRING) GROUP BY `{COLUMN_NAME}` - LIMIT 20 + LIMIT {LIMIT_2} ) UNION ALL ( @@ -37,7 +37,7 @@ profile_anomaly_types: WHERE CAST(`{COLUMN_NAME}` AS STRING) <> UPPER(CAST(`{COLUMN_NAME}` AS STRING)) AND CAST(`{COLUMN_NAME}` AS STRING) <> LOWER(CAST(`{COLUMN_NAME}` AS STRING)) GROUP BY 
`{COLUMN_NAME}` - LIMIT 20 + LIMIT {LIMIT_2} ); error_type: Profile Anomaly - id: '1262' @@ -48,11 +48,11 @@ profile_anomaly_types: lookup_query: |- (SELECT 'Upper Case' as casing, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE UPPER(`{COLUMN_NAME}`) = `{COLUMN_NAME}` - GROUP BY `{COLUMN_NAME}` LIMIT 20) + GROUP BY `{COLUMN_NAME}` LIMIT {LIMIT_2}) UNION ALL (SELECT 'Mixed Case' as casing, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` <> UPPER(`{COLUMN_NAME}`) AND `{COLUMN_NAME}` <> LOWER(`{COLUMN_NAME}`) - GROUP BY `{COLUMN_NAME}` LIMIT 20) + GROUP BY `{COLUMN_NAME}` LIMIT {LIMIT_2}) error_type: Profile Anomaly - id: '1260' test_id: '1028' @@ -60,11 +60,11 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 20 'Upper Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + SELECT TOP {LIMIT_2} 'Upper Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE UPPER("{COLUMN_NAME}") = "{COLUMN_NAME}" GROUP BY "{COLUMN_NAME}" UNION ALL - SELECT TOP 20 'Mixed Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + SELECT TOP {LIMIT_2} 'Mixed Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}") GROUP BY "{COLUMN_NAME}" error_type: Profile Anomaly @@ -76,11 +76,11 @@ profile_anomaly_types: lookup_query: |- (SELECT 'Upper Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE UPPER("{COLUMN_NAME}") = "{COLUMN_NAME}" - GROUP BY "{COLUMN_NAME}" LIMIT 20) + GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT_2}) UNION ALL (SELECT 'Mixed Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}") - GROUP BY "{COLUMN_NAME}" LIMIT 20) + GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT_2}) error_type: Profile Anomaly - id: '1258' test_id: '1028' @@ -90,11 +90,11 @@ profile_anomaly_types: lookup_query: |- (SELECT 'Upper Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE UPPER("{COLUMN_NAME}") = "{COLUMN_NAME}" - GROUP BY "{COLUMN_NAME}" LIMIT 20) + GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT_2}) UNION ALL (SELECT 'Mixed Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}") - GROUP BY "{COLUMN_NAME}" LIMIT 20) + GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT_2}) error_type: Profile Anomaly - id: '1473' test_id: '1028' @@ -104,11 +104,11 @@ profile_anomaly_types: lookup_query: |- (SELECT 'Upper Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE UPPER("{COLUMN_NAME}") = "{COLUMN_NAME}" - GROUP BY "{COLUMN_NAME}" LIMIT 20) + GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT_2}) UNION ALL (SELECT 'Mixed Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}") - GROUP BY "{COLUMN_NAME}" LIMIT 20) + GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT_2}) error_type: Profile Anomaly - id: '1261' test_id: '1028' @@ -118,9 +118,9 @@ profile_anomaly_types: lookup_query: |- (SELECT 'Upper Case' as casing, "{COLUMN_NAME}", 
COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE UPPER("{COLUMN_NAME}") = "{COLUMN_NAME}" - GROUP BY "{COLUMN_NAME}" LIMIT 20) + GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT_2}) UNION ALL (SELECT 'Mixed Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}") - GROUP BY "{COLUMN_NAME}" LIMIT 20) + GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT_2}) error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip3_USA.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip3_USA.yaml index 87576c2..876661d 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip3_USA.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip3_USA.yaml @@ -31,7 +31,7 @@ profile_anomaly_types: WHERE TRANSLATE(CAST(`{COLUMN_NAME}` AS STRING), '012345678', '999999999') <> '999' GROUP BY `{COLUMN_NAME}` ORDER BY count DESC, `{COLUMN_NAME}` - LIMIT 500; + LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1296' test_id: '1024' @@ -39,7 +39,7 @@ profile_anomaly_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE TRANSLATE(`{COLUMN_NAME}`,'012345678','999999999') <> '999' GROUP BY `{COLUMN_NAME}` ORDER BY count DESC, `{COLUMN_NAME}` LIMIT 500; + SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE TRANSLATE(`{COLUMN_NAME}`,'012345678','999999999') <> '999' GROUP BY `{COLUMN_NAME}` ORDER BY count DESC, `{COLUMN_NAME}` LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1138' test_id: '1024' @@ -47,7 +47,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}"; + SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}"; error_type: Profile Anomaly - id: '1081' test_id: '1024' @@ -55,7 +55,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT 500; + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1056' test_id: '1024' @@ -63,7 +63,7 @@ profile_anomaly_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT 500; + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1456' test_id: 
'1024' @@ -71,7 +71,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT 500; + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1195' test_id: '1024' @@ -79,5 +79,5 @@ profile_anomaly_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT 500; + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip_USA.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip_USA.yaml index 03c47fc..400424a 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip_USA.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip_USA.yaml @@ -27,7 +27,7 @@ profile_anomaly_types: WHERE TRANSLATE(CAST(`{COLUMN_NAME}` AS STRING), '012345678', '999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` - LIMIT 500; + LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1275' test_id: '1003' @@ -35,7 +35,7 @@ profile_anomaly_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE TRANSLATE(`{COLUMN_NAME}`,'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT 500; + SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE TRANSLATE(`{COLUMN_NAME}`,'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1117' test_id: '1003' @@ -43,7 +43,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; error_type: Profile Anomaly - id: '1060' test_id: '1003' @@ -51,7 +51,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500; + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM 
"{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1035' test_id: '1003' @@ -59,7 +59,7 @@ profile_anomaly_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500; + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1435' test_id: '1003' @@ -67,7 +67,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500; + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1174' test_id: '1003' @@ -75,5 +75,5 @@ profile_anomaly_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500; + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Leading_Spaces.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Leading_Spaces.yaml index a6dc9c9..4231f42 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Leading_Spaces.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Leading_Spaces.yaml @@ -35,7 +35,7 @@ profile_anomaly_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE (CASE WHEN `{COLUMN_NAME}` BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}`; + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE (CASE WHEN `{COLUMN_NAME}` BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1123' test_id: '1009' @@ -43,7 +43,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' 
THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; error_type: Profile Anomaly - id: '1066' test_id: '1009' @@ -51,7 +51,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1041' test_id: '1009' @@ -59,7 +59,7 @@ profile_anomaly_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1441' test_id: '1009' @@ -67,7 +67,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1180' test_id: '1009' @@ -75,5 +75,5 @@ profile_anomaly_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' 
THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Major.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Major.yaml index f6bc2d4..9f3e805 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Major.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Major.yaml @@ -40,7 +40,8 @@ profile_anomaly_types: WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' - ORDER BY data_type, table_name; + ORDER BY data_type, table_name + LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1277' test_id: '1005' @@ -48,7 +49,7 @@ profile_anomaly_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE 'timestamp%' THEN lower(data_type) WHEN data_type ILIKE 'date' THEN lower(data_type) WHEN data_type ILIKE 'boolean' THEN 'boolean' WHEN data_type = 'TEXT' THEN 'varchar(' || CAST(character_maximum_length AS STRING) || ')' WHEN data_type ILIKE 'char%' THEN 'char(' || CAST(character_maximum_length AS STRING) || ')' WHEN data_type = 'NUMBER' AND numeric_precision = 38 AND numeric_scale = 0 THEN 'bigint' WHEN data_type ILIKE 'num%' THEN 'numeric(' || CAST(numeric_precision AS STRING) || ',' || CAST(numeric_scale AS STRING) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name; + SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE 'timestamp%' THEN lower(data_type) WHEN data_type ILIKE 'date' THEN lower(data_type) WHEN data_type ILIKE 'boolean' THEN 'boolean' WHEN data_type = 'TEXT' THEN 'varchar(' || CAST(character_maximum_length AS STRING) || ')' WHEN data_type ILIKE 'char%' THEN 'char(' || CAST(character_maximum_length AS STRING) || ')' WHEN data_type = 'NUMBER' AND numeric_precision = 38 AND numeric_scale = 0 THEN 'bigint' WHEN data_type ILIKE 'num%' THEN 'numeric(' || CAST(numeric_precision AS STRING) || ',' || CAST(numeric_scale AS STRING) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1119' test_id: '1005' @@ -56,7 +57,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 500 column_name, columns.table_name, CASE WHEN data_type = 'datetime' THEN 'datetime' WHEN data_type = 'datetime2' THEN 'datetime' WHEN data_type = 'varchar' THEN 'varchar(' + CAST(character_maximum_length AS VARCHAR) + ')' WHEN data_type = 'char' THEN 'char(' + CAST(character_maximum_length AS VARCHAR) + ')' WHEN data_type = 'numeric' THEN 'numeric(' + CAST(numeric_precision AS VARCHAR) + ',' + CAST(numeric_scale AS VARCHAR) + ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON 
columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name; + SELECT TOP {LIMIT} column_name, columns.table_name, CASE WHEN data_type = 'datetime' THEN 'datetime' WHEN data_type = 'datetime2' THEN 'datetime' WHEN data_type = 'varchar' THEN 'varchar(' + CAST(character_maximum_length AS VARCHAR) + ')' WHEN data_type = 'char' THEN 'char(' + CAST(character_maximum_length AS VARCHAR) + ')' WHEN data_type = 'numeric' THEN 'numeric(' + CAST(numeric_precision AS VARCHAR) + ',' + CAST(numeric_scale AS VARCHAR) + ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name; error_type: Profile Anomaly - id: '1062' test_id: '1005' @@ -64,7 +65,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type = 'timestamp without time zone' THEN 'timestamp' WHEN data_type = 'character varying' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'character' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'numeric' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE TABLE' ORDER BY data_type, table_name; + SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type = 'timestamp without time zone' THEN 'timestamp' WHEN data_type = 'character varying' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'character' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'numeric' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE TABLE' ORDER BY data_type, table_name LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1037' test_id: '1005' @@ -72,7 +73,7 @@ profile_anomaly_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT column_name, table_name, CASE WHEN data_type = 'timestamp without time zone' THEN 'timestamp' WHEN data_type = 'character varying' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'character' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'numeric' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns WHERE table_schema = '{TARGET_SCHEMA}' AND column_name = '{COLUMN_NAME}' ORDER BY data_type, 
table_name;
+ SELECT DISTINCT column_name, table_name, CASE WHEN data_type = 'timestamp without time zone' THEN 'timestamp' WHEN data_type = 'character varying' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'character' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'numeric' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns WHERE table_schema = '{TARGET_SCHEMA}' AND column_name = '{COLUMN_NAME}' ORDER BY data_type, table_name LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1437'
 test_id: '1005'
@@ -80,7 +81,7 @@ profile_anomaly_types:
 sql_flavor: redshift_spectrum
 lookup_type: null
 lookup_query: |-
- SELECT DISTINCT columnname AS column_name, tablename AS table_name, external_type AS data_type FROM svv_external_columns WHERE schemaname = '{TARGET_SCHEMA}' AND columnname = '{COLUMN_NAME}' ORDER BY external_type, tablename;
+ SELECT DISTINCT columnname AS column_name, tablename AS table_name, external_type AS data_type FROM svv_external_columns WHERE schemaname = '{TARGET_SCHEMA}' AND columnname = '{COLUMN_NAME}' ORDER BY external_type, tablename LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1176'
 test_id: '1005'
@@ -88,5 +89,5 @@ profile_anomaly_types:
 sql_flavor: snowflake
 lookup_type: null
 lookup_query: |-
- SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE 'timestamp%' THEN lower(data_type) WHEN data_type ILIKE 'date' THEN lower(data_type) WHEN data_type ILIKE 'boolean' THEN 'boolean' WHEN data_type = 'TEXT' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type ILIKE 'char%' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'NUMBER' AND numeric_precision = 38 AND numeric_scale = 0 THEN 'bigint' WHEN data_type ILIKE 'num%' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name;
+ SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE 'timestamp%' THEN lower(data_type) WHEN data_type ILIKE 'date' THEN lower(data_type) WHEN data_type ILIKE 'boolean' THEN 'boolean' WHEN data_type = 'TEXT' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type ILIKE 'char%' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'NUMBER' AND numeric_precision = 38 AND numeric_scale = 0 THEN 'bigint' WHEN data_type ILIKE 'num%' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name LIMIT {LIMIT};
 error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Minor.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Minor.yaml
index 554a78b..1ddee50 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Minor.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Minor.yaml
@@ -40,7 +40,8 @@ profile_anomaly_types:
 WHERE columns.table_schema = '{TARGET_SCHEMA}'
 AND columns.column_name = '{COLUMN_NAME}'
 AND tables.table_type = 'BASE TABLE'
- ORDER BY data_type, table_name;
+ ORDER BY data_type, table_name
+ LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1276'
 test_id: '1004'
@@ -48,7 +49,7 @@ profile_anomaly_types:
 sql_flavor: databricks
 lookup_type: null
 lookup_query: |-
- SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE 'timestamp%' THEN lower(data_type) WHEN data_type ILIKE 'date' THEN lower(data_type) WHEN data_type ILIKE 'boolean' THEN 'boolean' WHEN data_type = 'TEXT' THEN 'varchar(' || CAST(character_maximum_length AS STRING) || ')' WHEN data_type ILIKE 'char%' THEN 'char(' || CAST(character_maximum_length AS STRING) || ')' WHEN data_type = 'NUMBER' AND numeric_precision = 38 AND numeric_scale = 0 THEN 'bigint' WHEN data_type ILIKE 'num%' THEN 'numeric(' || CAST(numeric_precision AS STRING) || ',' || CAST(numeric_scale AS STRING) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name;
+ SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE 'timestamp%' THEN lower(data_type) WHEN data_type ILIKE 'date' THEN lower(data_type) WHEN data_type ILIKE 'boolean' THEN 'boolean' WHEN data_type = 'TEXT' THEN 'varchar(' || CAST(character_maximum_length AS STRING) || ')' WHEN data_type ILIKE 'char%' THEN 'char(' || CAST(character_maximum_length AS STRING) || ')' WHEN data_type = 'NUMBER' AND numeric_precision = 38 AND numeric_scale = 0 THEN 'bigint' WHEN data_type ILIKE 'num%' THEN 'numeric(' || CAST(numeric_precision AS STRING) || ',' || CAST(numeric_scale AS STRING) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1118'
 test_id: '1004'
@@ -56,7 +57,7 @@ profile_anomaly_types:
 sql_flavor: mssql
 lookup_type: null
 lookup_query: |-
- SELECT TOP 500 column_name, columns.table_name, CASE WHEN data_type = 'datetime' THEN 'datetime' WHEN data_type = 'datetime2' THEN 'datetime' WHEN data_type = 'varchar' THEN 'varchar(' + CAST(character_maximum_length AS VARCHAR) + ')' WHEN data_type = 'char' THEN 'char(' + CAST(character_maximum_length AS VARCHAR) + ')' WHEN data_type = 'numeric' THEN 'numeric(' + CAST(numeric_precision AS VARCHAR) + ',' + CAST(numeric_scale AS VARCHAR) + ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name;
+ SELECT TOP {LIMIT} column_name, columns.table_name, CASE WHEN data_type = 'datetime' THEN 'datetime' WHEN data_type = 'datetime2' THEN 'datetime' WHEN data_type = 'varchar' THEN 'varchar(' + CAST(character_maximum_length AS VARCHAR) + ')' WHEN data_type = 'char' THEN 'char(' + CAST(character_maximum_length AS VARCHAR) + ')' WHEN data_type = 'numeric' THEN 'numeric(' + CAST(numeric_precision AS VARCHAR) + ',' + CAST(numeric_scale AS VARCHAR) + ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name;
 error_type: Profile Anomaly
 - id: '1061'
 test_id: '1004'
@@ -64,7 +65,7 @@ profile_anomaly_types:
 sql_flavor: postgresql
 lookup_type: null
 lookup_query: |-
- SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type = 'timestamp without time zone' THEN 'timestamp' WHEN data_type = 'character varying' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'character' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'numeric' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE TABLE' ORDER BY data_type, table_name;
+ SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type = 'timestamp without time zone' THEN 'timestamp' WHEN data_type = 'character varying' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'character' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'numeric' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE TABLE' ORDER BY data_type, table_name LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1036'
 test_id: '1004'
@@ -72,7 +73,7 @@ profile_anomaly_types:
 sql_flavor: redshift
 lookup_type: null
 lookup_query: |-
- SELECT DISTINCT column_name, table_name, CASE WHEN data_type = 'timestamp without time zone' THEN 'timestamp' WHEN data_type = 'character varying' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'character' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'numeric' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns WHERE table_schema = '{TARGET_SCHEMA}' AND column_name = '{COLUMN_NAME}' ORDER BY data_type, table_name;
+ SELECT DISTINCT column_name, table_name, CASE WHEN data_type = 'timestamp without time zone' THEN 'timestamp' WHEN data_type = 'character varying' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'character' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'numeric' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns WHERE table_schema = '{TARGET_SCHEMA}' AND column_name = '{COLUMN_NAME}' ORDER BY data_type, table_name LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1436'
 test_id: '1004'
@@ -80,7 +81,7 @@ profile_anomaly_types:
 sql_flavor: redshift_spectrum
 lookup_type: null
 lookup_query: |-
- SELECT DISTINCT columnname AS column_name, tablename AS table_name, external_type AS data_type FROM svv_external_columns WHERE schemaname = '{TARGET_SCHEMA}' AND columnname = '{COLUMN_NAME}' ORDER BY external_type, tablename;
+ SELECT DISTINCT columnname AS column_name, tablename AS table_name, external_type AS data_type FROM svv_external_columns WHERE schemaname = '{TARGET_SCHEMA}' AND columnname = '{COLUMN_NAME}' ORDER BY external_type, tablename LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1175'
 test_id: '1004'
@@ -88,5 +89,5 @@ profile_anomaly_types:
 sql_flavor: snowflake
 lookup_type: null
 lookup_query: |-
- SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE 'timestamp%' THEN lower(data_type) WHEN data_type ILIKE 'date' THEN lower(data_type) WHEN data_type ILIKE 'boolean' THEN 'boolean' WHEN data_type = 'TEXT' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type ILIKE 'char%' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'NUMBER' AND numeric_precision = 38 AND numeric_scale = 0 THEN 'bigint' WHEN data_type ILIKE 'num%' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name;
+ SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE 'timestamp%' THEN lower(data_type) WHEN data_type ILIKE 'date' THEN lower(data_type) WHEN data_type ILIKE 'boolean' THEN 'boolean' WHEN data_type = 'TEXT' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type ILIKE 'char%' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'NUMBER' AND numeric_precision = 38 AND numeric_scale = 0 THEN 'bigint' WHEN data_type ILIKE 'num%' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name LIMIT {LIMIT};
 error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_No_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_No_Values.yaml
index 29978d5..87d80e6 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_No_Values.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_No_Values.yaml
@@ -28,7 +28,8 @@ profile_anomaly_types:
 SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count
 FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`
 GROUP BY `{COLUMN_NAME}`
- ORDER BY `{COLUMN_NAME}`;
+ ORDER BY `{COLUMN_NAME}`
+ LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1278'
 test_id: '1006'
@@ -36,7 +37,7 @@ profile_anomaly_types:
 sql_flavor: databricks
 lookup_type: null
 lookup_query: |-
- SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}`;
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1120'
 test_id: '1006'
@@ -44,7 +45,7 @@ profile_anomaly_types:
 sql_flavor: mssql
 lookup_type: null
 lookup_query: |-
- SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
 error_type: Profile Anomaly
 - id: '1063'
 test_id: '1006'
@@ -52,7 +53,7 @@ profile_anomaly_types:
 sql_flavor: postgresql
 lookup_type: null
 lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1038'
 test_id: '1006'
@@ -60,7 +61,7 @@ profile_anomaly_types:
 sql_flavor: redshift
 lookup_type: null
 lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1438'
 test_id: '1006'
@@ -68,7 +69,7 @@ profile_anomaly_types:
 sql_flavor: redshift_spectrum
 lookup_type: null
 lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1177'
 test_id: '1006'
@@ -76,5 +77,5 @@ profile_anomaly_types:
 sql_flavor: snowflake
 lookup_type: null
 lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT};
 error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Name_Address.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Name_Address.yaml
index 81d2d0c..3cfd99e 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Name_Address.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Name_Address.yaml
@@ -29,7 +29,7 @@ profile_anomaly_types:
 AND CAST(`{COLUMN_NAME}` AS STRING) = LOWER(CAST(`{COLUMN_NAME}` AS STRING))
 AND CAST(`{COLUMN_NAME}` AS STRING) > ''
 GROUP BY `{COLUMN_NAME}`
- LIMIT 500;
+ LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1267'
 test_id: '1029'
@@ -39,7 +39,7 @@ profile_anomaly_types:
 lookup_query: |-
 SELECT `{COLUMN_NAME}`, COUNT(*) as record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`
 WHERE `{COLUMN_NAME}` = UPPER(`{COLUMN_NAME}`) AND `{COLUMN_NAME}` = LOWER(`{COLUMN_NAME}`) AND `{COLUMN_NAME}` > ''
- GROUP BY `{COLUMN_NAME}` LIMIT 500
+ GROUP BY `{COLUMN_NAME}` LIMIT {LIMIT}
 error_type: Profile Anomaly
 - id: '1265'
 test_id: '1029'
@@ -47,7 +47,7 @@ profile_anomaly_types:
 sql_flavor: mssql
 lookup_type: null
 lookup_query: |-
- SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
+ SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
 WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" > ''
 GROUP BY "{COLUMN_NAME}"
 error_type: Profile Anomaly
@@ -59,7 +59,7 @@ profile_anomaly_types:
 lookup_query: |-
 SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
 WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" > ''
- GROUP BY "{COLUMN_NAME}" LIMIT 500
+ GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}
 error_type: Profile Anomaly
 - id: '1263'
 test_id: '1029'
@@ -69,7 +69,7 @@ profile_anomaly_types:
 lookup_query: |-
 SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
 WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" > ''
- GROUP BY "{COLUMN_NAME}" LIMIT 500
+ GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}
 error_type: Profile Anomaly
 - id: '1474'
 test_id: '1029'
@@ -79,7 +79,7 @@ profile_anomaly_types:
 lookup_query: |-
 SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
 WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" > ''
- GROUP BY "{COLUMN_NAME}" LIMIT 500
+ GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}
 error_type: Profile Anomaly
 - id: '1266'
 test_id: '1029'
@@ -89,5 +89,5 @@ profile_anomaly_types:
 lookup_query: |-
 SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
 WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" > ''
- GROUP BY "{COLUMN_NAME}" LIMIT 500
+ GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}
 error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Prefixed_Name.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Prefixed_Name.yaml
index 0281a7f..dbaa263 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Prefixed_Name.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Prefixed_Name.yaml
@@ -30,7 +30,7 @@ profile_anomaly_types:
 AND SUBSTR(CAST(`{COLUMN_NAME}` AS STRING), LENGTH(CAST(`{COLUMN_NAME}` AS STRING)), 1) <> '\''
 GROUP BY `{COLUMN_NAME}`
 ORDER BY `{COLUMN_NAME}`
- LIMIT 500;
+ LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1272'
 test_id: '1030'
@@ -40,7 +40,7 @@ profile_anomaly_types:
 lookup_query: |-
 SELECT `{COLUMN_NAME}`, COUNT(*) as record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`
 WHERE `{COLUMN_NAME}` < 'A' AND LEFT(`{COLUMN_NAME}`, 1) NOT IN ('"', ' ') AND RIGHT(`{COLUMN_NAME}`, 1) <> ''''
- GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT 500
+ GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT {LIMIT}
 error_type: Profile Anomaly
 - id: '1270'
 test_id: '1030'
@@ -48,7 +48,7 @@ profile_anomaly_types:
 sql_flavor: mssql
 lookup_type: null
 lookup_query: |-
- SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
+ SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
 WHERE "{COLUMN_NAME}" < 'A' AND LEFT("{COLUMN_NAME}", 1) NOT IN ('"', ' ') AND RIGHT("{COLUMN_NAME}", 1) <> ''''
 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"
 error_type: Profile Anomaly
@@ -60,7 +60,7 @@ profile_anomaly_types:
 lookup_query: |-
 SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
 WHERE "{COLUMN_NAME}" < 'A' AND LEFT("{COLUMN_NAME}", 1) NOT IN ('"', ' ') AND RIGHT("{COLUMN_NAME}", 1) <> ''''
- GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500
+ GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}
 error_type: Profile Anomaly
 - id: '1268'
 test_id: '1030'
@@ -70,7 +70,7 @@ profile_anomaly_types:
 lookup_query: |-
 SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
 WHERE "{COLUMN_NAME}" < 'A' AND LEFT("{COLUMN_NAME}", 1) NOT IN ('"', ' ') AND RIGHT("{COLUMN_NAME}", 1) <> ''''
- GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500
+ GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}
 error_type: Profile Anomaly
 - id: '1475'
 test_id: '1030'
@@ -80,7 +80,7 @@ profile_anomaly_types:
 lookup_query: |-
 SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
 WHERE "{COLUMN_NAME}" < 'A' AND LEFT("{COLUMN_NAME}", 1) NOT IN ('"', ' ') AND RIGHT("{COLUMN_NAME}", 1) <> ''''
- GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500
+ GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}
 error_type: Profile Anomaly
 - id: '1271'
 test_id: '1030'
@@ -90,5 +90,5 @@ profile_anomaly_types:
 lookup_query: |-
 SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
 WHERE "{COLUMN_NAME}" < 'A' AND LEFT("{COLUMN_NAME}", 1) NOT IN ('"', ' ') AND RIGHT("{COLUMN_NAME}", 1) <> ''''
- GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500
+ GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}
 error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Printing_Chars.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Printing_Chars.yaml
index 6761e2b..a6118be 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Printing_Chars.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Printing_Chars.yaml
@@ -36,7 +36,7 @@ profile_anomaly_types:
 '\ufeff', '\x65279') as `{COLUMN_NAME}_content`, COUNT(*) as record_ct
 FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`
 WHERE TRANSLATE(`{COLUMN_NAME}`, '\u00a0\u2009\u200b\u200c\u200d\u200e\u200f\u202f\u3000\ufeff', 'XXXXXXXXXX') <> `{COLUMN_NAME}`
- GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT 500
+ GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT {LIMIT}
 error_type: Profile Anomaly
 - id: '1275'
 test_id: '1031'
@@ -44,7 +44,7 @@ profile_anomaly_types:
 sql_flavor: mssql
 lookup_type: null
 lookup_query: |-
- SELECT TOP 500 REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}",
+ SELECT TOP {LIMIT} REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}",
 NCHAR(160), '\x160'),
 NCHAR(8201), '\x8201'),
 NCHAR(8203), '\x8203'),
@@ -79,7 +79,7 @@ profile_anomaly_types:
 CHR(65279), '\x65279') as "{COLUMN_NAME}_content", COUNT(*) as record_ct
 FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
 WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), 'XXXXXXXXXX') <> "{COLUMN_NAME}"
- GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500
+ GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}
 error_type: Profile Anomaly
 - id: '1273'
 test_id: '1031'
@@ -100,7 +100,7 @@ profile_anomaly_types:
 CHR(65279), '\x65279') as "{COLUMN_NAME}_content", COUNT(*) as record_ct
 FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
 WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), 'XXXXXXXXXX') <> "{COLUMN_NAME}"
- GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500
+ GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}
 error_type: Profile Anomaly
 - id: '1476'
 test_id: '1031'
@@ -121,7 +121,7 @@ profile_anomaly_types:
 CHR(65279), '\x65279') as "{COLUMN_NAME}_content", COUNT(*) as record_ct
 FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
 WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), 'XXXXXXXXXX') <> "{COLUMN_NAME}"
- GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500
+ GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}
 error_type: Profile Anomaly
 - id: '1276'
 test_id: '1031'
@@ -142,5 +142,5 @@ profile_anomaly_types:
 CHR(65279), '\x65279') as "{COLUMN_NAME}_content", COUNT(*) as record_ct
 FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
 WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), 'XXXXXXXXXX') <> "{COLUMN_NAME}"
- GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500
+ GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}
 error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Standard_Blanks.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Standard_Blanks.yaml
index 6a115e8..839c9fc 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Standard_Blanks.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Standard_Blanks.yaml
@@ -39,7 +39,8 @@ profile_anomaly_types:
 OR `{COLUMN_NAME}` IS NULL
 )
 GROUP BY `{COLUMN_NAME}`
- ORDER BY `{COLUMN_NAME}`;
+ ORDER BY `{COLUMN_NAME}`
+ LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1274'
 test_id: '1002'
@@ -47,7 +48,7 @@ profile_anomaly_types:
 sql_flavor: databricks
 lookup_type: null
 lookup_query: |-
- SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE CASE WHEN `{COLUMN_NAME}` IN ('.', '?', ' ') THEN 1 WHEN LOWER(`{COLUMN_NAME}`::STRING) REGEXP '-{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP '0{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP '9{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP 'x{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP 'z{2,}' THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN `{COLUMN_NAME}` = '' THEN 1 WHEN `{COLUMN_NAME}` IS NULL THEN 1 ELSE 0 END = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}`;
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE CASE WHEN `{COLUMN_NAME}` IN ('.', '?', ' ') THEN 1 WHEN LOWER(`{COLUMN_NAME}`::STRING) REGEXP '-{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP '0{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP '9{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP 'x{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP 'z{2,}' THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN `{COLUMN_NAME}` = '' THEN 1 WHEN `{COLUMN_NAME}` IS NULL THEN 1 ELSE 0 END = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1116'
 test_id: '1002'
@@ -55,7 +56,7 @@ profile_anomaly_types:
 sql_flavor: mssql
 lookup_type: null
 lookup_query: |-
- SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?') OR "{COLUMN_NAME}" LIKE ' ' THEN 1 WHEN LEN("{COLUMN_NAME}") > 1 AND ( LOWER("{COLUMN_NAME}") LIKE '%..%' OR LOWER("{COLUMN_NAME}") LIKE '%--%' OR (LEN(REPLACE("{COLUMN_NAME}", '0', ''))= 0 ) OR (LEN(REPLACE("{COLUMN_NAME}", '9', ''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), 'x', ''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), 'z', ''))= 0 ) ) THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?') OR "{COLUMN_NAME}" LIKE ' ' THEN 1 WHEN LEN("{COLUMN_NAME}") > 1 AND ( LOWER("{COLUMN_NAME}") LIKE '%..%' OR LOWER("{COLUMN_NAME}") LIKE '%--%' OR (LEN(REPLACE("{COLUMN_NAME}", '0', ''))= 0 ) OR (LEN(REPLACE("{COLUMN_NAME}", '9', ''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), 'x', ''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), 'z', ''))= 0 ) ) THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
 error_type: Profile Anomaly
 - id: '1059'
 test_id: '1002'
@@ -63,7 +64,7 @@ profile_anomaly_types:
 sql_flavor: postgresql
 lookup_type: null
 lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1034'
 test_id: '1002'
@@ -71,7 +72,7 @@ profile_anomaly_types:
 sql_flavor: redshift
 lookup_type: null
 lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1434'
 test_id: '1002'
@@ -79,7 +80,7 @@ profile_anomaly_types:
 sql_flavor: redshift_spectrum
 lookup_type: null
 lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1173'
 test_id: '1002'
@@ -87,5 +88,5 @@ profile_anomaly_types:
 sql_flavor: snowflake
 lookup_type: null
 lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '-{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '0{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '9{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP 'x{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP 'z{2,}' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '-{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '0{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '9{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP 'x{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP 'z{2,}' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT};
 error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_Duplicates.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_Duplicates.yaml
index 20e6fc3..005957b 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_Duplicates.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_Duplicates.yaml
@@ -29,7 +29,7 @@ profile_anomaly_types:
 GROUP BY `{COLUMN_NAME}`
 HAVING COUNT(*) > 1
 ORDER BY COUNT(*) DESC
- LIMIT 500;
+ LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1288'
 test_id: '1016'
@@ -37,7 +37,7 @@ profile_anomaly_types:
 sql_flavor: databricks
 lookup_type: null
 lookup_query: |-
- SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` HAVING count > 1 ORDER BY count DESC LIMIT 500;
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` HAVING count > 1 ORDER BY count DESC LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1130'
 test_id: '1016'
@@ -45,7 +45,7 @@ profile_anomaly_types:
 sql_flavor: mssql
 lookup_type: null
 lookup_query: |-
- SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC;
+ SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC;
 error_type: Profile Anomaly
 - id: '1073'
 test_id: '1016'
@@ -53,7 +53,7 @@ profile_anomaly_types:
 sql_flavor: postgresql
 lookup_type: null
 lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC LIMIT 500;
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1048'
 test_id: '1016'
@@ -61,7 +61,7 @@ profile_anomaly_types:
 sql_flavor: redshift
 lookup_type: null
 lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC LIMIT 500;
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1448'
 test_id: '1016'
@@ -69,7 +69,7 @@ profile_anomaly_types:
 sql_flavor: redshift_spectrum
 lookup_type: null
 lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC LIMIT 500;
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1187'
 test_id: '1016'
@@ -77,5 +77,5 @@ profile_anomaly_types:
 sql_flavor: snowflake
 lookup_type: null
 lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC LIMIT 500;
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC LIMIT {LIMIT};
 error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_PII.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_PII.yaml
index 652fc46..7efb6ed 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_PII.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_PII.yaml
@@ -27,7 +27,7 @@ profile_anomaly_types:
 FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`
 GROUP BY `{COLUMN_NAME}`
 ORDER BY `{COLUMN_NAME}` DESC
- LIMIT 500;
+ LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1338'
 test_id: '1100'
@@ -35,7 +35,7 @@ profile_anomaly_types:
 sql_flavor: databricks
 lookup_type: null
 lookup_query: |-
- SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500;
+ SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1271'
 test_id: '1100'
@@ -43,7 +43,7 @@ profile_anomaly_types:
 sql_flavor: mssql
 lookup_type: null
 lookup_query: |-
- SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
+ SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;
 error_type: Profile Anomaly
 - id: '1272'
 test_id: '1100'
@@ -51,7 +51,7 @@ profile_anomaly_types:
 sql_flavor: postgresql
 lookup_type: null
 lookup_query: |-
- SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
+ SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1269'
 test_id: '1100'
@@ -59,7 +59,7 @@ profile_anomaly_types:
 sql_flavor: redshift
 lookup_type: null
 lookup_query: |-
- SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
+ SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1470'
 test_id: '1100'
@@ -67,7 +67,7 @@ profile_anomaly_types:
 sql_flavor: redshift_spectrum
 lookup_type: null
 lookup_query: |-
- SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
+ SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1270'
 test_id: '1100'
@@ -75,5 +75,5 @@ profile_anomaly_types:
 sql_flavor: snowflake
 lookup_type: null
 lookup_query: |-
- SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;
+ SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT};
 error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Quoted_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Quoted_Values.yaml
index c4a3499..74a91f0 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Quoted_Values.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Quoted_Values.yaml
@@ -28,7 +28,7 @@ profile_anomaly_types:
 WHERE LEFT(CAST(`{COLUMN_NAME}` AS STRING), 1) = '"' OR LEFT(CAST(`{COLUMN_NAME}` AS STRING), 1) = "'"
 GROUP BY `{COLUMN_NAME}`
 ORDER BY `{COLUMN_NAME}`
- LIMIT 500;
+ LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1282'
 test_id: '1010'
@@ -36,7 +36,7 @@ profile_anomaly_types:
 sql_flavor: databricks
 lookup_type: null
 lookup_query: |-
- SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE (CASE WHEN `{COLUMN_NAME}` ILIKE '"%"' OR `{COLUMN_NAME}` ILIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT 500;
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE (CASE WHEN `{COLUMN_NAME}` ILIKE '"%"' OR `{COLUMN_NAME}` ILIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1124'
 test_id: '1010'
@@ -44,7 +44,7 @@ profile_anomaly_types:
 sql_flavor: mssql
 lookup_type: null
 lookup_query: |-
- SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" LIKE '"%"' OR "{COLUMN_NAME}" LIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" LIKE '"%"' OR "{COLUMN_NAME}" LIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
 error_type: Profile Anomaly
 - id: '1067'
 test_id: '1010'
@@ -52,7 +52,7 @@ profile_anomaly_types:
 sql_flavor: postgresql
 lookup_type: null
 lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE '"%"' OR "{COLUMN_NAME}" ILIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE '"%"' OR "{COLUMN_NAME}" ILIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1042'
 test_id: '1010'
@@ -60,7 +60,7 @@ profile_anomaly_types:
 sql_flavor: redshift
 lookup_type: null
 lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE '"%"' OR "{COLUMN_NAME}" ILIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE '"%"' OR "{COLUMN_NAME}" ILIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1442'
 test_id: '1010'
@@ -68,7 +68,7 @@ profile_anomaly_types:
 sql_flavor: redshift_spectrum
 lookup_type: null
 lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE '"%"' OR "{COLUMN_NAME}" ILIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE '"%"' OR "{COLUMN_NAME}" ILIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1181'
 test_id: '1010'
@@ -76,5 +76,5 @@ profile_anomaly_types:
 sql_flavor: snowflake
 lookup_type: null
 lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE '"%"' OR "{COLUMN_NAME}" ILIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE '"%"' OR "{COLUMN_NAME}" ILIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT};
 error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Divergent_Value_Ct.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Divergent_Value_Ct.yaml
index afb7893..bd121c7 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Divergent_Value_Ct.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Divergent_Value_Ct.yaml
@@ -28,7 +28,7 @@ profile_anomaly_types:
 sql_flavor: databricks
 lookup_type: null
 lookup_query: |-
- SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY count DESC;
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1128'
 test_id: '1014'
@@ -36,7 +36,7 @@ profile_anomaly_types:
 sql_flavor: mssql
 lookup_type: null
 lookup_query: |-
- SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;
+ SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;
 error_type: Profile Anomaly
 - id: '1071'
 test_id: '1014'
@@ -44,7 +44,7 @@ profile_anomaly_types:
 sql_flavor: postgresql
 lookup_type: null
 lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1046'
 test_id: '1014'
@@ -52,7 +52,7 @@ profile_anomaly_types:
 sql_flavor: redshift
 lookup_type: null
 lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1446'
 test_id: '1014'
@@ -60,7 +60,7 @@ profile_anomaly_types:
 sql_flavor: redshift_spectrum
 lookup_type: null
 lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1185'
 test_id: '1014'
@@ -68,5 +68,5 @@ profile_anomaly_types:
 sql_flavor: snowflake
 lookup_type: null
 lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT};
 error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Missing_Value_Ct.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Missing_Value_Ct.yaml
index 964d7eb..381c26c 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Missing_Value_Ct.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Missing_Value_Ct.yaml
@@ -31,7 +31,7 @@ profile_anomaly_types:
 sql_flavor: databricks
 lookup_type: null
 lookup_query: |-
- SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE (CASE WHEN `{COLUMN_NAME}` IN ('.', '?', ' ') THEN 1 WHEN LOWER(`{COLUMN_NAME}`::STRING) REGEXP '-{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP '0{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP '9{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP 'x{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP 'z{2,}' THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN `{COLUMN_NAME}` = '' THEN 1 WHEN `{COLUMN_NAME}` IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}`;
+ SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE (CASE WHEN `{COLUMN_NAME}` IN ('.', '?', ' ') THEN 1 WHEN LOWER(`{COLUMN_NAME}`::STRING) REGEXP '-{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP '0{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP '9{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP 'x{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP 'z{2,}' THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN `{COLUMN_NAME}` = '' THEN 1 WHEN `{COLUMN_NAME}` IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1127'
 test_id: '1013'
@@ -39,7 +39,7 @@ profile_anomaly_types:
 sql_flavor: mssql
 lookup_type: null
 lookup_query: |-
- SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LEN("{COLUMN_NAME}") > 1 AND ( LOWER("{COLUMN_NAME}") LIKE '%..%' OR LOWER("{COLUMN_NAME}") LIKE '%--%' OR (LEN(REPLACE("{COLUMN_NAME}", '0', ''))= 0 ) OR (LEN(REPLACE("{COLUMN_NAME}", '9', ''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), 'x', ''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), 'z', ''))= 0 ) ) THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LEN("{COLUMN_NAME}") > 1 AND ( LOWER("{COLUMN_NAME}") LIKE '%..%' OR LOWER("{COLUMN_NAME}") LIKE '%--%' OR (LEN(REPLACE("{COLUMN_NAME}", '0', ''))= 0 ) OR (LEN(REPLACE("{COLUMN_NAME}", '9', ''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), 'x', ''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), 'z', ''))= 0 ) ) THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
 error_type: Profile Anomaly
 - id: '1070'
 test_id: '1013'
@@ -47,7 +47,7 @@ profile_anomaly_types:
 sql_flavor: postgresql
 lookup_type: null
 lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1045'
 test_id: '1013'
@@ -55,7 +55,7 @@ profile_anomaly_types:
 sql_flavor: redshift
 lookup_type: null
 lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1445'
 test_id: '1013'
@@ -63,7 +63,7 @@ profile_anomaly_types:
 sql_flavor: redshift_spectrum
 lookup_type: null
 lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1184'
 test_id: '1013'
@@ -71,5 +71,5 @@ profile_anomaly_types:
 sql_flavor: snowflake
 lookup_type: null
 lookup_query: |-
- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '-{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '0{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '9{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP 'x{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP 'z{2,}' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";
+ SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '-{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '0{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '9{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP 'x{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP 'z{2,}' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT};
 error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Numeric_Value_Ct.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Numeric_Value_Ct.yaml
index 9ef1f37..3b7f394 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Numeric_Value_Ct.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Numeric_Value_Ct.yaml
@@ -31,7 +31,7 @@ profile_anomaly_types:
 WHERE SAFE_CAST(CAST(`{COLUMN_NAME}` AS STRING) AS FLOAT64) IS NOT NULL
 GROUP BY `{COLUMN_NAME}`
 ORDER BY count DESC
- LIMIT 10
+ LIMIT {LIMIT_2}
 )
 UNION ALL
 (
@@ -40,7 +40,7 @@ profile_anomaly_types:
 WHERE SAFE_CAST(CAST(`{COLUMN_NAME}` AS STRING) AS FLOAT64) IS NULL
 GROUP BY `{COLUMN_NAME}`
 ORDER BY count DESC
- LIMIT 10
+ LIMIT {LIMIT_2}
 )
 ORDER BY data_type, count DESC;
 error_type: Profile Anomaly
@@ -50,7 +50,7 @@ profile_anomaly_types:
 sql_flavor: databricks
 lookup_type: null
 lookup_query: |-
- SELECT A.* FROM (SELECT DISTINCT 'Numeric' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE <%IS_NUM;`{COLUMN_NAME}`%> = 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 10) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Numeric' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE <%IS_NUM;`{COLUMN_NAME}`%> != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC) AS B ORDER BY data_type, count DESC LIMIT 10;
+ SELECT A.* FROM (SELECT DISTINCT 'Numeric' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE <%IS_NUM;`{COLUMN_NAME}`%> = 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT {LIMIT_2}) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Numeric' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE <%IS_NUM;`{COLUMN_NAME}`%> != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC) AS B ORDER BY data_type, count DESC LIMIT {LIMIT_2};
 error_type: Profile Anomaly
 - id: '1137'
 test_id: '1023'
@@ -58,7 +58,7 @@ profile_anomaly_types:
 sql_flavor: mssql
 lookup_type: null
 lookup_query: |-
- SELECT A.* FROM ( SELECT DISTINCT TOP 10 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP 10 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;
+ SELECT A.* FROM ( SELECT DISTINCT TOP {LIMIT_2} 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP {LIMIT_2} 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;
 error_type: Profile Anomaly
 - id: '1080'
 test_id: '1023'
@@ -66,7 +66,7 @@ profile_anomaly_types:
 sql_flavor: postgresql
 lookup_type: null
 lookup_query: |-
- SELECT A.* FROM ( SELECT DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS B ORDER BY data_type, count DESC;
+ SELECT A.* FROM ( SELECT DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS B ORDER BY data_type, count DESC;
 error_type: Profile Anomaly
 - id: '1055'
 test_id: '1023'
@@ -74,7 +74,7 @@ profile_anomaly_types:
 sql_flavor: redshift
 lookup_type: null
 lookup_query: |-
- SELECT A.* FROM ( SELECT TOP 10 DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;
+ SELECT A.* FROM ( SELECT DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS B ORDER BY data_type, count DESC;
 error_type: Profile Anomaly
 - id: '1455'
 test_id: '1023'
@@ -82,7 +82,7 @@ profile_anomaly_types:
 sql_flavor: redshift_spectrum
 lookup_type: null
 lookup_query: |-
- SELECT A.* FROM ( SELECT TOP 10 DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;
+ SELECT A.* FROM ( SELECT DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS B ORDER BY data_type, count DESC;
 error_type: Profile Anomaly
 - id: '1194'
 test_id: '1023'
@@ -90,5 +90,5 @@ profile_anomaly_types:
 sql_flavor: snowflake
 lookup_type: null
 lookup_query: |-
- SELECT A.* FROM (SELECT DISTINCT TOP 10 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 10 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS B ORDER BY data_type, count DESC;
+ SELECT A.* FROM (SELECT DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS B ORDER BY data_type, count DESC;
 error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Standardized_Value_Matches.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Standardized_Value_Matches.yaml
index 4eb691a..4f7b457 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Standardized_Value_Matches.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Standardized_Value_Matches.yaml
@@ -37,7 +37,7 @@ profile_anomaly_types:
 ON UPPER(REGEXP_REPLACE(CAST(a.`{COLUMN_NAME}` AS STRING), r"[ '\.\-\,]", '')) = b.possible_standard_value
 GROUP BY a.`{COLUMN_NAME}`, b.possible_standard_value
 ORDER BY b.possible_standard_value ASC, count DESC
- LIMIT 500;
+ LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1289'
 test_id: '1017'
@@ -45,7 +45,7 @@ profile_anomaly_types:
 sql_flavor: databricks
 lookup_type: null
 lookup_query: |-
- WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE(`{COLUMN_NAME}`, ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT `{COLUMN_NAME}`) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY UPPER(TRANSLATE(`{COLUMN_NAME}`, ' '',.-', '')) HAVING COUNT(DISTINCT `{COLUMN_NAME}`) > 1 ) SELECT DISTINCT a.`{COLUMN_NAME}`, possible_standard_value, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` a, cte b WHERE UPPER(TRANSLATE(a.`{COLUMN_NAME}`, ' '',.-', '')) = b.possible_standard_value GROUP BY a.`{COLUMN_NAME}`, possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT 500;
+ WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE(`{COLUMN_NAME}`, ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT `{COLUMN_NAME}`) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY UPPER(TRANSLATE(`{COLUMN_NAME}`, ' '',.-', '')) HAVING COUNT(DISTINCT `{COLUMN_NAME}`) > 1 ) SELECT DISTINCT a.`{COLUMN_NAME}`, possible_standard_value, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` a, cte b WHERE UPPER(TRANSLATE(a.`{COLUMN_NAME}`, ' '',.-', '')) = b.possible_standard_value GROUP BY a.`{COLUMN_NAME}`, possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT {LIMIT};
 error_type: Profile Anomaly
 - id: '1131'
 test_id: '1017'
@@ -53,7 +53,7 @@ profile_anomaly_types:
 sql_flavor:
mssql lookup_type: null lookup_query: |- - WITH CTE AS ( SELECT DISTINCT TOP 500 UPPER(REPLACE(TRANSLATE("{COLUMN_NAME}",' '''',.-',REPLICATE(' ', LEN(' '''',.-'))),' ','')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") as distinct_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(REPLACE(TRANSLATE("{COLUMN_NAME}",' '''',.-',REPLICATE(' ', LEN(' '''',.-'))),' ','')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(REPLACE(TRANSLATE("{COLUMN_NAME}",' '''',.-',REPLICATE(' ', LEN(' '''',.-'))),' ','')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC; + WITH CTE AS ( SELECT DISTINCT TOP {LIMIT} UPPER(REPLACE(TRANSLATE("{COLUMN_NAME}",' '''',.-',REPLICATE(' ', LEN(' '''',.-'))),' ','')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") as distinct_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(REPLACE(TRANSLATE("{COLUMN_NAME}",' '''',.-',REPLICATE(' ', LEN(' '''',.-'))),' ','')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(REPLACE(TRANSLATE("{COLUMN_NAME}",' '''',.-',REPLICATE(' ', LEN(' '''',.-'))),' ','')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC; error_type: Profile Anomaly - id: '1074' test_id: '1017' @@ -61,7 +61,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", ' '',.-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT 500; + WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", ' '',.-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1049' test_id: '1017' @@ -69,7 +69,7 @@ profile_anomaly_types: sql_flavor: redshift lookup_type: null lookup_query: |- - WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", ' '',.-', '')) = b.possible_standard_value GROUP BY 
a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT 500; + WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", ' '',.-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1449' test_id: '1017' @@ -77,7 +77,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", ' '',.-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT 500; + WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", ' '',.-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1188' test_id: '1017' @@ -85,5 +85,5 @@ profile_anomaly_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", ' '',.-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT 500; + WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", ' '',.-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT {LIMIT}; error_type: Profile Anomaly diff --git 
a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Suggested_Type.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Suggested_Type.yaml index 9763b98..0016e44 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Suggested_Type.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Suggested_Type.yaml @@ -28,7 +28,7 @@ profile_anomaly_types: FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC - LIMIT 20; + LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1273' test_id: '1001' @@ -36,7 +36,7 @@ profile_anomaly_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT 20; + SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1115' test_id: '1001' @@ -44,7 +44,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; + SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; error_type: Profile Anomaly - id: '1058' test_id: '1001' @@ -52,7 +52,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT 20; + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1033' test_id: '1001' @@ -60,7 +60,7 @@ profile_anomaly_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1433' test_id: '1001' @@ -68,7 +68,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1172' test_id: '1001' @@ -76,5 +76,5 @@ profile_anomaly_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Table_Pattern_Mismatch.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Table_Pattern_Mismatch.yaml index f8ea4ce..e31fd5d 100644 --- 
a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Table_Pattern_Mismatch.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Table_Pattern_Mismatch.yaml @@ -37,7 +37,8 @@ profile_anomaly_types: WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE TABLE' - ORDER BY table_name; + ORDER BY table_name + LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1280' test_id: '1008' @@ -48,7 +49,7 @@ profile_anomaly_types: \ JOIN information_schema.tables ON columns.table_name = tables.table_name AND\ \ columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}'\ \ AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE\ - \ TABLE' ORDER BY table_name; " + \ TABLE' ORDER BY table_name LIMIT {LIMIT};" error_type: Profile Anomaly - id: '1122' test_id: '1008' @@ -56,7 +57,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 500 column_name, columns.table_name FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE TABLE' ORDER BY table_name; + SELECT TOP {LIMIT} column_name, columns.table_name FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE TABLE' ORDER BY table_name; error_type: Profile Anomaly - id: '1065' test_id: '1008' @@ -64,7 +65,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT column_name, columns.table_name FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE TABLE' ORDER BY columns.table_name; + SELECT column_name, columns.table_name FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE TABLE' ORDER BY columns.table_name LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1040' test_id: '1008' @@ -72,7 +73,7 @@ profile_anomaly_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT column_name, table_name, data_type FROM information_schema.columns WHERE table_schema = '{TARGET_SCHEMA}' AND column_name = '{COLUMN_NAME}' ORDER BY data_type; + SELECT column_name, table_name, data_type FROM information_schema.columns WHERE table_schema = '{TARGET_SCHEMA}' AND column_name = '{COLUMN_NAME}' ORDER BY data_type LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1440' test_id: '1008' @@ -80,7 +81,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT columnname AS column_name, tablename AS table_name, external_type AS data_type FROM svv_external_columns WHERE schemaname = '{TARGET_SCHEMA}' AND columnname = '{COLUMN_NAME}' ORDER BY external_type; + SELECT columnname AS column_name, tablename AS table_name, external_type 
AS data_type FROM svv_external_columns WHERE schemaname = '{TARGET_SCHEMA}' AND columnname = '{COLUMN_NAME}' ORDER BY external_type LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1179' test_id: '1008' @@ -91,5 +92,5 @@ profile_anomaly_types: \ JOIN information_schema.tables ON columns.table_name = tables.table_name AND\ \ columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}'\ \ AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE\ - \ TABLE' ORDER BY table_name; " + \ TABLE' ORDER BY table_name LIMIT {LIMIT};" error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_Emails.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_Emails.yaml index a8574f9..1c5bbf1 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_Emails.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_Emails.yaml @@ -27,7 +27,7 @@ profile_anomaly_types: FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC - LIMIT 500; + LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1294' test_id: '1022' @@ -35,7 +35,7 @@ profile_anomaly_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500; + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1136' test_id: '1022' @@ -43,7 +43,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; + SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; error_type: Profile Anomaly - id: '1079' test_id: '1022' @@ -51,7 +51,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1054' test_id: '1022' @@ -59,7 +59,7 @@ profile_anomaly_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1454' test_id: '1022' @@ -67,7 +67,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1193' 
test_id: '1022' @@ -75,5 +75,5 @@ profile_anomaly_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_US_States.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_US_States.yaml index 0479026..68e6e2e 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_US_States.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_US_States.yaml @@ -29,7 +29,7 @@ profile_anomaly_types: FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC - LIMIT 500; + LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1293' test_id: '1021' @@ -37,7 +37,7 @@ profile_anomaly_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500; + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1135' test_id: '1021' @@ -45,7 +45,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; + SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; error_type: Profile Anomaly - id: '1078' test_id: '1021' @@ -53,7 +53,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1053' test_id: '1021' @@ -61,7 +61,7 @@ profile_anomaly_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1453' test_id: '1021' @@ -69,7 +69,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1192' test_id: '1021' @@ -77,5 +77,5 @@ profile_anomaly_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT 
"{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unlikely_Date_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unlikely_Date_Values.yaml index 23dc70f..ea033f9 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unlikely_Date_Values.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unlikely_Date_Values.yaml @@ -31,7 +31,7 @@ profile_anomaly_types: OR (CAST(`{COLUMN_NAME}` AS DATE) > DATE_ADD(CAST(CAST('{PROFILE_RUN_DATE}' AS DATETIME) AS DATE), INTERVAL 30 YEAR)) GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC - LIMIT 500; + LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1290' test_id: '1018' @@ -39,7 +39,7 @@ profile_anomaly_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, '{PROFILE_RUN_DATE}' :: DATE AS profile_run_date, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` a WHERE (`{COLUMN_NAME}` < '1900-01-01'::DATE) OR (`{COLUMN_NAME}` > '{PROFILE_RUN_DATE}' :: DATE + INTERVAL '30 year' ) GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500; + SELECT DISTINCT `{COLUMN_NAME}`, '{PROFILE_RUN_DATE}' :: DATE AS profile_run_date, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` a WHERE (`{COLUMN_NAME}` < '1900-01-01'::DATE) OR (`{COLUMN_NAME}` > '{PROFILE_RUN_DATE}' :: DATE + INTERVAL '30 year' ) GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1132' test_id: '1018' @@ -47,7 +47,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 500 "{COLUMN_NAME}", CAST( '{PROFILE_RUN_DATE}' AS DATE) AS profile_run_date, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a WHERE ("{COLUMN_NAME}" < CAST('1900-01-01' AS DATE) ) OR ("{COLUMN_NAME}" > DATEADD(YEAR, 30, CAST('{PROFILE_RUN_DATE}' AS DATE ))) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; + SELECT TOP {LIMIT} "{COLUMN_NAME}", CAST( '{PROFILE_RUN_DATE}' AS DATE) AS profile_run_date, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a WHERE ("{COLUMN_NAME}" < CAST('1900-01-01' AS DATE) ) OR ("{COLUMN_NAME}" > DATEADD(YEAR, 30, CAST('{PROFILE_RUN_DATE}' AS DATE ))) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; error_type: Profile Anomaly - id: '1075' test_id: '1018' @@ -55,7 +55,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", '{PROFILE_RUN_DATE}' :: DATE AS profile_run_date, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a WHERE ("{COLUMN_NAME}" < '1900-01-01'::DATE) OR ("{COLUMN_NAME}" > '{PROFILE_RUN_DATE}' :: DATE + INTERVAL '30 year' ) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", '{PROFILE_RUN_DATE}' :: DATE AS profile_run_date, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a WHERE ("{COLUMN_NAME}" < '1900-01-01'::DATE) OR ("{COLUMN_NAME}" > '{PROFILE_RUN_DATE}' :: DATE + INTERVAL '30 year' ) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1050' test_id: '1018' @@ -63,7 +63,7 @@ profile_anomaly_types: sql_flavor: 
redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", '{PROFILE_RUN_DATE}' :: DATE AS profile_run_date, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a WHERE ("{COLUMN_NAME}" < '1900-01-01'::DATE) OR ("{COLUMN_NAME}" > '{PROFILE_RUN_DATE}' :: DATE + INTERVAL '30 year' ) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", '{PROFILE_RUN_DATE}' :: DATE AS profile_run_date, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a WHERE ("{COLUMN_NAME}" < '1900-01-01'::DATE) OR ("{COLUMN_NAME}" > '{PROFILE_RUN_DATE}' :: DATE + INTERVAL '30 year' ) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1450' test_id: '1018' @@ -71,7 +71,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", '{PROFILE_RUN_DATE}' :: DATE AS profile_run_date, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a WHERE ("{COLUMN_NAME}" < '1900-01-01'::DATE) OR ("{COLUMN_NAME}" > '{PROFILE_RUN_DATE}' :: DATE + INTERVAL '30 year' ) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", '{PROFILE_RUN_DATE}' :: DATE AS profile_run_date, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a WHERE ("{COLUMN_NAME}" < '1900-01-01'::DATE) OR ("{COLUMN_NAME}" > '{PROFILE_RUN_DATE}' :: DATE + INTERVAL '30 year' ) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1189' test_id: '1018' @@ -79,5 +79,5 @@ profile_anomaly_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", '{PROFILE_RUN_DATE}' :: DATE AS profile_run_date, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a WHERE ("{COLUMN_NAME}" < '1900-01-01'::DATE) OR ("{COLUMN_NAME}" > '{PROFILE_RUN_DATE}' :: DATE + INTERVAL '30 year' ) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", '{PROFILE_RUN_DATE}' :: DATE AS profile_run_date, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a WHERE ("{COLUMN_NAME}" < '1900-01-01'::DATE) OR ("{COLUMN_NAME}" > '{PROFILE_RUN_DATE}' :: DATE + INTERVAL '30 year' ) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Variant_Coded_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Variant_Coded_Values.yaml index a935d7f..7ba7112 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Variant_Coded_Values.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Variant_Coded_Values.yaml @@ -30,7 +30,8 @@ profile_anomaly_types: WHERE LOWER(CAST(`{COLUMN_NAME}` AS STRING)) IN ( SELECT TRIM(val) FROM UNNEST(SPLIT(SUBSTR('{DETAIL_EXPRESSION}', STRPOS('{DETAIL_EXPRESSION}', ':') + 2), '|')) AS val ) - GROUP BY `{COLUMN_NAME}`; + GROUP BY `{COLUMN_NAME}` + LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1230' test_id: '1027' @@ -38,7 +39,7 @@ profile_anomaly_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE LOWER(`{COLUMN_NAME}`) IN (SELECT TRIM(value) FROM (SELECT EXPLODE(SPLIT(SUBSTRING('{DETAIL_EXPRESSION}', INSTR('{DETAIL_EXPRESSION}', ':') + 2), '\\|')) AS value)) GROUP BY `{COLUMN_NAME}`; + SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM 
`{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE LOWER(`{COLUMN_NAME}`) IN (SELECT TRIM(value) FROM (SELECT EXPLODE(SPLIT(SUBSTRING('{DETAIL_EXPRESSION}', INSTR('{DETAIL_EXPRESSION}', ':') + 2), '\\|')) AS value)) GROUP BY `{COLUMN_NAME}` LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1231' test_id: '1027' @@ -46,7 +47,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE LOWER("{COLUMN_NAME}") IN (SELECT trim(value) FROM STRING_SPLIT(SUBSTRING('{DETAIL_EXPRESSION}', CHARINDEX(':', '{DETAIL_EXPRESSION}') + 2, 999), '|')) GROUP BY "{COLUMN_NAME}"; + SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE LOWER("{COLUMN_NAME}") IN (SELECT trim(value) FROM STRING_SPLIT(SUBSTRING('{DETAIL_EXPRESSION}', CHARINDEX(':', '{DETAIL_EXPRESSION}') + 2, 999), '|')) GROUP BY "{COLUMN_NAME}"; error_type: Profile Anomaly - id: '1232' test_id: '1027' @@ -54,7 +55,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE LOWER("{COLUMN_NAME}") = ANY(STRING_TO_ARRAY(SUBSTRING('{DETAIL_EXPRESSION}', STRPOS('{DETAIL_EXPRESSION}', ':') + 2), '|')) GROUP BY "{COLUMN_NAME}"; + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE LOWER("{COLUMN_NAME}") = ANY(STRING_TO_ARRAY(SUBSTRING('{DETAIL_EXPRESSION}', STRPOS('{DETAIL_EXPRESSION}', ':') + 2), '|')) GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1229' test_id: '1027' @@ -62,7 +63,7 @@ profile_anomaly_types: sql_flavor: redshift lookup_type: null lookup_query: |- - WITH val_array AS (SELECT 1 as valkey, SPLIT_TO_ARRAY(SUBSTRING ('{DETAIL_EXPRESSION}', STRPOS('{DETAIL_EXPRESSION}', ':') + 2), '|') vals), val_list AS ( SELECT valkey, val::VARCHAR FROM val_array v, v.vals val ) SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" t INNER JOIN val_list v ON (LOWER("{COLUMN_NAME}") = v.val) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + WITH val_array AS (SELECT 1 as valkey, SPLIT_TO_ARRAY(SUBSTRING ('{DETAIL_EXPRESSION}', STRPOS('{DETAIL_EXPRESSION}', ':') + 2), '|') vals), val_list AS ( SELECT valkey, val::VARCHAR FROM val_array v, v.vals val ) SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" t INNER JOIN val_list v ON (LOWER("{COLUMN_NAME}") = v.val) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1458' test_id: '1027' @@ -70,7 +71,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - WITH val_array AS (SELECT 1 as valkey, SPLIT_TO_ARRAY(SUBSTRING ('{DETAIL_EXPRESSION}', STRPOS('{DETAIL_EXPRESSION}', ':') + 2), '|') vals), val_list AS ( SELECT valkey, val::VARCHAR FROM val_array v, v.vals val ) SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" t INNER JOIN val_list v ON (LOWER("{COLUMN_NAME}") = v.val) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + WITH val_array AS (SELECT 1 as valkey, SPLIT_TO_ARRAY(SUBSTRING ('{DETAIL_EXPRESSION}', STRPOS('{DETAIL_EXPRESSION}', ':') + 2), '|') vals), val_list AS ( SELECT valkey, val::VARCHAR FROM val_array v, v.vals val ) SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" t INNER JOIN val_list v ON (LOWER("{COLUMN_NAME}") = v.val) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 
{LIMIT}; error_type: Profile Anomaly - id: '1230' test_id: '1027' @@ -78,5 +79,5 @@ profile_anomaly_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE lower("{COLUMN_NAME}") IN (SELECT trim(value) FROM TABLE (FLATTEN(INPUT => SPLIT(SUBSTRING('{DETAIL_EXPRESSION}', POSITION(':', '{DETAIL_EXPRESSION}') + 2), '|'))) ) GROUP BY "{COLUMN_NAME}"; + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE lower("{COLUMN_NAME}") IN (SELECT trim(value) FROM TABLE (FLATTEN(INPUT => SPLIT(SUBSTRING('{DETAIL_EXPRESSION}', POSITION(':', '{DETAIL_EXPRESSION}') + 2), '|'))) ) GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance.yaml b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance.yaml index 57b2901..5b277a5 100644 --- a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance.yaml @@ -62,7 +62,8 @@ test_types: GROUP BY {GROUPBY_NAMES} ) s WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1333' test_id: '1500' @@ -86,7 +87,8 @@ test_types: {MATCH_HAVING_CONDITION} ) a GROUP BY {GROUPBY_NAMES} ) s WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1247' test_id: '1500' @@ -94,7 +96,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT * + SELECT TOP {LIMIT} * FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL FROM ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total @@ -134,7 +136,8 @@ test_types: {MATCH_HAVING_CONDITION} ) a GROUP BY {GROUPBY_NAMES} ) s WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1245' test_id: '1500' @@ -158,7 +161,8 @@ test_types: {MATCH_HAVING_CONDITION} ) a GROUP BY {GROUPBY_NAMES} ) s WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1462' test_id: '1500' @@ -182,7 +186,8 @@ test_types: {MATCH_HAVING_CONDITION} ) a GROUP BY {GROUPBY_NAMES} ) s WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1246' test_id: '1500' @@ -206,7 +211,8 @@ test_types: {MATCH_HAVING_CONDITION} ) a GROUP BY {GROUPBY_NAMES} ) s WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results test_templates: - id: '2506' diff --git a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Percent.yaml b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Percent.yaml index 
fcc0487..84b28ec 100644 --- a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Percent.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Percent.yaml @@ -64,7 +64,8 @@ test_types: WHERE (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0)) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1248' test_id: '1504' @@ -90,7 +91,8 @@ test_types: WHERE (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0)) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1247' test_id: '1504' @@ -98,7 +100,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT * + SELECT TOP {LIMIT} * FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL FROM ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total @@ -142,7 +144,8 @@ test_types: WHERE (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0)) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1245' test_id: '1504' @@ -168,7 +171,8 @@ test_types: WHERE (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0)) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1466' test_id: '1504' @@ -194,7 +198,8 @@ test_types: WHERE (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0)) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1246' test_id: '1504' @@ -220,7 +225,8 @@ test_types: WHERE (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0)) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results test_templates: - id: '2509' diff --git a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Range.yaml b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Range.yaml index 320ccc3..b4b03bc 100644 --- a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Range.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Range.yaml @@ -64,7 +64,8 @@ test_types: WHERE (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE}) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1245' test_id: '1505' @@ -90,7 +91,8 @@ test_types: WHERE (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) OR (total NOT BETWEEN 
match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE}) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1247' test_id: '1505' @@ -98,7 +100,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT * + SELECT TOP {LIMIT} * FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL FROM ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total @@ -142,7 +144,8 @@ test_types: WHERE (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE}) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1245' test_id: '1505' @@ -168,7 +171,8 @@ test_types: WHERE (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE}) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1467' test_id: '1505' @@ -194,7 +198,8 @@ test_types: WHERE (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE}) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1246' test_id: '1505' @@ -220,7 +225,8 @@ test_types: WHERE (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE}) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results test_templates: - id: '2510' diff --git a/testgen/template/dbsetup_test_types/test_types_Aggregate_Minimum.yaml b/testgen/template/dbsetup_test_types/test_types_Aggregate_Minimum.yaml index 58462bf..e5355a7 100644 --- a/testgen/template/dbsetup_test_types/test_types_Aggregate_Minimum.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Aggregate_Minimum.yaml @@ -62,7 +62,8 @@ test_types: GROUP BY {GROUPBY_NAMES} ) s WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1334' test_id: '1501' @@ -86,7 +87,8 @@ test_types: {MATCH_HAVING_CONDITION} ) a GROUP BY {GROUPBY_NAMES} ) s WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1251' test_id: '1501' @@ -94,7 +96,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT * + SELECT TOP {LIMIT} * FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL FROM ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total @@ -134,7 +136,8 @@ test_types: {MATCH_HAVING_CONDITION} ) a GROUP BY {GROUPBY_NAMES} ) s WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1249' test_id: '1501' @@ -158,7 +161,8 @@ test_types: {MATCH_HAVING_CONDITION} ) a GROUP BY {GROUPBY_NAMES} ) s WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT 
{LIMIT}; error_type: Test Results - id: '1463' test_id: '1501' @@ -182,7 +186,8 @@ test_types: {MATCH_HAVING_CONDITION} ) a GROUP BY {GROUPBY_NAMES} ) s WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1250' test_id: '1501' @@ -206,7 +211,8 @@ test_types: {MATCH_HAVING_CONDITION} ) a GROUP BY {GROUPBY_NAMES} ) s WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results test_templates: - id: '2502' diff --git a/testgen/template/dbsetup_test_types/test_types_Alpha_Trunc.yaml b/testgen/template/dbsetup_test_types/test_types_Alpha_Trunc.yaml index 97f00d8..3e9297e 100644 --- a/testgen/template/dbsetup_test_types/test_types_Alpha_Trunc.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Alpha_Trunc.yaml @@ -113,7 +113,7 @@ test_types: ) a WHERE LENGTH(CAST(`{COLUMN_NAME}` AS STRING)) = a.max_length AND a.max_length < {THRESHOLD_VALUE} - LIMIT 500; + LIMIT {LIMIT}; error_type: Test Results - id: '1298' test_id: '1004' @@ -121,7 +121,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}` , LEN(`{COLUMN_NAME}`) as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`, (SELECT MAX(LEN(`{COLUMN_NAME}`)) as max_length FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`) a WHERE LEN(`{COLUMN_NAME}`) = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT 500; + SELECT DISTINCT `{COLUMN_NAME}` , LEN(`{COLUMN_NAME}`) as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`, (SELECT MAX(LEN(`{COLUMN_NAME}`)) as max_length FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`) a WHERE LEN(`{COLUMN_NAME}`) = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT {LIMIT}; error_type: Test Results - id: '1140' test_id: '1004' @@ -129,7 +129,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT DISTINCT TOP 500 "{COLUMN_NAME}", LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} ; + SELECT DISTINCT TOP {LIMIT} "{COLUMN_NAME}", LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} ; error_type: Test Results - id: '1083' test_id: '1004' @@ -137,7 +137,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", LENGTH("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT MAX(LENGTH("{COLUMN_NAME}")) as max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE LENGTH("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", LENGTH("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT MAX(LENGTH("{COLUMN_NAME}")) as max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE LENGTH("{COLUMN_NAME}") = a.max_length AND 
a.max_length < {THRESHOLD_VALUE} LIMIT {LIMIT}; error_type: Test Results - id: '1001' test_id: '1004' @@ -145,7 +145,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT {LIMIT}; error_type: Test Results - id: '1401' test_id: '1004' @@ -153,7 +153,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT {LIMIT}; error_type: Test Results - id: '1197' test_id: '1004' @@ -161,6 +161,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}" , LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}" , LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Combo_Match.yaml b/testgen/template/dbsetup_test_types/test_types_Combo_Match.yaml index 3630cc0..2c02c15 100644 --- a/testgen/template/dbsetup_test_types/test_types_Combo_Match.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Combo_Match.yaml @@ -57,7 +57,8 @@ test_types: GROUP BY {MATCH_GROUPBY_NAMES} {MATCH_HAVING_CONDITION} ) test - ORDER BY {COLUMN_NAME_NO_QUOTES}; + ORDER BY {COLUMN_NAME_NO_QUOTES} + LIMIT {LIMIT}; error_type: Test Results - id: '1335' test_id: '1502' @@ -78,7 +79,8 @@ test_types: GROUP BY {MATCH_GROUPBY_NAMES} {MATCH_HAVING_CONDITION} ) test - ORDER BY {COLUMN_NAME_NO_QUOTES}; + ORDER BY {COLUMN_NAME_NO_QUOTES} + LIMIT {LIMIT}; error_type: Test Results - id: '1255' test_id: '1502' @@ -86,7 +88,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT * + SELECT TOP {LIMIT} * FROM ( SELECT {COLUMN_NAME_NO_QUOTES} FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE 
{SUBSET_CONDITION} @@ -120,7 +122,8 @@ test_types: GROUP BY {MATCH_GROUPBY_NAMES} {MATCH_HAVING_CONDITION} ) test - ORDER BY {COLUMN_NAME_NO_QUOTES}; + ORDER BY {COLUMN_NAME_NO_QUOTES} + LIMIT {LIMIT}; error_type: Test Results - id: '1253' test_id: '1502' @@ -141,7 +144,8 @@ test_types: GROUP BY {MATCH_GROUPBY_NAMES} {MATCH_HAVING_CONDITION} ) test - ORDER BY {COLUMN_NAME_NO_QUOTES}; + ORDER BY {COLUMN_NAME_NO_QUOTES} + LIMIT {LIMIT}; error_type: Test Results - id: '1464' test_id: '1502' @@ -162,7 +166,8 @@ test_types: GROUP BY {MATCH_GROUPBY_NAMES} {MATCH_HAVING_CONDITION} ) test - ORDER BY {COLUMN_NAME_NO_QUOTES}; + ORDER BY {COLUMN_NAME_NO_QUOTES} + LIMIT {LIMIT}; error_type: Test Results - id: '1254' test_id: '1502' @@ -183,7 +188,8 @@ test_types: GROUP BY {MATCH_GROUPBY_NAMES} {MATCH_HAVING_CONDITION} ) test - ORDER BY {COLUMN_NAME_NO_QUOTES}; + ORDER BY {COLUMN_NAME_NO_QUOTES} + LIMIT {LIMIT}; error_type: Test Results test_templates: - id: '2501' diff --git a/testgen/template/dbsetup_test_types/test_types_Condition_Flag.yaml b/testgen/template/dbsetup_test_types/test_types_Condition_Flag.yaml index 6f01a0b..fcde8ab 100644 --- a/testgen/template/dbsetup_test_types/test_types_Condition_Flag.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Condition_Flag.yaml @@ -110,7 +110,7 @@ test_types: SELECT * FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE {CUSTOM_QUERY} - LIMIT 500; + LIMIT {LIMIT}; error_type: Test Results - id: '1300' test_id: '1006' @@ -118,7 +118,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT * FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE {CUSTOM_QUERY} LIMIT 500; + SELECT * FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE {CUSTOM_QUERY} LIMIT {LIMIT}; error_type: Test Results - id: '1142' test_id: '1006' @@ -126,7 +126,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 500 * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {CUSTOM_QUERY}; + SELECT TOP {LIMIT} * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {CUSTOM_QUERY}; error_type: Test Results - id: '1085' test_id: '1006' @@ -134,7 +134,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {CUSTOM_QUERY} LIMIT 500; + SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {CUSTOM_QUERY} LIMIT {LIMIT}; error_type: Test Results - id: '1003' test_id: '1006' @@ -142,7 +142,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {CUSTOM_QUERY} LIMIT 500; + SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {CUSTOM_QUERY} LIMIT {LIMIT}; error_type: Test Results - id: '1403' test_id: '1006' @@ -150,7 +150,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {CUSTOM_QUERY} LIMIT 500; + SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {CUSTOM_QUERY} LIMIT {LIMIT}; error_type: Test Results - id: '1199' test_id: '1006' @@ -158,6 +158,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {CUSTOM_QUERY} LIMIT 500; + SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {CUSTOM_QUERY} LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Constant.yaml b/testgen/template/dbsetup_test_types/test_types_Constant.yaml index b9e5033..6752163 100644 --- a/testgen/template/dbsetup_test_types/test_types_Constant.yaml 
+++ b/testgen/template/dbsetup_test_types/test_types_Constant.yaml
@@ -110,7 +110,7 @@ test_types:
       FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`
       WHERE `{COLUMN_NAME}` <> {BASELINE_VALUE}
       GROUP BY `{COLUMN_NAME}`
-      LIMIT 500;
+      LIMIT {LIMIT};
     error_type: Test Results
   - id: '1301'
     test_id: '1007'
@@ -118,7 +118,7 @@ test_types:
     sql_flavor: databricks
     lookup_type: null
     lookup_query: |-
-      SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` <> {BASELINE_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT 500;
+      SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` <> {BASELINE_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT {LIMIT};
     error_type: Test Results
   - id: '1143'
     test_id: '1007'
@@ -126,7 +126,7 @@ test_types:
     sql_flavor: mssql
     lookup_type: null
     lookup_query: |-
-      SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}";
+      SELECT DISTINCT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}";
     error_type: Test Results
   - id: '1086'
     test_id: '1007'
@@ -134,7 +134,7 @@ test_types:
     sql_flavor: postgresql
     lookup_type: null
     lookup_query: |-
-      SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
+      SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT};
     error_type: Test Results
   - id: '1004'
     test_id: '1007'
@@ -142,7 +142,7 @@ test_types:
     sql_flavor: redshift
     lookup_type: null
     lookup_query: |-
-      SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
+      SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT};
     error_type: Test Results
   - id: '1404'
     test_id: '1007'
@@ -150,7 +150,7 @@ test_types:
     sql_flavor: redshift_spectrum
     lookup_type: null
     lookup_query: |-
-      SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
+      SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT};
     error_type: Test Results
   - id: '1200'
     test_id: '1007'
@@ -158,6 +158,6 @@ test_types:
     sql_flavor: snowflake
     lookup_type: null
     lookup_query: |-
-      SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500;
+      SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT};
     error_type: Test Results
 test_templates: []
diff --git a/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml
index d0d0ca2..7f341c3 100644
--- a/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml
+++ b/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml
@@ -141,7 +141,7 @@ test_types:
       LEFT
JOIN existing_periods e ON (p.prior_available_date = e.period) LEFT JOIN existing_periods f ON (p.next_available_date = f.period) ORDER BY p.missing_period - LIMIT 500; + LIMIT {LIMIT}; error_type: Test Results - id: '1302' test_id: '1009' @@ -149,7 +149,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - WITH date_bounds AS( SELECT MIN(`{COLUMN_NAME}`) AS min_date, MAX(`{COLUMN_NAME}`) AS max_date FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`), all_dates AS ( SELECT EXPLODE(SEQUENCE(min_date, max_date, INTERVAL 1 DAY)) AS all_dates FROM date_bounds ), existing_periods AS ( SELECT DISTINCT CAST(`{COLUMN_NAME}` AS DATE) AS period, COUNT(1) AS period_count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY CAST(`{COLUMN_NAME}` AS DATE) ), missing_dates AS ( SELECT d.all_dates AS missing_period FROM all_dates d LEFT JOIN existing_periods e ON d.all_dates = e.period WHERE e.period IS NULL ) SELECT m.missing_period, MAX(e1.period) AS prior_available_date, MAX(e1.period_count) AS prior_available_date_count, MIN(e2.period) AS next_available_date, MAX(e2.period_count) AS next_available_date_count FROM missing_dates m LEFT JOIN existing_periods e1 ON e1.period < m.missing_period LEFT JOIN existing_periods e2 ON e2.period > m.missing_period GROUP BY m.missing_period ORDER BY m.missing_period LIMIT 500; + WITH date_bounds AS( SELECT MIN(`{COLUMN_NAME}`) AS min_date, MAX(`{COLUMN_NAME}`) AS max_date FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`), all_dates AS ( SELECT EXPLODE(SEQUENCE(min_date, max_date, INTERVAL 1 DAY)) AS all_dates FROM date_bounds ), existing_periods AS ( SELECT DISTINCT CAST(`{COLUMN_NAME}` AS DATE) AS period, COUNT(1) AS period_count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY CAST(`{COLUMN_NAME}` AS DATE) ), missing_dates AS ( SELECT d.all_dates AS missing_period FROM all_dates d LEFT JOIN existing_periods e ON d.all_dates = e.period WHERE e.period IS NULL ) SELECT m.missing_period, MAX(e1.period) AS prior_available_date, MAX(e1.period_count) AS prior_available_date_count, MIN(e2.period) AS next_available_date, MAX(e2.period_count) AS next_available_date_count FROM missing_dates m LEFT JOIN existing_periods e1 ON e1.period < m.missing_period LEFT JOIN existing_periods e2 ON e2.period > m.missing_period GROUP BY m.missing_period ORDER BY m.missing_period LIMIT {LIMIT}; error_type: Test Results - id: '1144' test_id: '1009' @@ -188,7 +188,7 @@ test_types: FROM check_periods c LEFT JOIN data_by_period d ON (c.check_period = d.data_period) ) - SELECT check_period, record_ct, + SELECT TOP {LIMIT} check_period, record_ct, CASE WHEN record_ct = 0 THEN 'MISSING' ELSE 'Present' @@ -205,7 +205,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - WITH RECURSIVE daterange(all_dates) AS (SELECT MIN("{COLUMN_NAME}") :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT (d.all_dates :: DATE + INTERVAL '1 day') :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT MAX("{COLUMN_NAME}") :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT "{COLUMN_NAME}" :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" :: DATE ) SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_date_count, MIN(c.period) AS next_available_date, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_date_count FROM 
daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates LIMIT 500; + WITH RECURSIVE daterange(all_dates) AS (SELECT MIN("{COLUMN_NAME}") :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT (d.all_dates :: DATE + INTERVAL '1 day') :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT MAX("{COLUMN_NAME}") :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT "{COLUMN_NAME}" :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" :: DATE ) SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_date_count, MIN(c.period) AS next_available_date, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_date_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates LIMIT {LIMIT}; error_type: Test Results - id: '1005' test_id: '1009' @@ -213,7 +213,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - WITH RECURSIVE daterange(all_dates) AS (SELECT MIN("{COLUMN_NAME}") :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT DATEADD(DAY, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT MAX("{COLUMN_NAME}") :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT "{COLUMN_NAME}" :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" :: DATE ) SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_date_count, MIN(c.period) AS next_available_date, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_date_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates LIMIT 500; + WITH RECURSIVE daterange(all_dates) AS (SELECT MIN("{COLUMN_NAME}") :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT DATEADD(DAY, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT MAX("{COLUMN_NAME}") :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT "{COLUMN_NAME}" :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" :: DATE ) SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_date_count, MIN(c.period) AS next_available_date, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_date_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period 
< d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates LIMIT {LIMIT}; error_type: Test Results - id: '1405' test_id: '1009' @@ -221,7 +221,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - WITH RECURSIVE daterange(all_dates) AS (SELECT MIN("{COLUMN_NAME}") :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT DATEADD(DAY, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT MAX("{COLUMN_NAME}") :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT "{COLUMN_NAME}" :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" :: DATE ) SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_date_count, MIN(c.period) AS next_available_date, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_date_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates LIMIT 500; + WITH RECURSIVE daterange(all_dates) AS (SELECT MIN("{COLUMN_NAME}") :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT DATEADD(DAY, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT MAX("{COLUMN_NAME}") :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT "{COLUMN_NAME}" :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" :: DATE ) SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_date_count, MIN(c.period) AS next_available_date, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_date_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates LIMIT {LIMIT}; error_type: Test Results - id: '1201' test_id: '1009' @@ -229,6 +229,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - WITH RECURSIVE daterange(all_dates) AS (SELECT MIN("{COLUMN_NAME}") :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT DATEADD(DAY, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT MAX("{COLUMN_NAME}") :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT "{COLUMN_NAME}" :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" :: DATE ) SELECT p.missing_period, p.prior_available_date, e.period_count as prior_available_date_count, p.next_available_date, f.period_count as next_available_date_count FROM (SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, MIN(c.period) AS next_available_date FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT 
JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates) p LEFT JOIN existing_periods e ON (p.prior_available_date = e.period) LEFT JOIN existing_periods f ON (p.next_available_date = f.period) ORDER BY p.missing_period LIMIT 500; + WITH RECURSIVE daterange(all_dates) AS (SELECT MIN("{COLUMN_NAME}") :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT DATEADD(DAY, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT MAX("{COLUMN_NAME}") :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT "{COLUMN_NAME}" :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" :: DATE ) SELECT p.missing_period, p.prior_available_date, e.period_count as prior_available_date_count, p.next_available_date, f.period_count as next_available_date_count FROM (SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, MIN(c.period) AS next_available_date FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates) p LEFT JOIN existing_periods e ON (p.prior_available_date = e.period) LEFT JOIN existing_periods f ON (p.next_available_date = f.period) ORDER BY p.missing_period LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Dec_Trunc.yaml b/testgen/template/dbsetup_test_types/test_types_Dec_Trunc.yaml index b7554ca..ffa38aa 100644 --- a/testgen/template/dbsetup_test_types/test_types_Dec_Trunc.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Dec_Trunc.yaml @@ -110,7 +110,7 @@ test_types: SELECT DISTINCT LENGTH(SPLIT(CAST(`{COLUMN_NAME}` AS STRING), '.')[SAFE_OFFSET(1)]) AS decimal_scale, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY decimal_scale - LIMIT 500; + LIMIT {LIMIT}; error_type: Test Results - id: '1303' test_id: '1011' @@ -118,7 +118,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT LENGTH(SPLIT_PART(`{COLUMN_NAME}`::STRING, '.', 2)) AS decimal_scale, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY decimal_scale LIMIT 500; + SELECT DISTINCT LENGTH(SPLIT_PART(`{COLUMN_NAME}`::STRING, '.', 2)) AS decimal_scale, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY decimal_scale LIMIT {LIMIT}; error_type: Test Results - id: '1145' test_id: '1011' @@ -130,7 +130,7 @@ test_types: SELECT LEN(SUBSTRING(CAST(ABS("{COLUMN_NAME}") % 1 AS VARCHAR), 3, LEN("{COLUMN_NAME}"))) AS decimal_scale FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" ) - SELECT DISTINCT TOP 500 decimal_scale, COUNT(*) AS count + SELECT DISTINCT TOP {LIMIT} decimal_scale, COUNT(*) AS count FROM cte GROUP BY decimal_scale ORDER BY COUNT(*) DESC; error_type: Test Results - id: '1088' @@ -139,7 +139,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT LENGTH(SPLIT_PART("{COLUMN_NAME}" :: TEXT, '.', 2)) AS decimal_scale, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY decimal_scale LIMIT 500; + SELECT DISTINCT LENGTH(SPLIT_PART("{COLUMN_NAME}" :: TEXT, '.', 2)) AS decimal_scale, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY 
decimal_scale LIMIT {LIMIT}; error_type: Test Results - id: '1006' test_id: '1011' @@ -147,7 +147,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT DECIMAL_SCALE("{COLUMN_NAME}" :: SUPER) AS decimal_scale, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DECIMAL_SCALE("{COLUMN_NAME}" :: SUPER) LIMIT 500; + SELECT DISTINCT DECIMAL_SCALE("{COLUMN_NAME}" :: SUPER) AS decimal_scale, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DECIMAL_SCALE("{COLUMN_NAME}" :: SUPER) LIMIT {LIMIT}; error_type: Test Results - id: '1406' test_id: '1011' @@ -155,7 +155,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT DECIMAL_SCALE("{COLUMN_NAME}" :: SUPER) AS decimal_scale, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DECIMAL_SCALE("{COLUMN_NAME}" :: SUPER) LIMIT 500; + SELECT DISTINCT DECIMAL_SCALE("{COLUMN_NAME}" :: SUPER) AS decimal_scale, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DECIMAL_SCALE("{COLUMN_NAME}" :: SUPER) LIMIT {LIMIT}; error_type: Test Results - id: '1202' test_id: '1011' @@ -163,6 +163,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT LENGTH(SPLIT_PART("{COLUMN_NAME}" :: TEXT, '.', 2)) AS decimal_scale, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY decimal_scale LIMIT 500; + SELECT DISTINCT LENGTH(SPLIT_PART("{COLUMN_NAME}" :: TEXT, '.', 2)) AS decimal_scale, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY decimal_scale LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Distinct_Date_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Distinct_Date_Ct.yaml index e27cdb9..1762b55 100644 --- a/testgen/template/dbsetup_test_types/test_types_Distinct_Date_Ct.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Distinct_Date_Ct.yaml @@ -112,7 +112,7 @@ test_types: WHERE `{COLUMN_NAME}` IS NOT NULL GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC - LIMIT 500; + LIMIT {LIMIT}; error_type: Test Results - id: '1304' test_id: '1012' @@ -120,7 +120,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` IS NOT NULL GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500; + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` IS NOT NULL GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT {LIMIT}; error_type: Test Results - id: '1146' test_id: '1012' @@ -128,7 +128,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; + SELECT DISTINCT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; error_type: Test Results - id: '1089' test_id: '1012' @@ -136,7 +136,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", 
COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Test Results - id: '1007' test_id: '1012' @@ -144,7 +144,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Test Results - id: '1407' test_id: '1012' @@ -152,7 +152,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Test Results - id: '1203' test_id: '1012' @@ -160,6 +160,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Distinct_Value_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Distinct_Value_Ct.yaml index d0382cb..9e43a2b 100644 --- a/testgen/template/dbsetup_test_types/test_types_Distinct_Value_Ct.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Distinct_Value_Ct.yaml @@ -111,7 +111,7 @@ test_types: WHERE `{COLUMN_NAME}` IS NOT NULL GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC - LIMIT 500; + LIMIT {LIMIT}; error_type: Test Results - id: '1305' test_id: '1013' @@ -119,7 +119,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` IS NOT NULL GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500; + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` IS NOT NULL GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT {LIMIT}; error_type: Test Results - id: '1147' test_id: '1013' @@ -127,7 +127,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; + SELECT DISTINCT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; error_type: Test Results - id: '1090' test_id: '1013' @@ -135,7 +135,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM 
"{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Test Results - id: '1008' test_id: '1013' @@ -143,7 +143,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Test Results - id: '1408' test_id: '1013' @@ -151,7 +151,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Test Results - id: '1204' test_id: '1013' @@ -159,6 +159,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Distribution_Shift.yaml b/testgen/template/dbsetup_test_types/test_types_Distribution_Shift.yaml index cd7f6c0..8b5bcce 100644 --- a/testgen/template/dbsetup_test_types/test_types_Distribution_Shift.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Distribution_Shift.yaml @@ -52,7 +52,8 @@ test_types: GROUP BY {CONCAT_COLUMNS} ) SELECT * - FROM latest_ver; + FROM latest_ver + LIMIT {LIMIT}; error_type: Test Results - id: '1336' test_id: '1503' @@ -79,6 +80,7 @@ test_types: FULL JOIN older_ver o ON (l.category = o.category) ORDER BY COALESCE(l.category, o.category) + LIMIT {LIMIT}; error_type: Test Results - id: '1259' test_id: '1503' @@ -98,13 +100,13 @@ test_types: FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} v2 WHERE {MATCH_SUBSET_CONDITION} GROUP BY {MATCH_GROUPBY_NAMES} ) - SELECT COALESCE(l.category, o.category) AS category, + SELECT TOP {LIMIT} COALESCE(l.category, o.category) AS category, o.pct_of_total AS old_pct, l.pct_of_total AS new_pct FROM latest_ver l FULL JOIN older_ver o ON (l.category = o.category) - ORDER BY COALESCE(l.category, o.category) + ORDER BY COALESCE(l.category, o.category); error_type: Test Results - id: '1260' test_id: '1503' @@ -131,6 +133,7 @@ test_types: FULL JOIN older_ver o ON (l.category = o.category) ORDER BY COALESCE(l.category, o.category) + LIMIT {LIMIT}; error_type: Test Results - id: '1257' test_id: '1503' @@ -157,6 +160,7 @@ test_types: FULL JOIN older_ver o ON (l.category = o.category) ORDER BY COALESCE(l.category, o.category) + LIMIT {LIMIT}; error_type: 
Test Results - id: '1465' test_id: '1503' @@ -183,6 +187,7 @@ test_types: FULL JOIN older_ver o ON (l.category = o.category) ORDER BY COALESCE(l.category, o.category) + LIMIT {LIMIT}; error_type: Test Results - id: '1258' test_id: '1503' @@ -209,6 +214,7 @@ test_types: FULL JOIN older_ver o ON (l.category = o.category) ORDER BY COALESCE(l.category, o.category) + LIMIT {LIMIT}; error_type: Test Results test_templates: - id: '2503' diff --git a/testgen/template/dbsetup_test_types/test_types_Dupe_Rows.yaml b/testgen/template/dbsetup_test_types/test_types_Dupe_Rows.yaml index af83785..a186f74 100644 --- a/testgen/template/dbsetup_test_types/test_types_Dupe_Rows.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Dupe_Rows.yaml @@ -48,7 +48,8 @@ test_types: WHERE {SUBSET_CONDITION} GROUP BY {GROUPBY_NAMES} HAVING COUNT(*) > 1 - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1257' test_id: '1510' @@ -62,6 +63,7 @@ test_types: GROUP BY {GROUPBY_NAMES} HAVING COUNT(*) > 1 ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1255' test_id: '1510' @@ -69,7 +71,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct + SELECT TOP {LIMIT} {GROUPBY_NAMES}, COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {SUBSET_CONDITION} GROUP BY {GROUPBY_NAMES} @@ -88,6 +90,7 @@ test_types: GROUP BY {GROUPBY_NAMES} HAVING COUNT(*) > 1 ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1253' test_id: '1510' @@ -101,6 +104,7 @@ test_types: GROUP BY {GROUPBY_NAMES} HAVING COUNT(*) > 1 ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1472' test_id: '1510' @@ -114,6 +118,7 @@ test_types: GROUP BY {GROUPBY_NAMES} HAVING COUNT(*) > 1 ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1254' test_id: '1510' @@ -127,6 +132,7 @@ test_types: GROUP BY {GROUPBY_NAMES} HAVING COUNT(*) > 1 ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results test_templates: - id: '2511' diff --git a/testgen/template/dbsetup_test_types/test_types_Email_Format.yaml b/testgen/template/dbsetup_test_types/test_types_Email_Format.yaml index 928f981..7cebba6 100644 --- a/testgen/template/dbsetup_test_types/test_types_Email_Format.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Email_Format.yaml @@ -110,7 +110,7 @@ test_types: FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE NOT REGEXP_CONTAINS(CAST(`{COLUMN_NAME}` AS STRING), r'^[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}$') GROUP BY `{COLUMN_NAME}` - LIMIT 500; + LIMIT {LIMIT}; error_type: Test Results - id: '1306' test_id: '1014' @@ -118,7 +118,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE REGEXP_LIKE(`{COLUMN_NAME}`::STRING, '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$') != 1 GROUP BY `{COLUMN_NAME}` LIMIT 500; + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE REGEXP_LIKE(`{COLUMN_NAME}`::STRING, '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$') != 1 GROUP BY `{COLUMN_NAME}` LIMIT {LIMIT}; error_type: Test Results - id: '1148' test_id: '1014' @@ -126,7 +126,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" NOT LIKE 
'%[_a-zA-Z0-9.-]%@%[a-zA-Z0-9.-]%.[a-zA-Z][a-zA-Z]%' GROUP BY "{COLUMN_NAME}"; + SELECT DISTINCT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" NOT LIKE '%[_a-zA-Z0-9.-]%@%[a-zA-Z0-9.-]%.[a-zA-Z][a-zA-Z]%' GROUP BY "{COLUMN_NAME}"; error_type: Test Results - id: '1091' test_id: '1014' @@ -134,7 +134,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" !~ '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" !~ '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1009' test_id: '1014' @@ -142,7 +142,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" !~ '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" !~ '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1409' test_id: '1014' @@ -150,7 +150,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" !~ '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" !~ '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1205' test_id: '1014' @@ -158,6 +158,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$') != 1 GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$') != 1 GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Future_Date.yaml b/testgen/template/dbsetup_test_types/test_types_Future_Date.yaml index 3f02dd2..5aab6fc 100644 --- a/testgen/template/dbsetup_test_types/test_types_Future_Date.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Future_Date.yaml @@ -109,7 +109,7 @@ test_types: FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE DATETIME_DIFF(`{COLUMN_NAME}`, CAST(CAST('{TEST_DATE}' AS DATETIME) AS {COLUMN_TYPE}), DAY) > {THRESHOLD_VALUE} GROUP BY `{COLUMN_NAME}` - LIMIT 500; + LIMIT {LIMIT}; error_type: Test Results - id: '1307' test_id: '1015' @@ -117,7 +117,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE GREATEST(0, SIGN(`{COLUMN_NAME}`::DATE - '{TEST_DATE}'::DATE)) > {THRESHOLD_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT 500; + SELECT DISTINCT 
`{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE GREATEST(0, SIGN(`{COLUMN_NAME}`::DATE - '{TEST_DATE}'::DATE)) > {THRESHOLD_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT {LIMIT}; error_type: Test Results - id: '1149' test_id: '1015' @@ -125,7 +125,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CAST("{COLUMN_NAME}" AS DATE) >= CONVERT(DATE, '{TEST_DATE}') GROUP BY "{COLUMN_NAME}"; + SELECT DISTINCT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CAST("{COLUMN_NAME}" AS DATE) >= CONVERT(DATE, '{TEST_DATE}') GROUP BY "{COLUMN_NAME}"; error_type: Test Results - id: '1092' test_id: '1015' @@ -133,7 +133,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - '{TEST_DATE}'::DATE)) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - '{TEST_DATE}'::DATE)) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1010' test_id: '1015' @@ -141,7 +141,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - '{TEST_DATE}'::DATE)) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - '{TEST_DATE}'::DATE)) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1410' test_id: '1015' @@ -149,7 +149,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - '{TEST_DATE}'::DATE)) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - '{TEST_DATE}'::DATE)) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1206' test_id: '1015' @@ -157,6 +157,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - '{TEST_DATE}'::DATE)) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - '{TEST_DATE}'::DATE)) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Future_Date_1Y.yaml b/testgen/template/dbsetup_test_types/test_types_Future_Date_1Y.yaml index 0ce7f4a..a11ceba 100644 --- a/testgen/template/dbsetup_test_types/test_types_Future_Date_1Y.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Future_Date_1Y.yaml @@ -110,7 +110,7 @@ test_types: FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE DATETIME_DIFF(`{COLUMN_NAME}`, 
DATE_ADD(CAST(CAST('{TEST_DATE}' AS DATETIME) AS {COLUMN_TYPE}), INTERVAL 365 DAY), DAY) > {THRESHOLD_VALUE} GROUP BY `{COLUMN_NAME}` - LIMIT 500; + LIMIT {LIMIT}; error_type: Test Results - id: '1308' test_id: '1016' @@ -118,7 +118,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE GREATEST(0, SIGN(`{COLUMN_NAME}`::DATE - ('{TEST_DATE}'::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT 500; + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE GREATEST(0, SIGN(`{COLUMN_NAME}`::DATE - ('{TEST_DATE}'::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT {LIMIT}; error_type: Test Results - id: '1150' test_id: '1016' @@ -126,7 +126,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CAST("{COLUMN_NAME}" AS DATE) >= DATEADD(DAY, 365, CONVERT(DATE, '{TEST_DATE}')) GROUP BY "{COLUMN_NAME}"; + SELECT DISTINCT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CAST("{COLUMN_NAME}" AS DATE) >= DATEADD(DAY, 365, CONVERT(DATE, '{TEST_DATE}')) GROUP BY "{COLUMN_NAME}"; error_type: Test Results - id: '1093' test_id: '1016' @@ -134,7 +134,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - ('{TEST_DATE}'::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - ('{TEST_DATE}'::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1011' test_id: '1016' @@ -142,7 +142,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - ('{TEST_DATE}'::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - ('{TEST_DATE}'::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1411' test_id: '1016' @@ -150,7 +150,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - ('{TEST_DATE}'::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - ('{TEST_DATE}'::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1207' test_id: '1016' @@ -158,6 +158,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - ('{TEST_DATE}'::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM 
"{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - ('{TEST_DATE}'::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_LOV_All.yaml b/testgen/template/dbsetup_test_types/test_types_LOV_All.yaml index 96b8e33..3668681 100644 --- a/testgen/template/dbsetup_test_types/test_types_LOV_All.yaml +++ b/testgen/template/dbsetup_test_types/test_types_LOV_All.yaml @@ -110,7 +110,7 @@ test_types: FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` ) WHERE lov <> '{THRESHOLD_VALUE}' - LIMIT 500; + LIMIT {LIMIT}; error_type: Test Results - id: '1310' test_id: '1018' @@ -118,7 +118,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT ARRAY_JOIN(ARRAY_SORT(COLLECT_SET(`{COLUMN_NAME}`)), '|') AS aggregated_values FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` HAVING ARRAY_JOIN(ARRAY_SORT(COLLECT_SET(`{COLUMN_NAME}`)), '|') <> '{THRESHOLD_VALUE}' LIMIT 500; + SELECT ARRAY_JOIN(ARRAY_SORT(COLLECT_SET(`{COLUMN_NAME}`)), '|') AS aggregated_values FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` HAVING ARRAY_JOIN(ARRAY_SORT(COLLECT_SET(`{COLUMN_NAME}`)), '|') <> '{THRESHOLD_VALUE}' LIMIT {LIMIT}; error_type: Test Results - id: '1152' test_id: '1018' @@ -126,7 +126,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - WITH CTE AS (SELECT DISTINCT "{COLUMN_NAME}" FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") SELECT STRING_AGG( "{COLUMN_NAME}", '|' ) WITHIN GROUP (ORDER BY "{COLUMN_NAME}" ASC) FROM CTE HAVING STRING_AGG("{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}" ASC) <> '{THRESHOLD_VALUE}'; + WITH CTE AS (SELECT DISTINCT "{COLUMN_NAME}" FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") SELECT TOP {LIMIT} STRING_AGG( "{COLUMN_NAME}", '|' ) WITHIN GROUP (ORDER BY "{COLUMN_NAME}" ASC) FROM CTE HAVING STRING_AGG("{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}" ASC) <> '{THRESHOLD_VALUE}'; error_type: Test Results - id: '1095' test_id: '1018' @@ -134,7 +134,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT STRING_AGG(DISTINCT "{COLUMN_NAME}", '|' ORDER BY "{COLUMN_NAME}" ASC) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" HAVING STRING_AGG(DISTINCT "{COLUMN_NAME}", '|' ORDER BY "{COLUMN_NAME}" ASC) <> '{THRESHOLD_VALUE}' LIMIT 500; + SELECT STRING_AGG(DISTINCT "{COLUMN_NAME}", '|' ORDER BY "{COLUMN_NAME}" ASC) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" HAVING STRING_AGG(DISTINCT "{COLUMN_NAME}", '|' ORDER BY "{COLUMN_NAME}" ASC) <> '{THRESHOLD_VALUE}' LIMIT {LIMIT}; error_type: Test Results - id: '1013' test_id: '1018' @@ -142,7 +142,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" HAVING LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") <> '{THRESHOLD_VALUE}' LIMIT 500; + SELECT LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" HAVING LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") <> '{THRESHOLD_VALUE}' LIMIT {LIMIT}; error_type: Test Results - id: '1413' test_id: '1018' @@ -150,7 +150,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" HAVING LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP 
(ORDER BY "{COLUMN_NAME}") <> '{THRESHOLD_VALUE}' LIMIT 500; + SELECT LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" HAVING LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") <> '{THRESHOLD_VALUE}' LIMIT {LIMIT}; error_type: Test Results - id: '1209' test_id: '1018' @@ -158,6 +158,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" HAVING LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") <> '{THRESHOLD_VALUE}' LIMIT 500; + SELECT LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" HAVING LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") <> '{THRESHOLD_VALUE}' LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_LOV_Match.yaml b/testgen/template/dbsetup_test_types/test_types_LOV_Match.yaml index 6d80ebc..6f2aa12 100644 --- a/testgen/template/dbsetup_test_types/test_types_LOV_Match.yaml +++ b/testgen/template/dbsetup_test_types/test_types_LOV_Match.yaml @@ -214,7 +214,7 @@ test_types: FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE NULLIF(`{COLUMN_NAME}`, '') NOT IN {BASELINE_VALUE} GROUP BY `{COLUMN_NAME}` - LIMIT 500; + LIMIT {LIMIT}; error_type: Test Results - id: '1311' test_id: '1019' @@ -222,7 +222,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT NULLIF(`{COLUMN_NAME}`, '') AS `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE NULLIF(`{COLUMN_NAME}`, '') NOT IN {BASELINE_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT 500; + SELECT DISTINCT NULLIF(`{COLUMN_NAME}`, '') AS `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE NULLIF(`{COLUMN_NAME}`, '') NOT IN {BASELINE_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT {LIMIT}; error_type: Test Results - id: '1153' test_id: '1019' @@ -230,7 +230,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT DISTINCT TOP 500 NULLIF("{COLUMN_NAME}", '') AS "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" ; + SELECT DISTINCT TOP {LIMIT} NULLIF("{COLUMN_NAME}", '') AS "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" ; error_type: Test Results - id: '1096' test_id: '1019' @@ -238,7 +238,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT NULLIF("{COLUMN_NAME}", '') AS "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT NULLIF("{COLUMN_NAME}", '') AS "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1014' test_id: '1019' @@ -246,7 +246,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT NULLIF("{COLUMN_NAME}", '') AS "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN {BASELINE_VALUE} GROUP BY 
"{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT NULLIF("{COLUMN_NAME}", '') AS "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1414' test_id: '1019' @@ -254,7 +254,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT NULLIF("{COLUMN_NAME}", '') AS "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT NULLIF("{COLUMN_NAME}", '') AS "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1210' test_id: '1019' @@ -262,6 +262,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT NULLIF("{COLUMN_NAME}", '') AS "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT NULLIF("{COLUMN_NAME}", '') AS "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Min_Date.yaml b/testgen/template/dbsetup_test_types/test_types_Min_Date.yaml index 939dc27..698d63a 100644 --- a/testgen/template/dbsetup_test_types/test_types_Min_Date.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Min_Date.yaml @@ -110,7 +110,7 @@ test_types: FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE CAST(`{COLUMN_NAME}` AS DATE) < CAST(CAST('{BASELINE_VALUE}' AS DATETIME) AS DATE) GROUP BY `{COLUMN_NAME}` - LIMIT 500; + LIMIT {LIMIT}; error_type: Test Results - id: '1312' test_id: '1020' @@ -118,7 +118,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` :: DATE < '{BASELINE_VALUE}' :: DATE GROUP BY `{COLUMN_NAME}` LIMIT 500; + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` :: DATE < '{BASELINE_VALUE}' :: DATE GROUP BY `{COLUMN_NAME}` LIMIT {LIMIT}; error_type: Test Results - id: '1154' test_id: '1020' @@ -126,7 +126,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CAST("{COLUMN_NAME}" AS DATE) < CAST('{BASELINE_VALUE}' AS DATE) GROUP BY "{COLUMN_NAME}"; + SELECT DISTINCT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CAST("{COLUMN_NAME}" AS DATE) < CAST('{BASELINE_VALUE}' AS DATE) GROUP BY "{COLUMN_NAME}"; error_type: Test Results - id: '1097' test_id: '1020' @@ -134,7 +134,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" :: DATE < '{BASELINE_VALUE}' :: DATE GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" :: DATE < '{BASELINE_VALUE}' :: DATE GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; 
error_type: Test Results - id: '1015' test_id: '1020' @@ -142,7 +142,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" :: DATE < '{BASELINE_VALUE}' :: DATE GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" :: DATE < '{BASELINE_VALUE}' :: DATE GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1415' test_id: '1020' @@ -150,7 +150,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" :: DATE < '{BASELINE_VALUE}' :: DATE GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" :: DATE < '{BASELINE_VALUE}' :: DATE GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1211' test_id: '1020' @@ -158,6 +158,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" :: DATE < '{BASELINE_VALUE}' :: DATE GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" :: DATE < '{BASELINE_VALUE}' :: DATE GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Min_Val.yaml b/testgen/template/dbsetup_test_types/test_types_Min_Val.yaml index 8563d33..ea5b7d5 100644 --- a/testgen/template/dbsetup_test_types/test_types_Min_Val.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Min_Val.yaml @@ -109,7 +109,7 @@ test_types: SELECT DISTINCT `{COLUMN_NAME}`, (ABS(CAST(`{COLUMN_NAME}` AS NUMERIC)) - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE CAST(`{COLUMN_NAME}` AS NUMERIC) < {BASELINE_VALUE} - LIMIT 500; + LIMIT {LIMIT}; error_type: Test Results - id: '1313' test_id: '1021' @@ -117,7 +117,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, (ABS(`{COLUMN_NAME}`) - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` < {BASELINE_VALUE} LIMIT 500; + SELECT DISTINCT `{COLUMN_NAME}`, (ABS(`{COLUMN_NAME}`) - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` < {BASELINE_VALUE} LIMIT {LIMIT}; error_type: Test Results - id: '1155' test_id: '1021' @@ -125,7 +125,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT DISTINCT TOP 500 "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < {BASELINE_VALUE}; + SELECT DISTINCT TOP {LIMIT} "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < {BASELINE_VALUE}; error_type: Test Results - id: '1098' test_id: '1021' @@ -133,7 +133,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM 
"{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} LIMIT {LIMIT}; error_type: Test Results - id: '1016' test_id: '1021' @@ -141,7 +141,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} LIMIT {LIMIT}; error_type: Test Results - id: '1416' test_id: '1021' @@ -149,7 +149,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} LIMIT {LIMIT}; error_type: Test Results - id: '1212' test_id: '1021' @@ -157,6 +157,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Missing_Pct.yaml b/testgen/template/dbsetup_test_types/test_types_Missing_Pct.yaml index 67069e2..7598d6e 100644 --- a/testgen/template/dbsetup_test_types/test_types_Missing_Pct.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Missing_Pct.yaml @@ -110,7 +110,7 @@ test_types: SELECT * FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` IS NULL OR CAST(`{COLUMN_NAME}` AS STRING) = '' - LIMIT 10; + LIMIT {LIMIT}; error_type: Test Results - id: '1314' test_id: '1022' @@ -118,7 +118,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT * FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` IS NULL OR `{COLUMN_NAME}` :: VARCHAR(255) = '' LIMIT 10; + SELECT * FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` IS NULL OR `{COLUMN_NAME}` :: VARCHAR(255) = '' LIMIT {LIMIT}; error_type: Test Results - id: '1156' test_id: '1022' @@ -126,7 +126,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 10 * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL OR CAST("{COLUMN_NAME}" AS VARCHAR(255)) = ''; + SELECT TOP {LIMIT} * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL OR CAST("{COLUMN_NAME}" AS VARCHAR(255)) = ''; error_type: Test Results - id: '1099' test_id: '1022' @@ -134,7 +134,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL OR "{COLUMN_NAME}" :: VARCHAR(255) = '' 
LIMIT 10; + SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL OR "{COLUMN_NAME}" :: VARCHAR(255) = '' LIMIT {LIMIT}; error_type: Test Results - id: '1017' test_id: '1022' @@ -142,7 +142,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT TOP 10 * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL OR "{COLUMN_NAME}" :: VARCHAR(255) = '' ; + SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL OR "{COLUMN_NAME}" :: VARCHAR(255) = '' LIMIT {LIMIT}; error_type: Test Results - id: '1417' test_id: '1022' @@ -150,7 +150,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT TOP 10 * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL OR "{COLUMN_NAME}" :: VARCHAR(255) = '' ; + SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL OR "{COLUMN_NAME}" :: VARCHAR(255) = '' LIMIT {LIMIT}; error_type: Test Results - id: '1213' test_id: '1022' @@ -158,6 +158,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT TOP 10 * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL OR "{COLUMN_NAME}" :: VARCHAR(255) = '' ; + SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL OR "{COLUMN_NAME}" :: VARCHAR(255) = '' LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Monthly_Rec_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Monthly_Rec_Ct.yaml index af459f0..0f155ed 100644 --- a/testgen/template/dbsetup_test_types/test_types_Monthly_Rec_Ct.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Monthly_Rec_Ct.yaml @@ -136,7 +136,8 @@ test_types: FROM p LEFT JOIN existing_periods e ON (p.prior_available_month = e.period) LEFT JOIN existing_periods f ON (p.next_available_month = f.period) - ORDER BY p.missing_period; + ORDER BY p.missing_period + LIMIT {LIMIT}; error_type: Test Results - id: '1315' test_id: '1023' @@ -144,7 +145,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - WITH daterange AS( SELECT explode( sequence( date_trunc('month', (SELECT MIN(`{COLUMN_NAME}`) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`)), date_trunc('month', (SELECT MAX(`{COLUMN_NAME}`) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`)), interval 1 month) ) AS all_dates ), existing_periods AS ( SELECT DISTINCT date_trunc('month', `{COLUMN_NAME}`) AS period, COUNT(1) AS period_count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY date_trunc('month', `{COLUMN_NAME}`) ) SELECT p.missing_period, p.prior_available_month, e.period_count AS prior_available_month_count, p.next_available_month, f.period_count AS next_available_month_count FROM ( SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_month, MIN(c.period) AS next_available_month FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ) p LEFT JOIN existing_periods e ON p.prior_available_month = e.period LEFT JOIN existing_periods f ON p.next_available_month = f.period ORDER BY p.missing_period; + WITH daterange AS( SELECT explode( sequence( date_trunc('month', (SELECT MIN(`{COLUMN_NAME}`) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`)), date_trunc('month', (SELECT MAX(`{COLUMN_NAME}`) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`)), interval 1 
month) ) AS all_dates ), existing_periods AS ( SELECT DISTINCT date_trunc('month', `{COLUMN_NAME}`) AS period, COUNT(1) AS period_count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY date_trunc('month', `{COLUMN_NAME}`) ) SELECT p.missing_period, p.prior_available_month, e.period_count AS prior_available_month_count, p.next_available_month, f.period_count AS next_available_month_count FROM ( SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_month, MIN(c.period) AS next_available_month FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ) p LEFT JOIN existing_periods e ON p.prior_available_month = e.period LEFT JOIN existing_periods f ON p.next_available_month = f.period ORDER BY p.missing_period LIMIT {LIMIT}; error_type: Test Results - id: '1157' test_id: '1023' @@ -183,7 +184,7 @@ test_types: FROM check_periods c LEFT JOIN data_by_period d ON (c.check_period = d.data_period) ) - SELECT check_period, record_ct, + SELECT TOP {LIMIT} check_period, record_ct, CASE WHEN record_ct = 0 THEN 'MISSING' ELSE 'Present' @@ -200,7 +201,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('month', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT (d.all_dates :: DATE + INTERVAL '1 month') :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('month', MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_month, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_month_count, MIN(c.period) AS next_available_month, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_month_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates; + WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('month', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT (d.all_dates :: DATE + INTERVAL '1 month') :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('month', MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_month, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_month_count, MIN(c.period) AS next_available_month, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_month_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN 
existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates LIMIT {LIMIT}; error_type: Test Results - id: '1018' test_id: '1023' @@ -208,7 +209,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('month', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT DATEADD(MONTH, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('month', MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_month, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_month_count, MIN(c.period) AS next_available_month, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_month_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates; + WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('month', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT DATEADD(MONTH, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('month', MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_month, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_month_count, MIN(c.period) AS next_available_month, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_month_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates LIMIT {LIMIT}; error_type: Test Results - id: '1418' test_id: '1023' @@ -216,7 +217,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('month', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT DATEADD(MONTH, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('month', MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_month, 
(SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_month_count, MIN(c.period) AS next_available_month, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_month_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates; + WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('month', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT DATEADD(MONTH, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('month', MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_month, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_month_count, MIN(c.period) AS next_available_month, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_month_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates LIMIT {LIMIT}; error_type: Test Results - id: '1214' test_id: '1023' @@ -224,6 +225,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('month', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT DATEADD(MONTH, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('month', MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS (SELECT DISTINCT DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE ) SELECT p.missing_period, p.prior_available_month, e.period_count as prior_available_month_count, p.next_available_month, f.period_count as next_available_month_count FROM (SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_month, MIN(c.period) AS next_available_month FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates) p LEFT JOIN existing_periods e ON (p.prior_available_month = e.period) LEFT JOIN existing_periods f ON (p.next_available_month = f.period) ORDER BY p.missing_period; + WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('month', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT DATEADD(MONTH, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('month', MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods 
AS (SELECT DISTINCT DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE ) SELECT p.missing_period, p.prior_available_month, e.period_count as prior_available_month_count, p.next_available_month, f.period_count as next_available_month_count FROM (SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_month, MIN(c.period) AS next_available_month FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates) p LEFT JOIN existing_periods e ON (p.prior_available_month = e.period) LEFT JOIN existing_periods f ON (p.next_available_month = f.period) ORDER BY p.missing_period LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Pattern_Match.yaml b/testgen/template/dbsetup_test_types/test_types_Pattern_Match.yaml index 03f123e..84d0052 100644 --- a/testgen/template/dbsetup_test_types/test_types_Pattern_Match.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Pattern_Match.yaml @@ -109,7 +109,8 @@ test_types: SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE NOT REGEXP_CONTAINS(NULLIF(CAST(`{COLUMN_NAME}` AS STRING), ''), r'{BASELINE_VALUE}') - GROUP BY `{COLUMN_NAME}`; + GROUP BY `{COLUMN_NAME}` + LIMIT {LIMIT}; error_type: Test Results - id: '1318' test_id: '1026' @@ -117,7 +118,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE REGEXP_LIKE(NULLIF(`{COLUMN_NAME}`::STRING, ''),'{BASELINE_VALUE}') != 1 GROUP BY `{COLUMN_NAME}`; + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE REGEXP_LIKE(NULLIF(`{COLUMN_NAME}`::STRING, ''),'{BASELINE_VALUE}') != 1 GROUP BY `{COLUMN_NAME}` LIMIT {LIMIT}; error_type: Test Results - id: '1160' test_id: '1026' @@ -125,7 +126,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT LIKE '{BASELINE_VALUE}' GROUP BY "{COLUMN_NAME}"; + SELECT DISTINCT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT LIKE '{BASELINE_VALUE}' GROUP BY "{COLUMN_NAME}"; error_type: Test Results - id: '1103' test_id: '1026' @@ -133,7 +134,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT SIMILAR TO '{BASELINE_VALUE}' GROUP BY "{COLUMN_NAME}"; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT SIMILAR TO '{BASELINE_VALUE}' GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1021' test_id: '1026' @@ -141,7 +142,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT SIMILAR TO '{BASELINE_VALUE}' GROUP BY "{COLUMN_NAME}"; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count 
FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT SIMILAR TO '{BASELINE_VALUE}' GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1421' test_id: '1026' @@ -149,7 +150,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT SIMILAR TO '{BASELINE_VALUE}' GROUP BY "{COLUMN_NAME}"; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT SIMILAR TO '{BASELINE_VALUE}' GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1217' test_id: '1026' @@ -157,6 +158,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE REGEXP_LIKE(NULLIF("{COLUMN_NAME}"::VARCHAR, ''),'{BASELINE_VALUE}') != 1 GROUP BY "{COLUMN_NAME}"; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE REGEXP_LIKE(NULLIF("{COLUMN_NAME}"::VARCHAR, ''),'{BASELINE_VALUE}') != 1 GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Recency.yaml b/testgen/template/dbsetup_test_types/test_types_Recency.yaml index 69aedb3..278eb9d 100644 --- a/testgen/template/dbsetup_test_types/test_types_Recency.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Recency.yaml @@ -109,7 +109,8 @@ test_types: lookup_query: |- SELECT DISTINCT col AS latest_date_available, CAST(CAST('{TEST_DATE}' AS DATETIME) AS {COLUMN_TYPE}) AS test_run_date FROM (SELECT DATE_TRUNC(MAX(`{COLUMN_NAME}`), DAY) AS col FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`) - WHERE DATETIME_DIFF(CAST(CAST('{TEST_DATE}' AS DATETIME) AS {COLUMN_TYPE}), col, DAY) > {THRESHOLD_VALUE}; + WHERE DATETIME_DIFF(CAST(CAST('{TEST_DATE}' AS DATETIME) AS {COLUMN_TYPE}), col, DAY) > {THRESHOLD_VALUE} + LIMIT {LIMIT}; error_type: Test Results - id: '1319' test_id: '1028' @@ -117,7 +118,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX(`{COLUMN_NAME}`) AS col FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`) WHERE ABS(<%DATEDIFF_DAY;col;'{TEST_DATE}'::DATE%>) > {THRESHOLD_VALUE}; + SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX(`{COLUMN_NAME}`) AS col FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`) WHERE ABS(<%DATEDIFF_DAY;col;'{TEST_DATE}'::DATE%>) > {THRESHOLD_VALUE} LIMIT {LIMIT}; error_type: Test Results - id: '1161' test_id: '1028' @@ -125,7 +126,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT DISTINCT col AS latest_date_available, CAST('{TEST_DATE}' AS DATE) AS test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE DATEDIFF(day, col, CAST('{TEST_DATE}' AS DATE)) > {THRESHOLD_VALUE}; + SELECT DISTINCT TOP {LIMIT} col AS latest_date_available, CAST('{TEST_DATE}' AS DATE) AS test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE DATEDIFF(day, col, CAST('{TEST_DATE}' AS DATE)) > {THRESHOLD_VALUE}; error_type: Test Results - id: '1104' test_id: '1028' @@ -133,7 +134,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT col AS latest_date_available, 
'{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE <%DATEDIFF_DAY;col;'{TEST_DATE}'::DATE%> > {THRESHOLD_VALUE}; + SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE <%DATEDIFF_DAY;col;'{TEST_DATE}'::DATE%> > {THRESHOLD_VALUE} LIMIT {LIMIT}; error_type: Test Results - id: '1022' test_id: '1028' @@ -141,7 +142,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") WHERE DATEDIFF('D', col, '{TEST_DATE}'::DATE) > {THRESHOLD_VALUE}; + SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") WHERE DATEDIFF('D', col, '{TEST_DATE}'::DATE) > {THRESHOLD_VALUE} LIMIT {LIMIT}; error_type: Test Results - id: '1422' test_id: '1028' @@ -149,7 +150,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") WHERE DATEDIFF('D', col, '{TEST_DATE}'::DATE) > {THRESHOLD_VALUE}; + SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") WHERE DATEDIFF('D', col, '{TEST_DATE}'::DATE) > {THRESHOLD_VALUE} LIMIT {LIMIT}; error_type: Test Results - id: '1218' test_id: '1028' @@ -157,6 +158,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") WHERE DATEDIFF('D', col, '{TEST_DATE}'::DATE) > {THRESHOLD_VALUE}; + SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") WHERE DATEDIFF('D', col, '{TEST_DATE}'::DATE) > {THRESHOLD_VALUE} LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Required.yaml b/testgen/template/dbsetup_test_types/test_types_Required.yaml index 1149fbb..ada30df 100644 --- a/testgen/template/dbsetup_test_types/test_types_Required.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Required.yaml @@ -108,7 +108,7 @@ test_types: SELECT * FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` IS NULL - LIMIT 500; + LIMIT {LIMIT}; error_type: Test Results - id: '1320' test_id: '1030' @@ -116,7 +116,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT * FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` IS NULL LIMIT 500; + SELECT * FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` IS NULL LIMIT {LIMIT}; error_type: Test Results - id: '1162' test_id: '1030' @@ -124,7 +124,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 500 * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL; + SELECT TOP {LIMIT} * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL; error_type: Test Results - id: '1105' test_id: '1030' @@ -132,7 +132,7 @@ test_types: sql_flavor: 
postgresql lookup_type: null lookup_query: |- - SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL LIMIT 500; + SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL LIMIT {LIMIT}; error_type: Test Results - id: '1023' test_id: '1030' @@ -140,7 +140,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL LIMIT 500; + SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL LIMIT {LIMIT}; error_type: Test Results - id: '1423' test_id: '1030' @@ -148,7 +148,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL LIMIT 500; + SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL LIMIT {LIMIT}; error_type: Test Results - id: '1219' test_id: '1030' @@ -156,6 +156,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL LIMIT 500; + SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Street_Addr_Pattern.yaml b/testgen/template/dbsetup_test_types/test_types_Street_Addr_Pattern.yaml index 759e5a3..0fb0a90 100644 --- a/testgen/template/dbsetup_test_types/test_types_Street_Addr_Pattern.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Street_Addr_Pattern.yaml @@ -112,7 +112,7 @@ test_types: WHERE NOT REGEXP_CONTAINS(CAST(`{COLUMN_NAME}` AS STRING), r'^[0-9]{1,5}[A-Za-z]?\s\w{1,5}\.?\s?\w*\s?\w*\s[A-Za-z]{1,6}\.?\s?[0-9]{0,5}[A-Z]{0,1}$') GROUP BY `{COLUMN_NAME}` ORDER BY COUNT(*) DESC - LIMIT 500; + LIMIT {LIMIT}; error_type: Test Results - id: '1323' test_id: '1033' @@ -120,7 +120,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE REGEXP_LIKE(`{COLUMN_NAME}`::STRING, '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$') != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 500; + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE REGEXP_LIKE(`{COLUMN_NAME}`::STRING, '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$') != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT {LIMIT}; error_type: Test Results - id: '1165' test_id: '1033' @@ -128,7 +128,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE UPPER("{COLUMN_NAME}") NOT LIKE '[1-9]% [A-Z]% %' AND CHARINDEX(' ', "{COLUMN_NAME}") NOT BETWEEN 2 AND 6 GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; + SELECT DISTINCT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE UPPER("{COLUMN_NAME}") NOT LIKE '[1-9]% [A-Z]% %' AND CHARINDEX(' ', "{COLUMN_NAME}") NOT BETWEEN 2 AND 6 GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; error_type: Test Results - id: '1108' test_id: '1033' @@ -136,7 +136,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" !~ 
'^[0-9]{1,5}[a-zA-Z]?\s\w{1,5}\.?\s?\w*\s?\w*\s[a-zA-Z]{1,6}\.?\s?[0-9]{0,5}[A-Z]{0,1}$' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" !~ '^[0-9]{1,5}[a-zA-Z]?\s\w{1,5}\.?\s?\w*\s?\w*\s[a-zA-Z]{1,6}\.?\s?[0-9]{0,5}[A-Z]{0,1}$' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Test Results - id: '1026' test_id: '1033' @@ -144,7 +144,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" !~ '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" !~ '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Test Results - id: '1426' test_id: '1033' @@ -152,7 +152,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" !~ '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" !~ '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Test Results - id: '1222' test_id: '1033' @@ -160,6 +160,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$') != 1 GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$') != 1 GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Gain.yaml b/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Gain.yaml index dd72a77..746913c 100644 --- a/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Gain.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Gain.yaml @@ -55,7 +55,8 @@ test_types: FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= DATE_SUB((SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`), INTERVAL {WINDOW_DAYS} DAY) - GROUP BY {COLUMN_NAME_NO_QUOTES}; + GROUP BY {COLUMN_NAME_NO_QUOTES} + LIMIT {LIMIT}; error_type: Test Results - id: '1263' test_id: '1508' @@ -63,7 +64,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT {COLUMN_NAME_NO_QUOTES} + SELECT TOP {LIMIT} {COLUMN_NAME_NO_QUOTES} FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {SUBSET_CONDITION} AND 
{WINDOW_DATE_COLUMN} >= DATEADD("day", - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}")) @@ -94,6 +95,7 @@ test_types: WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} GROUP BY {COLUMN_NAME_NO_QUOTES} + LIMIT {LIMIT}; error_type: Test Results - id: '1261' test_id: '1508' @@ -113,6 +115,7 @@ test_types: WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} GROUP BY {COLUMN_NAME_NO_QUOTES} + LIMIT {LIMIT}; error_type: Test Results - id: '1468' test_id: '1508' @@ -132,6 +135,7 @@ test_types: WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} GROUP BY {COLUMN_NAME_NO_QUOTES} + LIMIT {LIMIT}; error_type: Test Results - id: '1262' test_id: '1508' @@ -151,6 +155,7 @@ test_types: WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} GROUP BY {COLUMN_NAME_NO_QUOTES} + LIMIT {LIMIT}; error_type: Test Results test_templates: - id: '2507' diff --git a/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Match.yaml b/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Match.yaml index af62dff..8f6d936 100644 --- a/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Match.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Match.yaml @@ -53,6 +53,7 @@ test_types: WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= DATE_SUB((SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`), INTERVAL 2 * {WINDOW_DAYS} DAY) AND {WINDOW_DATE_COLUMN} < DATE_SUB((SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`), INTERVAL {WINDOW_DAYS} DAY) + LIMIT {LIMIT_2} ) UNION ALL ( @@ -66,7 +67,8 @@ test_types: FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= DATE_SUB((SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`), INTERVAL {WINDOW_DAYS} DAY) - ); + LIMIT {LIMIT_2} + ) error_type: Test Results - id: '1337' test_id: '1509' @@ -85,6 +87,7 @@ test_types: WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`) - 2 * {WINDOW_DAYS} AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`) - {WINDOW_DAYS} + LIMIT {LIMIT_2} ) UNION ALL ( @@ -98,6 +101,7 @@ test_types: FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`) - {WINDOW_DAYS} + LIMIT {LIMIT_2} ) error_type: Test Results - id: '1267' @@ -107,7 +111,7 @@ test_types: lookup_type: null lookup_query: |2- ( - SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + SELECT TOP {LIMIT_2} 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}")) @@ -120,7 +124,7 @@ test_types: ) UNION ALL ( - SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + SELECT TOP {LIMIT_2} 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {SUBSET_CONDITION} AND 
{WINDOW_DATE_COLUMN} >= DATEADD("day", - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}")) @@ -149,6 +153,7 @@ test_types: WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - 2 * {WINDOW_DAYS} AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + LIMIT {LIMIT_2} ) UNION ALL ( @@ -162,6 +167,7 @@ test_types: FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + LIMIT {LIMIT_2} ) error_type: Test Results - id: '1265' @@ -181,6 +187,7 @@ test_types: WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - 2 * {WINDOW_DAYS} AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + LIMIT {LIMIT_2} ) UNION ALL ( @@ -194,6 +201,7 @@ test_types: FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + LIMIT {LIMIT_2} ) error_type: Test Results - id: '1469' @@ -213,6 +221,7 @@ test_types: WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - 2 * {WINDOW_DAYS} AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + LIMIT {LIMIT_2} ) UNION ALL ( @@ -226,6 +235,7 @@ test_types: FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + LIMIT {LIMIT_2} ) error_type: Test Results - id: '1266' @@ -245,6 +255,7 @@ test_types: WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - 2 * {WINDOW_DAYS} AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + LIMIT {LIMIT_2} ) UNION ALL ( @@ -258,6 +269,7 @@ test_types: FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + LIMIT {LIMIT_2} ) error_type: Test Results test_templates: diff --git a/testgen/template/dbsetup_test_types/test_types_US_State.yaml b/testgen/template/dbsetup_test_types/test_types_US_State.yaml index f2d2299..c9d51c5 100644 --- a/testgen/template/dbsetup_test_types/test_types_US_State.yaml +++ b/testgen/template/dbsetup_test_types/test_types_US_State.yaml @@ -111,7 +111,7 @@ test_types: FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE NULLIF(`{COLUMN_NAME}`, '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY `{COLUMN_NAME}` - LIMIT 500; + LIMIT {LIMIT}; error_type: Test Results - id: '1324' test_id: '1036' @@ -119,7 +119,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE 
NULLIF(`{COLUMN_NAME}`, '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY `{COLUMN_NAME}` LIMIT 500; + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE NULLIF(`{COLUMN_NAME}`, '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY `{COLUMN_NAME}` LIMIT {LIMIT}; error_type: Test Results - id: '1166' test_id: '1036' @@ -127,7 +127,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}"; + SELECT DISTINCT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}"; error_type: Test Results - id: '1109' test_id: '1036' @@ -135,7 +135,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1027' test_id: '1036' @@ -143,7 +143,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN 
('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1427' test_id: '1036' @@ -151,7 +151,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1223' test_id: '1036' @@ -159,6 +159,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Unique.yaml b/testgen/template/dbsetup_test_types/test_types_Unique.yaml index c9cc6ca..61eabf8 100644 --- a/testgen/template/dbsetup_test_types/test_types_Unique.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Unique.yaml @@ -112,7 +112,7 @@ test_types: GROUP BY `{COLUMN_NAME}` HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC - LIMIT 500; + LIMIT {LIMIT}; error_type: Test Results - id: 
'1325' test_id: '1034' @@ -120,7 +120,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` HAVING count > 1 ORDER BY count DESC LIMIT 500; + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` HAVING count > 1 ORDER BY count DESC LIMIT {LIMIT}; error_type: Test Results - id: '1167' test_id: '1034' @@ -128,7 +128,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC; + SELECT DISTINCT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC; error_type: Test Results - id: '1110' test_id: '1034' @@ -136,7 +136,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Test Results - id: '1028' test_id: '1034' @@ -144,7 +144,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Test Results - id: '1428' test_id: '1034' @@ -152,7 +152,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Test Results - id: '1224' test_id: '1034' @@ -160,6 +160,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Unique_Pct.yaml b/testgen/template/dbsetup_test_types/test_types_Unique_Pct.yaml index 7665c97..374a4d5 100644 --- a/testgen/template/dbsetup_test_types/test_types_Unique_Pct.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Unique_Pct.yaml @@ -111,7 +111,7 @@ test_types: FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY COUNT(*) DESC - LIMIT 500; + LIMIT {LIMIT}; error_type: Test Results - id: '1326' test_id: '1035' @@ -119,7 +119,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT 
`{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 500; + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT {LIMIT}; error_type: Test Results - id: '1168' test_id: '1035' @@ -127,7 +127,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; + SELECT DISTINCT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; error_type: Test Results - id: '1111' test_id: '1035' @@ -135,7 +135,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Test Results - id: '1029' test_id: '1035' @@ -143,7 +143,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Test Results - id: '1429' test_id: '1035' @@ -151,7 +151,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Test Results - id: '1225' test_id: '1035' @@ -159,6 +159,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Valid_Characters.yaml b/testgen/template/dbsetup_test_types/test_types_Valid_Characters.yaml index fdef707..4d5f876 100644 --- a/testgen/template/dbsetup_test_types/test_types_Valid_Characters.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Valid_Characters.yaml @@ -115,7 +115,7 @@ test_types: OR CAST(`{COLUMN_NAME}` AS STRING) LIKE '"%' GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC - LIMIT 20; + LIMIT {LIMIT}; error_type: Test Results - id: '1330' test_id: '1043' @@ -123,7 +123,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE REGEXP_LIKE(`{COLUMN_NAME}`, '.*[[:cntrl:]].*') OR `{COLUMN_NAME}`::STRING LIKE ' %' OR `{COLUMN_NAME}`::STRING LIKE '''%''' OR `{COLUMN_NAME}`::STRING LIKE '"%"' GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT 20; + SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct 
FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE REGEXP_LIKE(`{COLUMN_NAME}`, '.*[[:cntrl:]].*') OR `{COLUMN_NAME}`::STRING LIKE ' %' OR `{COLUMN_NAME}`::STRING LIKE '''%''' OR `{COLUMN_NAME}`::STRING LIKE '"%"' GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Test Results - id: '1235' test_id: '1043' @@ -131,7 +131,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", NCHAR(160) || NCHAR(8203) || NCHAR(65279) || NCHAR(8239) || NCHAR(8201) || NCHAR(12288) || NCHAR(8204), 'XXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' ORDER BY record_ct DESC; + SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", NCHAR(160) || NCHAR(8203) || NCHAR(65279) || NCHAR(8239) || NCHAR(8201) || NCHAR(12288) || NCHAR(8204), 'XXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' ORDER BY record_ct DESC; error_type: Test Results - id: '1234' test_id: '1043' @@ -139,7 +139,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), 'XXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' ORDER BY record_ct DESC LIMIT 20; + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), 'XXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Test Results - id: '1233' test_id: '1043' @@ -147,7 +147,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), 'XXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' ORDER BY record_ct DESC; + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), 'XXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Test Results - id: '1459' test_id: '1043' @@ -155,7 +155,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), 'XXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' ORDER BY record_ct DESC; + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", 
CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), 'XXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Test Results - id: '1236' test_id: '1043' @@ -163,6 +163,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", CHAR(160) || CHAR(8203) || CHAR(65279) || CHAR(8239) || CHAR(8201) || CHAR(12288) || CHAR(8204), 'XXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' ORDER BY record_ct DESC; + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", CHAR(160) || CHAR(8203) || CHAR(65279) || CHAR(8239) || CHAR(8201) || CHAR(12288) || CHAR(8204), 'XXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip.yaml b/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip.yaml index 6e8929c..6c08cc7 100644 --- a/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip.yaml @@ -110,7 +110,7 @@ test_types: WHERE TRANSLATE(CAST(`{COLUMN_NAME}` AS STRING), '012345678', '999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC - LIMIT 20; + LIMIT {LIMIT}; error_type: Test Results - id: '1331' test_id: '1044' @@ -118,7 +118,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE TRANSLATE(`{COLUMN_NAME}`,'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT 20; + SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE TRANSLATE(`{COLUMN_NAME}`,'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Test Results - id: '1239' test_id: '1044' @@ -126,7 +126,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; + SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; error_type: Test Results - id: '1238' test_id: '1044' @@ -134,7 +134,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT 20; + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', 
'999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Test Results - id: '1237' test_id: '1044' @@ -142,7 +142,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Test Results - id: '1460' test_id: '1044' @@ -150,7 +150,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Test Results - id: '1240' test_id: '1044' @@ -158,6 +158,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip3.yaml b/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip3.yaml index acba07f..ab616fd 100644 --- a/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip3.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip3.yaml @@ -111,7 +111,7 @@ test_types: WHERE TRANSLATE(CAST(`{COLUMN_NAME}` AS STRING), '012345678', '999999999') != '999' GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC - LIMIT 20; + LIMIT {LIMIT}; error_type: Test Results - id: '1332' test_id: '1045' @@ -119,7 +119,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE TRANSLATE(`{COLUMN_NAME}`,'012345678','999999999') <> '999' GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT 20; + SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE TRANSLATE(`{COLUMN_NAME}`,'012345678','999999999') <> '999' GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Test Results - id: '1243' test_id: '1045' @@ -127,7 +127,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; + SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS record_ct FROM 
"{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; error_type: Test Results - id: '1242' test_id: '1045' @@ -135,7 +135,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT 20; + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Test Results - id: '1241' test_id: '1045' @@ -143,7 +143,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Test Results - id: '1461' test_id: '1045' @@ -151,7 +151,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Test Results - id: '1244' test_id: '1045' @@ -159,6 +159,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml index c774e4d..8217f3a 100644 --- a/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml @@ -136,7 +136,8 @@ test_types: FROM p LEFT JOIN existing_periods e ON (p.prior_available_week = e.period) LEFT JOIN existing_periods f ON (p.next_available_week = f.period) - ORDER BY p.missing_period; + ORDER BY p.missing_period + LIMIT {LIMIT}; error_type: Test Results - id: '1327' test_id: '1037' @@ -144,7 +145,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - WITH daterange AS( SELECT explode(sequence( date_trunc('week', (SELECT min(`{COLUMN_NAME}`) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`)), date_trunc('week', (SELECT max(`{COLUMN_NAME}`) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`)), interval 1 week)) AS all_dates ), existing_periods AS ( SELECT DISTINCT date_trunc('week', `{COLUMN_NAME}`) AS period, COUNT(1) AS 
period_count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY date_trunc('week', `{COLUMN_NAME}`) ) SELECT p.missing_period, p.prior_available_week, e.period_count AS prior_available_week_count, p.next_available_week, f.period_count AS next_available_week_count FROM ( SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_week, MIN(c.period) AS next_available_week FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ) p LEFT JOIN existing_periods e ON p.prior_available_week = e.period LEFT JOIN existing_periods f ON p.next_available_week = f.period ORDER BY p.missing_period; + WITH daterange AS( SELECT explode(sequence( date_trunc('week', (SELECT min(`{COLUMN_NAME}`) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`)), date_trunc('week', (SELECT max(`{COLUMN_NAME}`) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`)), interval 1 week)) AS all_dates ), existing_periods AS ( SELECT DISTINCT date_trunc('week', `{COLUMN_NAME}`) AS period, COUNT(1) AS period_count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY date_trunc('week', `{COLUMN_NAME}`) ) SELECT p.missing_period, p.prior_available_week, e.period_count AS prior_available_week_count, p.next_available_week, f.period_count AS next_available_week_count FROM ( SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_week, MIN(c.period) AS next_available_week FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ) p LEFT JOIN existing_periods e ON p.prior_available_week = e.period LEFT JOIN existing_periods f ON p.next_available_week = f.period ORDER BY p.missing_period LIMIT {LIMIT}; error_type: Test Results - id: '1169' test_id: '1037' @@ -183,7 +184,7 @@ test_types: FROM check_periods c LEFT JOIN data_by_period d ON (c.check_period = d.data_period) ) - SELECT check_period, record_ct, + SELECT TOP {LIMIT} check_period, record_ct, CASE WHEN record_ct = 0 THEN 'MISSING' ELSE 'Present' @@ -200,7 +201,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('week', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT (d.all_dates + INTERVAL '1 week' ) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('week' , MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS (SELECT DISTINCT DATE_TRUNC('week', "{COLUMN_NAME}") :: DATE AS period, COUNT(1) as period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('week', "{COLUMN_NAME}") :: DATE) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_week, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_week_count, MIN(c.period) AS next_available_week, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_week_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY 
d.all_dates ORDER BY d.all_dates; + WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('week', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT (d.all_dates + INTERVAL '1 week' ) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('week' , MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS (SELECT DISTINCT DATE_TRUNC('week', "{COLUMN_NAME}") :: DATE AS period, COUNT(1) as period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('week', "{COLUMN_NAME}") :: DATE) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_week, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_week_count, MIN(c.period) AS next_available_week, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_week_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates LIMIT {LIMIT}; error_type: Test Results - id: '1030' test_id: '1037' @@ -208,7 +209,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('week',MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT (d.all_dates + INTERVAL '1 week' ) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('week', MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) as period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_week, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_week_count, MIN(c.period) AS next_available_week, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_week_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates; + WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('week',MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT (d.all_dates + INTERVAL '1 week' ) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('week', MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) as period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_week, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_week_count, MIN(c.period) AS next_available_week, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_week_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON 
b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates LIMIT {LIMIT}; error_type: Test Results - id: '1430' test_id: '1037' @@ -216,7 +217,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('week',MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT (d.all_dates + INTERVAL '1 week' ) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('week', MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) as period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_week, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_week_count, MIN(c.period) AS next_available_week, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_week_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates; + WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('week',MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT (d.all_dates + INTERVAL '1 week' ) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('week', MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) as period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_week, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_week_count, MIN(c.period) AS next_available_week, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_week_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates LIMIT {LIMIT}; error_type: Test Results - id: '1226' test_id: '1037' @@ -224,6 +225,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('week',MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT (d.all_dates + INTERVAL '1 week' ) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('week', MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) as period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE ) SELECT p.missing_period, p.prior_available_week, e.period_count as prior_available_week_count, 
p.next_available_week, f.period_count as next_available_week_count FROM( SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_week, MIN(c.period) AS next_available_week FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ) p LEFT JOIN existing_periods e ON (p.prior_available_week = e.period) LEFT JOIN existing_periods f ON (p.next_available_week = f.period) ORDER BY p.missing_period; + WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('week',MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT (d.all_dates + INTERVAL '1 week' ) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('week', MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) as period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE ) SELECT p.missing_period, p.prior_available_week, e.period_count as prior_available_week_count, p.next_available_week, f.period_count as next_available_week_count FROM( SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_week, MIN(c.period) AS next_available_week FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ) p LEFT JOIN existing_periods e ON (p.prior_available_week = e.period) LEFT JOIN existing_periods f ON (p.next_available_week = f.period) ORDER BY p.missing_period LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/ui/queries/source_data_queries.py b/testgen/ui/queries/source_data_queries.py index 467297c..4934862 100644 --- a/testgen/ui/queries/source_data_queries.py +++ b/testgen/ui/queries/source_data_queries.py @@ -14,9 +14,10 @@ from testgen.utils import to_dataframe LOG = logging.getLogger("testgen") +DEFAULT_LIMIT = 500 -def get_hygiene_issue_source_query(issue_data: dict) -> str: +def get_hygiene_issue_source_query(issue_data: dict, limit: int = DEFAULT_LIMIT) -> str: def generate_lookup_query(test_id: str, detail_exp: str, column_names: list[str], sql_flavor: SQLFlavor) -> str: if test_id in {"1019", "1020"}: start_index = detail_exp.find("Columns: ") @@ -62,6 +63,9 @@ def generate_lookup_query(test_id: str, detail_exp: str, column_names: list[str] "COLUMN_NAME": issue_data["column_name"], "DETAIL_EXPRESSION": issue_data["detail"], "PROFILE_RUN_DATE": issue_data["profiling_starttime"], + "LIMIT": limit, + "LIMIT_2": int(limit/2), + "LIMIT_4": int(limit/4), } lookup_query = replace_params(lookup_query, params) @@ -72,10 +76,11 @@ def generate_lookup_query(test_id: str, detail_exp: str, column_names: list[str] @st.cache_data(show_spinner=False) def get_hygiene_issue_source_data( issue_data: dict, - limit: int | None = None, + limit: int = DEFAULT_LIMIT, ) -> tuple[Literal["OK"], None, str, pd.DataFrame] | tuple[Literal["NA", "ND", "ERR"], str, str | None, None]: + lookup_query = None try: - lookup_query = get_hygiene_issue_source_query(issue_data) + lookup_query = get_hygiene_issue_source_query(issue_data, limit) if not lookup_query: return 
"NA", "Source data lookup is not available for this hygiene issue.", None, None @@ -99,7 +104,7 @@ def get_hygiene_issue_source_data( return "ERR", f"Source data lookup encountered an error:\n\n{e.args[0]}", lookup_query, None -def get_test_issue_source_query(issue_data: dict) -> str: +def get_test_issue_source_query(issue_data: dict, limit: int = DEFAULT_LIMIT) -> str: lookup_data = _get_lookup_data(issue_data["table_groups_id"], issue_data["test_type_id"], "Test Results") if not lookup_data or not lookup_data.lookup_query: return None @@ -136,6 +141,9 @@ def get_test_issue_source_query(issue_data: dict) -> str: "WINDOW_DAYS": test_definition.window_days, "CONCAT_COLUMNS": concat_columns(issue_data["column_names"], ""), "CONCAT_MATCH_GROUPBY": concat_columns(test_definition.match_groupby_names, ""), + "LIMIT": limit, + "LIMIT_2": int(limit/2), + "LIMIT_4": int(limit/4), } lookup_query = replace_params(lookup_data.lookup_query, params) @@ -146,14 +154,15 @@ def get_test_issue_source_query(issue_data: dict) -> str: @st.cache_data(show_spinner=False) def get_test_issue_source_data( issue_data: dict, - limit: int | None = None, + limit: int = DEFAULT_LIMIT, ) -> tuple[Literal["OK"], None, str, pd.DataFrame] | tuple[Literal["NA", "ND", "ERR"], str, str | None, None]: + lookup_query = None try: test_definition = TestDefinition.get(issue_data["test_definition_id_current"]) if not test_definition: return "NA", "Test definition no longer exists.", None, None - lookup_query = get_test_issue_source_query(issue_data) + lookup_query = get_test_issue_source_query(issue_data, limit) if not lookup_query: return "NA", "Source data lookup is not available for this test.", None, None From 2842a4c20af26f188ddc4795ca636af813c68d85 Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Thu, 6 Nov 2025 15:08:29 -0500 Subject: [PATCH 14/28] feat(notifications): Adding basic email sending functionality --- pyproject.toml | 1 + testgen/common/email.py | 90 +++++++++++++++++++++++++++++++++ testgen/settings.py | 33 ++++++++++-- tests/unit/test_common_email.py | 68 +++++++++++++++++++++++++ 4 files changed, 188 insertions(+), 4 deletions(-) create mode 100644 testgen/common/email.py create mode 100644 tests/unit/test_common_email.py diff --git a/pyproject.toml b/pyproject.toml index 37d1e0f..0e24df5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,6 +60,7 @@ dependencies = [ "streamlit-pydantic==0.6.0", "cron-converter==1.2.1", "cron-descriptor==2.0.5", + "pybars3==0.9.7", # Pinned to match the manually compiled libs or for security "pyarrow==18.1.0", diff --git a/testgen/common/email.py b/testgen/common/email.py new file mode 100644 index 0000000..1579ce0 --- /dev/null +++ b/testgen/common/email.py @@ -0,0 +1,90 @@ +import logging +import smtplib +import ssl +from collections.abc import Mapping +from email.mime.multipart import MIMEMultipart +from email.mime.text import MIMEText + +from pybars import Compiler + +from testgen import settings + +LOG = logging.getLogger(__name__) + +MANDATORY_SETTINGS = ( + "EMAIL_FROM_ADDRESS", + "SMTP_ENDPOINT", + "SMTP_PORT", + "SMTP_USERNAME", + "SMTP_PASSWORD", +) + + +class EmailTemplateException(Exception): + pass + + +class BaseEmailTemplate: + + def __init__(self): + compiler = Compiler() + self.compiled_subject = compiler.compile(self.get_subject_template()) + self.compiled_body = compiler.compile(self.get_body_template()) + + def validate_settings(self): + missing_settings = [ + f"TG_{setting_name}" + for setting_name in MANDATORY_SETTINGS + if getattr(settings, 
setting_name) is None + ] + + if missing_settings: + LOG.error( + "Template '%s' can not send emails because the following settings are missing: %s", + self.__class__.__name__, + ", ".join(missing_settings), + ) + + raise EmailTemplateException("Invalid or insufficient email/SMTP settings") + + def get_subject_template(self) -> str: + raise NotImplementedError + + def get_body_template(self) -> str: + raise NotImplementedError + + def get_message(self, recipients: list[str], context: Mapping | None) -> MIMEMultipart: + subject = self.compiled_subject(context) + body = self.compiled_body(context) + + message = MIMEMultipart("alternative") + message["Subject"] = subject + message["To"] = ", ".join(recipients) + message["From"] = settings.EMAIL_FROM_ADDRESS + message.attach(MIMEText(body, "html")) + return message + + def send_mime_message(self, recipients: list[str], message: MIMEMultipart) -> dict: + ssl_context = ssl.SSLContext(ssl.PROTOCOL_SSLv23) + try: + with smtplib.SMTP_SSL(settings.SMTP_ENDPOINT, settings.SMTP_PORT, context=ssl_context) as smtp_server: + smtp_server.login(settings.SMTP_USERNAME, settings.SMTP_PASSWORD) + response = smtp_server.sendmail(settings.EMAIL_FROM_ADDRESS, recipients, message.as_string()) + except Exception as e: + LOG.error("Template '%s' failed to send email with: %s", self.__class__.__name__, e) # noqa: TRY400 + else: + return response + + def send(self, recipients: list[str], context: Mapping | None) -> dict: + self.validate_settings() + mime_message = self.get_message(recipients, context) + response = self.send_mime_message(recipients, mime_message) + + LOG.info( + "Template '%s' successfully sent email to %d recipients -- %d failed.", + self.__class__.__name__, + len(recipients) - len(response), + len(response) + ) + + return response diff --git a/testgen/settings.py b/testgen/settings.py index 07f044f..cf71768 100644 --- a/testgen/settings.py +++ b/testgen/settings.py @@ -374,7 +374,7 @@ OBSERVABILITY_VERIFY_SSL: bool = os.getenv("TG_EXPORT_TO_OBSERVABILITY_VERIFY_SSL", "yes").lower() in ["yes", "true"] """ -When False, exporting events to your instance of Observabilty will skip +When False, exporting events to your instance of Observability will skip SSL verification. from env variable: `TG_EXPORT_TO_OBSERVABILITY_VERIFY_SSL` @@ -383,7 +383,7 @@ OBSERVABILITY_EXPORT_LIMIT: int = int(os.getenv("TG_OBSERVABILITY_EXPORT_MAX_QTY", "5000")) """ -When exporting to your instance of Observabilty, the maximum number of +When exporting to your instance of Observability, the maximum number of events that will be sent to the events API on a single export. from env variable: `TG_OBSERVABILITY_EXPORT_MAX_QTY` @@ -392,7 +392,7 @@ OBSERVABILITY_DEFAULT_COMPONENT_TYPE: str = os.getenv("OBSERVABILITY_DEFAULT_COMPONENT_TYPE", "dataset") """ -When exporting to your instance of Observabilty, the type of event that +When exporting to your instance of Observability, the type of event that will be sent to the events API. from env variable: `OBSERVABILITY_DEFAULT_COMPONENT_TYPE` @@ -401,7 +401,7 @@ OBSERVABILITY_DEFAULT_COMPONENT_KEY: str = os.getenv("OBSERVABILITY_DEFAULT_COMPONENT_KEY", "default") """ -When exporting to your instance of Observabilty, the key sent to the +When exporting to your instance of Observability, the key sent to the events API to identify the components. from env variable: `OBSERVABILITY_DEFAULT_COMPONENT_KEY` @@ -475,3 +475,28 @@ """ Limit the number of records used to generate the PDF with test results and hygiene issue reports. 
""" + +EMAIL_FROM_ADDRESS: str | None = os.getenv("TG_EMAIL_FROM_ADDRESS") +""" +Email: Sender address +""" + +SMTP_ENDPOINT: str | None = os.getenv("TG_SMTP_ENDPOINT") +""" +Email: SMTP endpoint +""" + +SMTP_PORT: int | None = int(os.getenv("TG_SMTP_PORT", 0)) or None +""" +Email: SMTP port +""" + +SMTP_USERNAME: str | None = os.getenv("TG_SMTP_USERNAME") +""" +Email: SMTP username +""" + +SMTP_PASSWORD: str | None = os.getenv("TG_SMTP_PASSWORD") +""" +Email: SMTP password +""" diff --git a/tests/unit/test_common_email.py b/tests/unit/test_common_email.py new file mode 100644 index 0000000..f0e94ce --- /dev/null +++ b/tests/unit/test_common_email.py @@ -0,0 +1,68 @@ +from unittest.mock import ANY, call, patch + +import pytest + +from testgen.common.email import BaseEmailTemplate, EmailTemplateException + + +class TestEmailTemplate(BaseEmailTemplate): + + def get_subject_template(self) -> str: + return "{{project}}: Test execution finished" + + def get_body_template(self) -> str: + return "

DataKitchen TestGen
Hi, {{user}}!
" + + +@pytest.fixture +def smtp_mock(): + with patch("testgen.common.email.smtplib.SMTP_SSL") as mock: + yield mock + + +@pytest.fixture +def def_settings(): + with patch("testgen.common.email.settings") as mock: + mock.EMAIL_FROM_ADDRESS = "from@email" + mock.SMTP_ENDPOINT = "smtp-endpoint" + mock.SMTP_PORT = 333 + mock.SMTP_USERNAME = "smtp-user" + mock.SMTP_PASSWORD = "smtp-pass" # noqa: S105 + yield mock + + +@pytest.fixture +def template(smtp_mock, def_settings): + yield TestEmailTemplate() + + +@pytest.fixture +def send_args(): + return ["test@data.kitchen"], {"project": "Test Project", "user": "Test user"} + + +def test_send_email(smtp_mock, template, send_args, def_settings): + template.send(*send_args) + + smtp_mock.assert_has_calls( + [ + call("smtp-endpoint", 333, context=ANY), + call().__enter__().login("smtp-user", "smtp-pass"), + call().__enter__().sendmail("from@email", ["test@data.kitchen"], ANY) + ], + any_order=True, + ) + email_body = smtp_mock().__enter__().sendmail.call_args_list[0][0][2] + assert "

DataKitchen TestGen
" in email_body + assert "Subject: Test Project: Test execution finished" in email_body + assert "

Hi, Test user!
" in email_body + + +@pytest.mark.parametrize( + "missing", + ("EMAIL_FROM_ADDRESS", "SMTP_ENDPOINT", "SMTP_PORT", "SMTP_USERNAME", "SMTP_PASSWORD") +) +def test_settings_validation(missing, template, def_settings, send_args): + setattr(def_settings, missing, None) + with pytest.raises(EmailTemplateException, match="Invalid or insufficient email/SMTP settings"): + template.send(*send_args) From 5be70ce1f90b1a0998b0e163243b0aa2c4776295 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Wed, 12 Nov 2025 18:30:00 -0500 Subject: [PATCH 15/28] feat(test-definitions): add sort dropdown --- testgen/ui/views/test_definitions.py | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/testgen/ui/views/test_definitions.py b/testgen/ui/views/test_definitions.py index e5a3fb1..1f02a90 100644 --- a/testgen/ui/views/test_definitions.py +++ b/testgen/ui/views/test_definitions.py @@ -6,7 +6,7 @@ import pandas as pd import streamlit as st -from sqlalchemy import and_, asc, func, or_, tuple_ +from sqlalchemy import and_, asc, desc, func, or_, tuple_ from streamlit.delta_generator import DeltaGenerator from streamlit_extras.no_default_selectbox import selectbox @@ -74,7 +74,7 @@ def render( ], ) - table_filter_column, column_filter_column, test_filter_column, table_actions_column = st.columns([.3, .3, .3, .4], vertical_alignment="bottom") + table_filter_column, column_filter_column, test_filter_column, sort_column, table_actions_column = st.columns([.2, .2, .2, .1, .25], vertical_alignment="bottom") testgen.flex_row_end(table_actions_column) actions_column, disposition_column = st.columns([.5, .5]) @@ -123,6 +123,15 @@ def render( label="Test Type", ) + with sort_column: + sortable_columns = ( + ("Table", "table_name"), + ("Column", "column_name"), + ("Test Type", "test_type"), + ) + default = [(sortable_columns[i][1], "ASC") for i in (0, 1, 2)] + sorting_columns = testgen.sorting_selector(sortable_columns, default) + if user_can_disposition: with disposition_column: multi_select = st.toggle("Multi-Select", help="Toggle on to perform actions on multiple test definitions") @@ -142,7 +151,7 @@ def render( with st.container(): with st.spinner("Loading data ..."): - df = get_test_definitions(test_suite, table_name, column_name, test_type) + df = get_test_definitions(test_suite, table_name, column_name, test_type, sorting_columns) selected, selected_test_def = render_grid(df, multi_select, filters_changed) @@ -1147,6 +1156,7 @@ def get_test_definitions( table_name: str | None = None, column_name: str | None = None, test_type: str | None = None, + sorting_columns: list[str] | None = None, ) -> pd.DataFrame: clauses = [TestDefinition.test_suite_id == test_suite.id] if table_name: @@ -1155,7 +1165,15 @@ def get_test_definitions( clauses.append(TestDefinition.column_name.ilike(column_name)) if test_type: clauses.append(TestDefinition.test_type == test_type) - test_definitions = TestDefinition.select_where(*clauses) + + sort_funcs = {"ASC": asc, "DESC": desc} + test_definitions = TestDefinition.select_where( + *clauses, + order_by=tuple([ + sort_funcs[direction](func.lower(getattr(TestDefinition, attribute))) + for (attribute, direction) in sorting_columns + ]) if sorting_columns else None, + ) df = to_dataframe(test_definitions, TestDefinitionSummary.columns()) date_service.accommodate_dataframe_to_timezone(df, st.session_state) From c93d08b54e1290e18125cfe397c4b83ab5f037d1 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Wed, 12 Nov 2025 18:30:26 -0500 Subject: [PATCH 16/28] 
fix(grid): initial selection not working --- testgen/ui/services/form_service.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/testgen/ui/services/form_service.py b/testgen/ui/services/form_service.py index 2d9e99f..70e8f75 100644 --- a/testgen/ui/services/form_service.py +++ b/testgen/ui/services/form_service.py @@ -261,7 +261,11 @@ def render_grid_select( selected_column, paginator_column = st.columns([.5, .5]) with paginator_column: def on_page_change(): - st.session_state[f"{key}_page_change"] = True + # Ignore the on_change event fired during paginator initialization + if st.session_state.get(f"{key}_paginator_loaded", False): + st.session_state[f"{key}_page_change"] = True + else: + st.session_state[f"{key}_paginator_loaded"] = True page_index = testgen.paginator( count=len(df), From d173535dca8b8db72fb71e52cbd6267a377179c8 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Thu, 20 Nov 2025 12:04:07 -0500 Subject: [PATCH 17/28] fix(test): improve table freshness test to avoid overflow --- .../gen_table_changed_test.sql | 9 +- .../gen_table_changed_test.sql | 11 +- .../gen_table_changed_test.sql | 37 +++-- .../gen_table_changed_test.sql | 157 ------------------ .../gen_table_changed_test.sql | 7 +- 5 files changed, 45 insertions(+), 176 deletions(-) delete mode 100644 testgen/template/flavors/postgresql/gen_query_tests/gen_table_changed_test.sql diff --git a/testgen/template/flavors/bigquery/gen_query_tests/gen_table_changed_test.sql b/testgen/template/flavors/bigquery/gen_query_tests/gen_table_changed_test.sql index da6811b..23c60db 100644 --- a/testgen/template/flavors/bigquery/gen_query_tests/gen_table_changed_test.sql +++ b/testgen/template/flavors/bigquery/gen_query_tests/gen_table_changed_test.sql @@ -124,7 +124,14 @@ newtests AS ( WHEN general_type = 'A' THEN 'CAST(MIN(@@@) AS STRING) || "|" || CAST(MAX(@@@) AS STRING) || "|" || CAST(COUNT(DISTINCT @@@) AS STRING) || "|" || CAST(SUM(LENGTH(@@@)) AS STRING)' WHEN general_type = 'N' THEN - 'CAST(MIN(@@@) AS STRING) || "|" || CAST(MAX(@@@) AS STRING) || "|" || CAST(SUM(@@@) AS STRING) || "|" || CAST(ROUND(AVG(@@@), 5) AS STRING) || "|" || CAST(ROUND(STDDEV(CAST(@@@ AS FLOAT64)), 5) AS STRING)' + 'ARRAY_TO_STRING([ + CAST(COUNT(@@@) AS STRING), + CAST(COUNT(DISTINCT MOD(CAST(COALESCE(@@@,0) AS NUMERIC) * 1000000, CAST(1000003 AS NUMERIC))) AS STRING), + COALESCE(CAST(ROUND(MIN(CAST(@@@ AS NUMERIC)), 6) AS STRING), ''''), + COALESCE(CAST(ROUND(MAX(CAST(@@@ AS NUMERIC)), 6) AS STRING), ''''), + CAST(MOD(COALESCE(SUM(MOD(CAST(ABS(COALESCE(@@@,0)) AS NUMERIC) * 1000000, CAST(1000000007 AS NUMERIC))), CAST(0 AS NUMERIC)), CAST(1000000007 AS NUMERIC)) AS STRING), + CAST(MOD(COALESCE(SUM(MOD(CAST(ABS(COALESCE(@@@,0)) AS NUMERIC) * 1000000, CAST(1000000009 AS NUMERIC))), CAST(0 AS NUMERIC)), CAST(1000000009 AS NUMERIC)) AS STRING) + ], ''|'', '''')' END, '@@@', '`' || column_name || '`'), ' || "|" || ' diff --git a/testgen/template/flavors/databricks/gen_query_tests/gen_table_changed_test.sql b/testgen/template/flavors/databricks/gen_query_tests/gen_table_changed_test.sql index 1c6521b..17e085d 100644 --- a/testgen/template/flavors/databricks/gen_query_tests/gen_table_changed_test.sql +++ b/testgen/template/flavors/databricks/gen_query_tests/gen_table_changed_test.sql @@ -121,9 +121,16 @@ newtests CASE WHEN general_type = 'D' THEN 'MIN(@@@)::STRING || ''|'' || MAX(@@@::STRING) || ''|'' || COUNT(DISTINCT @@@)::STRING' WHEN general_type = 'A' THEN 'MIN(@@@)::STRING || ''|'' || MAX(@@@::STRING) || ''|'' || 
COUNT(DISTINCT @@@)::STRING || ''|'' || SUM(LENGTH(@@@))::STRING' - WHEN general_type = 'N' THEN 'MIN(@@@)::STRING || ''|'' || MAX(@@@::STRING) || ''|'' || SUM(@@@)::STRING || ''|'' || ROUND(AVG(@@@), 5)::STRING || ''|'' || ROUND(STDDEV(@@@::FLOAT), 5)::STRING' + WHEN general_type = 'N' THEN 'CONCAT_WS(''|'', + COUNT(@@@)::STRING, + COUNT(DISTINCT MOD((COALESCE(@@@,0)::DECIMAL(38,6) * 1000000)::DECIMAL(38,0), 1000003))::STRING, + COALESCE((MIN(@@@)::DECIMAL(38,6))::STRING, ''''), + COALESCE((MAX(@@@)::DECIMAL(38,6))::STRING, ''''), + COALESCE(MOD(COALESCE(SUM(MOD((ABS(COALESCE(@@@,0))::DECIMAL(38,6) * 1000000)::DECIMAL, 1000000007)), 0), 1000000007)::STRING, ''''), + COALESCE(MOD(COALESCE(SUM(MOD((ABS(COALESCE(@@@,0))::DECIMAL(38,6) * 1000000)::DECIMAL, 1000000009)), 0), 1000000009)::STRING, '''') + )' END, - '@@@', '"' || column_name || '"'), + '@@@', '`' || column_name || '`'), ' || ''|'' || ' ORDER BY element_type, fingerprint_order, column_name) as fingerprint FROM combined diff --git a/testgen/template/flavors/mssql/gen_query_tests/gen_table_changed_test.sql b/testgen/template/flavors/mssql/gen_query_tests/gen_table_changed_test.sql index 3f8be00..d352848 100644 --- a/testgen/template/flavors/mssql/gen_query_tests/gen_table_changed_test.sql +++ b/testgen/template/flavors/mssql/gen_query_tests/gen_table_changed_test.sql @@ -14,8 +14,9 @@ WITH last_run AS (SELECT r.table_groups_id, MAX(run_date) AS last_run_date AND ts.id = '{TEST_SUITE_ID}' AND p.run_date::DATE <= '{AS_OF_DATE}' GROUP BY r.table_groups_id), -curprof AS (SELECT p.profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, column_type, - distinct_value_ct, record_ct, max_value, min_value, avg_value, stdev_value, null_value_ct +curprof AS (SELECT p.profile_run_id, p.schema_name, p.table_name, p.column_name, p.functional_data_type, + p.general_type, p.distinct_value_ct, p.record_ct, p.max_value, p.min_value, + p.avg_value, p.stdev_value, p.null_value_ct FROM last_run lr INNER JOIN profile_results p ON (lr.table_groups_id = p.table_groups_id @@ -28,7 +29,7 @@ locked AS (SELECT schema_name, table_name AND lock_refresh = 'Y'), -- IDs - TOP 2 id_cols - AS ( SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, column_type, + AS ( SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, distinct_value_ct, ROW_NUMBER() OVER (PARTITION BY schema_name, table_name ORDER BY @@ -42,7 +43,7 @@ id_cols AND functional_data_type ILIKE 'ID%'), -- Process Date - TOP 1 process_date_cols - AS (SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, column_type, + AS (SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, distinct_value_ct, ROW_NUMBER() OVER (PARTITION BY schema_name, table_name ORDER BY @@ -57,7 +58,7 @@ process_date_cols AND functional_data_type ILIKE 'process%'), -- Transaction Date - TOP 1 tran_date_cols - AS ( SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, column_type, + AS ( SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, distinct_value_ct, ROW_NUMBER() OVER (PARTITION BY schema_name, table_name ORDER BY @@ -70,9 +71,9 @@ tran_date_cols -- Numeric Measures numeric_cols - AS ( SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, column_type, + AS ( SELECT profile_run_id, schema_name, 
table_name, column_name, functional_data_type, general_type, /* - -- Subscores + -- Subscores -- save for reference distinct_value_ct * 1.0 / NULLIF(record_ct, 0) AS cardinality_score, (max_value - min_value) / NULLIF(ABS(NULLIF(avg_value, 0)), 1) AS range_score, LEAST(1, LOG(GREATEST(distinct_value_ct, 2))) / LOG(GREATEST(record_ct, 2)) AS nontriviality_score, @@ -98,19 +99,19 @@ numeric_cols_ranked FROM numeric_cols WHERE change_detection_score IS NOT NULL), combined - AS ( SELECT profile_run_id, schema_name, table_name, column_name, 'ID' AS element_type, general_type, column_type, 10 + rank AS fingerprint_order + AS ( SELECT profile_run_id, schema_name, table_name, column_name, 'ID' AS element_type, general_type, 10 + rank AS fingerprint_order FROM id_cols WHERE rank <= 2 UNION ALL - SELECT profile_run_id, schema_name, table_name, column_name, 'DATE_P' AS element_type, general_type, column_type, 20 + rank AS fingerprint_order + SELECT profile_run_id, schema_name, table_name, column_name, 'DATE_P' AS element_type, general_type, 20 + rank AS fingerprint_order FROM process_date_cols WHERE rank = 1 UNION ALL - SELECT profile_run_id, schema_name, table_name, column_name, 'DATE_T' AS element_type, general_type, column_type, 30 + rank AS fingerprint_order + SELECT profile_run_id, schema_name, table_name, column_name, 'DATE_T' AS element_type, general_type, 30 + rank AS fingerprint_order FROM tran_date_cols WHERE rank = 1 UNION ALL - SELECT profile_run_id, schema_name, table_name, column_name, 'MEAS' AS element_type, general_type, column_type, 40 + rank AS fingerprint_order + SELECT profile_run_id, schema_name, table_name, column_name, 'MEAS' AS element_type, general_type, 40 + rank AS fingerprint_order FROM numeric_cols_ranked WHERE rank = 1 ), newtests AS ( @@ -121,10 +122,16 @@ newtests AS ( 'CAST(COUNT(*) AS varchar) + ''|'' + ' || STRING_AGG( REPLACE( CASE - WHEN general_type = 'D' THEN 'CAST(MIN(@@@) AS NVARCHAR) + ''|'' + MAX(CAST(@@@ AS NVARCHAR)) + ''|'' + CAST(COUNT(DISTINCT @@@) AS NVARCHAR)' - WHEN general_type = 'A' THEN 'CAST(MIN(@@@) AS NVARCHAR) + ''|'' + MAX(CAST(@@@ AS NVARCHAR)) + ''|'' + CAST(COUNT(DISTINCT @@@) AS NVARCHAR) + ''|'' + CAST(SUM(LEN(@@@)) AS NVARCHAR)' - WHEN general_type = 'N' AND column_type ILIKE '%int%' THEN 'CAST(MIN(@@@) AS NVARCHAR) + ''|'' + MAX(CAST(@@@ AS NVARCHAR)) + ''|'' + CAST(SUM(CAST(@@@ AS BIGINT)) AS NVARCHAR) + ''|'' + CAST(ROUND(AVG(CAST(@@@ AS DECIMAL(30,5))), 5) AS NVARCHAR) + ''|'' + CAST(ROUND(STDEV(CAST(@@@ AS FLOAT)), 5) AS NVARCHAR)' - WHEN general_type = 'N' AND column_type NOT ILIKE '%int%' THEN 'CAST(MIN(@@@) AS NVARCHAR) + ''|'' + MAX(CAST(@@@ AS NVARCHAR)) + ''|'' + CAST(SUM(@@@) AS NVARCHAR) + ''|'' + CAST(ROUND(AVG(@@@), 5) AS NVARCHAR) + ''|'' + CAST(ROUND(STDEV(CAST(@@@ AS FLOAT)), 5) AS NVARCHAR)' + WHEN general_type = 'D' THEN 'CAST(MIN(@@@) AS NVARCHAR) + ''|'' + CAST(MAX(@@@) AS NVARCHAR) + ''|'' + CAST(COUNT_BIG(DISTINCT @@@) AS NVARCHAR)' + WHEN general_type = 'A' THEN 'CAST(MIN(@@@) AS NVARCHAR) + ''|'' + CAST(MAX(@@@) AS NVARCHAR) + ''|'' + CAST(COUNT_BIG(DISTINCT @@@) AS NVARCHAR) + ''|'' + CAST(SUM(LEN(@@@)) AS NVARCHAR)' + WHEN general_type = 'N' THEN 'CONCAT_WS(''|'', + CAST(COUNT_BIG(@@@) AS VARCHAR(20)), + CAST(COUNT_BIG(DISTINCT CAST(CAST(CAST(COALESCE(@@@,0) AS DECIMAL(38,6)) * 1000000 AS DECIMAL(38,0)) % 1000003 AS INT)) AS VARCHAR(20)), + COALESCE(CAST(CAST(MIN(@@@) AS DECIMAL(38,6)) AS VARCHAR(50)), ''''), + COALESCE(CAST(CAST(MAX(@@@) AS DECIMAL(38,6)) AS VARCHAR(50)), ''''), + 
CAST((COALESCE(SUM(CAST(CAST(ABS(CAST(COALESCE(@@@,0) AS DECIMAL(38,6))) * 1000000 AS DECIMAL(38,0)) % 1000000007 AS DECIMAL(38,0))), 0) % 1000000007) AS VARCHAR(12)), + CAST((COALESCE(SUM(CAST(CAST(ABS(CAST(COALESCE(@@@,0) AS DECIMAL(38,6))) * 1000000 AS DECIMAL(38,0)) % 1000000009 AS DECIMAL(38,0))), 0) % 1000000009) AS VARCHAR(12)) + )' END, '@@@', '"' || column_name || '"' ), diff --git a/testgen/template/flavors/postgresql/gen_query_tests/gen_table_changed_test.sql b/testgen/template/flavors/postgresql/gen_query_tests/gen_table_changed_test.sql deleted file mode 100644 index fd3fe0a..0000000 --- a/testgen/template/flavors/postgresql/gen_query_tests/gen_table_changed_test.sql +++ /dev/null @@ -1,157 +0,0 @@ -INSERT INTO test_definitions (table_groups_id, profile_run_id, test_type, test_suite_id, - schema_name, table_name, - skip_errors, test_active, last_auto_gen_date, profiling_as_of_date, - lock_refresh, history_calculation, history_lookback, custom_query ) -WITH last_run AS (SELECT r.table_groups_id, MAX(run_date) AS last_run_date - FROM profile_results p - INNER JOIN profiling_runs r - ON (p.profile_run_id = r.id) - INNER JOIN test_suites ts - ON p.project_code = ts.project_code - AND p.connection_id = ts.connection_id - WHERE p.project_code = '{PROJECT_CODE}' - AND r.table_groups_id = '{TABLE_GROUPS_ID}'::UUID - AND ts.id = '{TEST_SUITE_ID}' - AND p.run_date::DATE <= '{AS_OF_DATE}' - GROUP BY r.table_groups_id), -curprof AS (SELECT p.profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, - distinct_value_ct, record_ct, max_value, min_value, avg_value, stdev_value, null_value_ct - FROM last_run lr - INNER JOIN profile_results p - ON (lr.table_groups_id = p.table_groups_id - AND lr.last_run_date = p.run_date) ), -locked AS (SELECT schema_name, table_name - FROM test_definitions - WHERE table_groups_id = '{TABLE_GROUPS_ID}'::UUID - AND test_suite_id = '{TEST_SUITE_ID}' - AND test_type = 'Table_Freshness' - AND lock_refresh = 'Y'), --- IDs - TOP 2 -id_cols - AS ( SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, - distinct_value_ct, - ROW_NUMBER() OVER (PARTITION BY schema_name, table_name - ORDER BY - CASE - WHEN functional_data_type ILIKE 'ID-Unique%' THEN 1 - WHEN functional_data_type = 'ID-Secondary' THEN 2 - ELSE 3 - END, distinct_value_ct, column_name DESC) AS rank - FROM curprof - WHERE general_type IN ('A', 'D', 'N') - AND functional_data_type ILIKE 'ID%'), --- Process Date - TOP 1 -process_date_cols - AS (SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, - distinct_value_ct, - ROW_NUMBER() OVER (PARTITION BY schema_name, table_name - ORDER BY - CASE - WHEN column_name ILIKE '%mod%' THEN 1 - WHEN column_name ILIKE '%up%' THEN 1 - WHEN column_name ILIKE '%cr%' THEN 2 - WHEN column_name ILIKE '%in%' THEN 2 - END , distinct_value_ct DESC, column_name) AS rank - FROM curprof - WHERE general_type IN ('A', 'D', 'N') - AND functional_data_type ILIKE 'process%'), --- Transaction Date - TOP 1 -tran_date_cols - AS ( SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, - distinct_value_ct, - ROW_NUMBER() OVER (PARTITION BY schema_name, table_name - ORDER BY - distinct_value_ct DESC, column_name) AS rank - FROM curprof - WHERE general_type IN ('A', 'D', 'N') - AND functional_data_type ILIKE 'transactional date%' - OR functional_data_type ILIKE 'period%' - OR functional_data_type = 'timestamp' ), - --- Numeric Measures 
-numeric_cols - AS ( SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, -/* - -- Subscores - distinct_value_ct * 1.0 / NULLIF(record_ct, 0) AS cardinality_score, - (max_value - min_value) / NULLIF(ABS(NULLIF(avg_value, 0)), 1) AS range_score, - LEAST(1, LOG(GREATEST(distinct_value_ct, 2))) / LOG(GREATEST(record_ct, 2)) AS nontriviality_score, - stdev_value / NULLIF(ABS(NULLIF(avg_value, 0)), 1) AS variability_score, - 1.0 - (null_value_ct * 1.0 / NULLIF(NULLIF(record_ct, 0), 1)) AS null_penalty, -*/ - -- Weighted score - ( - 0.25 * (distinct_value_ct * 1.0 / NULLIF(record_ct, 0)) + - 0.15 * ((max_value - min_value) / NULLIF(ABS(NULLIF(avg_value, 0)), 1)) + - 0.10 * (LEAST(1, LOG(GREATEST(distinct_value_ct, 2))) / LOG(GREATEST(record_ct, 2))) + - 0.40 * (stdev_value / NULLIF(ABS(NULLIF(avg_value, 0)), 1)) + - 0.10 * (1.0 - (null_value_ct * 1.0 / NULLIF(NULLIF(record_ct, 0), 1))) - ) AS change_detection_score - FROM curprof - WHERE general_type = 'N' - AND (functional_data_type ILIKE 'Measure%' OR functional_data_type IN ('Sequence', 'Constant')) - ), -numeric_cols_ranked - AS ( SELECT *, - ROW_NUMBER() OVER (PARTITION BY schema_name, table_name - ORDER BY change_detection_score DESC, column_name) as rank - FROM numeric_cols - WHERE change_detection_score IS NOT NULL), -combined - AS ( SELECT profile_run_id, schema_name, table_name, column_name, 'ID' AS element_type, general_type, 10 + rank AS fingerprint_order - FROM id_cols - WHERE rank <= 2 - UNION ALL - SELECT profile_run_id, schema_name, table_name, column_name, 'DATE_P' AS element_type, general_type, 20 + rank AS fingerprint_order - FROM process_date_cols - WHERE rank = 1 - UNION ALL - SELECT profile_run_id, schema_name, table_name, column_name, 'DATE_T' AS element_type, general_type, 30 + rank AS fingerprint_order - FROM tran_date_cols - WHERE rank = 1 - UNION ALL - SELECT profile_run_id, schema_name, table_name, column_name, 'MEAS' AS element_type, general_type, 40 + rank AS fingerprint_order - FROM numeric_cols_ranked - WHERE rank = 1 ), -newtests - AS (SELECT profile_run_id, schema_name, table_name, - 'COUNT(*)::VARCHAR || ''|'' || ' || - STRING_AGG( - REPLACE( - CASE - WHEN general_type = 'D' THEN 'MIN(@@@)::VARCHAR || ''|'' || MAX(@@@::VARCHAR) || ''|'' || COUNT(DISTINCT @@@)::VARCHAR' - WHEN general_type = 'A' THEN 'MIN(@@@)::VARCHAR || ''|'' || MAX(@@@::VARCHAR) || ''|'' || COUNT(DISTINCT @@@)::VARCHAR || ''|'' || SUM(LENGTH(@@@))::VARCHAR' - WHEN general_type = 'N' THEN 'MIN(@@@)::VARCHAR || ''|'' || MAX(@@@::VARCHAR) || ''|'' || SUM(@@@)::VARCHAR || ''|'' || ROUND(AVG(@@@), 5)::VARCHAR || ''|'' || ROUND(STDDEV(@@@::FLOAT)::NUMERIC, 5)::VARCHAR' - END, - '@@@', '"' || column_name || '"'), - ' || ''|'' || ' - ORDER BY element_type, fingerprint_order, column_name) as fingerprint - FROM combined - GROUP BY profile_run_id, schema_name, table_name) -SELECT '{TABLE_GROUPS_ID}'::UUID as table_groups_id, - n.profile_run_id, - 'Table_Freshness' AS test_type, - '{TEST_SUITE_ID}' AS test_suite_id, - n.schema_name, n.table_name, - 0 as skip_errors, 'Y' as test_active, - - '{RUN_DATE}'::TIMESTAMP as last_auto_gen_date, - '{AS_OF_DATE}'::TIMESTAMP as profiling_as_of_date, - 'N' as lock_refresh, - 'Value' as history_calculation, - 1 as history_lookback, - fingerprint as custom_query -FROM newtests n -INNER JOIN test_types t - ON ('Table_Freshness' = t.test_type - AND 'Y' = t.active) -LEFT JOIN generation_sets s - ON (t.test_type = s.test_type - AND '{GENERATION_SET}' = s.generation_set) -LEFT 
JOIN locked l - ON (n.schema_name = l.schema_name - AND n.table_name = l.table_name) -WHERE (s.generation_set IS NOT NULL - OR '{GENERATION_SET}' = '') - AND l.schema_name IS NULL; - diff --git a/testgen/template/gen_query_tests/gen_table_changed_test.sql b/testgen/template/gen_query_tests/gen_table_changed_test.sql index 918af28..4c578f1 100644 --- a/testgen/template/gen_query_tests/gen_table_changed_test.sql +++ b/testgen/template/gen_query_tests/gen_table_changed_test.sql @@ -121,7 +121,12 @@ newtests CASE WHEN general_type = 'D' THEN 'MIN(@@@)::VARCHAR || ''|'' || MAX(@@@::VARCHAR) || ''|'' || COUNT(DISTINCT @@@)::VARCHAR' WHEN general_type = 'A' THEN 'MIN(@@@)::VARCHAR || ''|'' || MAX(@@@::VARCHAR) || ''|'' || COUNT(DISTINCT @@@)::VARCHAR || ''|'' || SUM(LENGTH(@@@))::VARCHAR' - WHEN general_type = 'N' THEN 'MIN(@@@)::VARCHAR || ''|'' || MAX(@@@::VARCHAR) || ''|'' || SUM(@@@)::VARCHAR || ''|'' || ROUND(AVG(@@@), 5)::VARCHAR || ''|'' || ROUND(STDDEV(@@@::FLOAT), 5)::VARCHAR' + WHEN general_type = 'N' THEN 'COUNT(@@@)::VARCHAR || ''|'' || + COUNT(DISTINCT MOD((COALESCE(@@@,0)::DECIMAL(38,6) * 1000000)::DECIMAL(38,0), 1000003))::VARCHAR || ''|'' || + COALESCE((MIN(@@@)::DECIMAL(38,6))::VARCHAR, '''') || ''|'' || + COALESCE((MAX(@@@)::DECIMAL(38,6))::VARCHAR, '''') || ''|'' || + COALESCE(MOD(COALESCE(SUM(MOD((ABS(COALESCE(@@@,0))::DECIMAL(38,6) * 1000000)::DECIMAL, 1000000007)), 0), 1000000007)::VARCHAR, '''') || ''|'' || + COALESCE(MOD(COALESCE(SUM(MOD((ABS(COALESCE(@@@,0))::DECIMAL(38,6) * 1000000)::DECIMAL, 1000000009)), 0), 1000000009)::VARCHAR, '''')' END, '@@@', '"' || column_name || '"'), ' || ''|'' || ' From b2ce30df26b179dbd265093567750b958962cbd4 Mon Sep 17 00:00:00 2001 From: Luis Date: Fri, 14 Nov 2025 09:52:58 -0400 Subject: [PATCH 18/28] feat(connections): connect to azure mssql using managed identities --- deploy/testgen-base.dockerfile | 1 - deploy/testgen.dockerfile | 2 +- pyproject.toml | 1 + .../common/database/flavor/flavor_service.py | 4 + .../database/flavor/mssql_flavor_service.py | 26 ++- testgen/common/models/connection.py | 1 + .../030_initialize_new_schema_structure.sql | 1 + .../dbupgrade/0159_incremental_upgrade.sql | 2 + .../frontend/js/components/connection_form.js | 210 +++++++++++++++++- testgen/ui/views/connections.py | 8 + 10 files changed, 244 insertions(+), 12 deletions(-) create mode 100644 testgen/template/dbupgrade/0159_incremental_upgrade.sql diff --git a/deploy/testgen-base.dockerfile b/deploy/testgen-base.dockerfile index 08976d9..de45fcf 100644 --- a/deploy/testgen-base.dockerfile +++ b/deploy/testgen-base.dockerfile @@ -48,7 +48,6 @@ RUN apk del \ cmake \ musl-dev \ gfortran \ - curl \ gpg \ linux-headers \ openblas-dev \ diff --git a/deploy/testgen.dockerfile b/deploy/testgen.dockerfile index e759822..4ff2ff9 100644 --- a/deploy/testgen.dockerfile +++ b/deploy/testgen.dockerfile @@ -1,4 +1,4 @@ -ARG TESTGEN_BASE_LABEL=v8 +ARG TESTGEN_BASE_LABEL=v9 FROM datakitchen/dataops-testgen-base:${TESTGEN_BASE_LABEL} AS release-image diff --git a/pyproject.toml b/pyproject.toml index 0e24df5..877dbba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,6 +61,7 @@ dependencies = [ "cron-converter==1.2.1", "cron-descriptor==2.0.5", "pybars3==0.9.7", + "azure-identity==1.25.1", # Pinned to match the manually compiled libs or for security "pyarrow==18.1.0", diff --git a/testgen/common/database/flavor/flavor_service.py b/testgen/common/database/flavor/flavor_service.py index 9849f1b..bb25359 100644 --- a/testgen/common/database/flavor/flavor_service.py +++ 
b/testgen/common/database/flavor/flavor_service.py @@ -22,6 +22,8 @@ class ConnectionParams(TypedDict): private_key_passphrase: bytes http_path: str service_account_key: dict[str,Any] + connect_with_identity: bool + sql_flavor_code: str class FlavorService: @@ -49,6 +51,8 @@ def init(self, connection_params: ConnectionParams): self.catalog = connection_params.get("catalog") or "" self.warehouse = connection_params.get("warehouse") or "" self.service_account_key = connection_params.get("service_account_key", None) + self.connect_with_identity = connection_params.get("connect_with_identity") or False + self.sql_flavor_code = connection_params.get("sql_flavor_code") or self.flavor password = connection_params.get("project_pw_encrypted", None) if isinstance(password, memoryview) or isinstance(password, bytes): diff --git a/testgen/common/database/flavor/mssql_flavor_service.py b/testgen/common/database/flavor/mssql_flavor_service.py index f4e3f1b..3956ee8 100644 --- a/testgen/common/database/flavor/mssql_flavor_service.py +++ b/testgen/common/database/flavor/mssql_flavor_service.py @@ -1,5 +1,7 @@ from urllib.parse import quote_plus +from sqlalchemy.engine import URL + from testgen import settings from testgen.common.database.flavor.flavor_service import FlavorService @@ -14,14 +16,28 @@ def get_connection_string_head(self): return f"mssql+pyodbc://{self.username}:{quote_plus(self.password)}@" def get_connection_string_from_fields(self): - strConnect = ( - f"mssql+pyodbc://{self.username}:{quote_plus(self.password)}@{self.host}:{self.port}/{self.dbname}?driver=ODBC+Driver+18+for+SQL+Server" + connection_url = URL.create( + "mssql+pyodbc", + username=self.username, + password=quote_plus(self.password or ""), + host=self.host, + port=int(self.port or 1443), + database=self.dbname, + query={ + "driver": "ODBC Driver 18 for SQL Server", + }, ) - if "synapse" in self.host: - strConnect += "&autocommit=True" + if self.connect_with_identity: + connection_url = connection_url._replace(username=None, password=None).update_query_dict({ + "encrypt": "yes", + "authentication": "ActiveDirectoryMsi", + }) + + if self.sql_flavor_code == "synapse_mssql": + connection_url = connection_url.update_query_dict({"autocommit": True}) - return strConnect + return connection_url.render_as_string(hide_password=False) def get_pre_connection_queries(self): return [ diff --git a/testgen/common/models/connection.py b/testgen/common/models/connection.py index 84f71aa..1b5a96f 100644 --- a/testgen/common/models/connection.py +++ b/testgen/common/models/connection.py @@ -62,6 +62,7 @@ class Connection(Entity): http_path: str = Column(String) warehouse: str = Column(String) service_account_key: JSON_TYPE = Column(EncryptedJson) + connect_with_identity: bool = Column(Boolean, default=False) _get_by = "connection_id" _default_order_by = (asc(func.lower(connection_name)),) diff --git a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql index dbf27fc..2df5365 100644 --- a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql +++ b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql @@ -70,6 +70,7 @@ CREATE TABLE connections ( url VARCHAR(200) default '', connect_by_url BOOLEAN default FALSE, connect_by_key BOOLEAN DEFAULT FALSE, + connect_with_identity BOOLEAN DEFAULT FALSE, private_key BYTEA, private_key_passphrase BYTEA, http_path VARCHAR(200), diff --git a/testgen/template/dbupgrade/0159_incremental_upgrade.sql 
b/testgen/template/dbupgrade/0159_incremental_upgrade.sql new file mode 100644 index 0000000..bf28b8e --- /dev/null +++ b/testgen/template/dbupgrade/0159_incremental_upgrade.sql @@ -0,0 +1,2 @@ +SET SEARCH_PATH TO {SCHEMA_NAME}; +ALTER TABLE connections ADD COLUMN connect_with_identity BOOLEAN DEFAULT FALSE; diff --git a/testgen/ui/components/frontend/js/components/connection_form.js b/testgen/ui/components/frontend/js/components/connection_form.js index 5d6aa7e..b52924c 100644 --- a/testgen/ui/components/frontend/js/components/connection_form.js +++ b/testgen/ui/components/frontend/js/components/connection_form.js @@ -31,6 +31,7 @@ * @property {boolean} connect_by_url * @property {string?} url * @property {boolean} connect_by_key + * @property {boolean} connect_with_identity * @property {string?} private_key * @property {string?} private_key_passphrase * @property {string?} http_path @@ -126,6 +127,7 @@ const ConnectionForm = (props, saveButton) => { warehouse: connection?.warehouse ?? '', url: connection?.url ?? '', service_account_key: connection?.service_account_key ?? '', + connect_with_identity: connection?.connect_with_identity ?? false, sql_flavor_code: connectionFlavor.rawVal ?? '', connection_name: connectionName.rawVal ?? '', max_threads: connectionMaxThreads.rawVal ?? 4, @@ -550,7 +552,197 @@ const RedshiftSpectrumForm = RedshiftForm; const PostgresqlForm = RedshiftForm; -const AzureMSSQLForm = RedshiftForm; +const AzureMSSQLForm = ( + connection, + flavor, + onChange, + originalConnection, + dynamicConnectionUrl, +) => { + const isValid = van.state(true); + const connectByUrl = van.state(connection.rawVal.connect_by_url ?? false); + const connectionHost = van.state(connection.rawVal.project_host ?? ''); + const connectionPort = van.state(connection.rawVal.project_port || defaultPorts[flavor.flavor]); + const connectionDatabase = van.state(connection.rawVal.project_db ?? ''); + const connectionUsername = van.state(connection.rawVal.project_user ?? ''); + const connectionPassword = van.state(connection.rawVal?.project_pw_encrypted ?? ''); + const connectionUrl = van.state(connection.rawVal?.url ?? ''); + const connectWithIdentity = van.state(connection.rawVal?.connect_with_identity ?? ''); + + const validityPerField = {}; + + van.derive(() => { + onChange({ + project_host: connectionHost.val, + project_port: connectionPort.val, + project_db: connectionDatabase.val, + project_user: connectionUsername.val, + project_pw_encrypted: connectionPassword.val, + connect_by_url: connectByUrl.val, + url: connectByUrl.val ? connectionUrl.val : connectionUrl.rawVal, + connect_by_key: false, + connect_with_identity: connectWithIdentity.val, + }, isValid.val); + }); + + van.derive(() => { + const newUrlValue = (dynamicConnectionUrl.val ?? 
'').replace(extractPrefix(dynamicConnectionUrl.rawVal), ''); + if (!connectByUrl.rawVal) { + connectionUrl.val = newUrlValue; + } + }); + + return div( + {class: 'flex-column fx-gap-3 fx-flex'}, + div( + { class: 'flex-column border border-radius-1 p-3 mt-1 fx-gap-1', style: 'position: relative;' }, + Caption({content: 'Server', style: 'position: absolute; top: -10px; background: var(--app-background-color); padding: 0px 8px;' }), + RadioGroup({ + label: 'Connect by', + options: [ + { + label: 'Host', + value: false, + }, + { + label: 'URL', + value: true, + }, + ], + value: connectByUrl, + onChange: (value) => connectByUrl.val = value, + layout: 'inline', + }), + div( + { class: 'flex-row fx-gap-3 fx-flex' }, + Input({ + name: 'db_host', + label: 'Host', + value: connectionHost, + class: 'fx-flex', + disabled: connectByUrl, + onChange: (value, state) => { + connectionHost.val = value; + validityPerField['db_host'] = state.valid; + isValid.val = Object.values(validityPerField).every(v => v); + }, + validators: [ + maxLength(250), + requiredIf(() => !connectByUrl.val), + ], + }), + Input({ + name: 'db_port', + label: 'Port', + value: connectionPort, + type: 'number', + disabled: connectByUrl, + onChange: (value, state) => { + connectionPort.val = value; + validityPerField['db_port'] = state.valid; + isValid.val = Object.values(validityPerField).every(v => v); + }, + validators: [ + minLength(3), + maxLength(5), + requiredIf(() => !connectByUrl.val), + ], + }) + ), + Input({ + name: 'db_name', + label: 'Database', + value: connectionDatabase, + disabled: connectByUrl, + onChange: (value, state) => { + connectionDatabase.val = value; + validityPerField['db_name'] = state.valid; + isValid.val = Object.values(validityPerField).every(v => v); + }, + validators: [ + maxLength(100), + requiredIf(() => !connectByUrl.val), + ], + }), + () => div( + { class: 'flex-row fx-gap-3 fx-align-stretch', style: 'position: relative;' }, + Input({ + label: 'URL', + value: connectionUrl, + class: 'fx-flex', + name: 'url_suffix', + prefix: span({ style: 'white-space: nowrap; color: var(--disabled-text-color)' }, extractPrefix(dynamicConnectionUrl.val)), + disabled: !connectByUrl.val, + onChange: (value, state) => { + connectionUrl.val = value; + validityPerField['url_suffix'] = state.valid; + isValid.val = Object.values(validityPerField).every(v => v); + }, + validators: [ + requiredIf(() => connectByUrl.val), + ], + }), + ), + ), + + div( + { class: 'flex-column border border-radius-1 p-3 mt-1 fx-gap-1', style: 'position: relative;' }, + Caption({content: 'Authentication', style: 'position: absolute; top: -10px; background: var(--app-background-color); padding: 0px 8px;' }), + + RadioGroup({ + label: 'Connection Strategy', + options: [ + {label: 'Connect By Password', value: false}, + {label: 'Connect with Managed Identity', value: true}, + ], + value: connectWithIdentity, + onChange: (value) => connectWithIdentity.val = value, + layout: 'inline', + }), + + () => { + const _connectWithIdentity = connectWithIdentity.val; + if (_connectWithIdentity) { + return div( + {class: 'flex-row p-4 fx-justify-center text-secondary'}, + 'Configured Microsoft Entra ID credentials will be used', + ); + } + + return div( + {class: 'flex-column fx-gap-1'}, + Input({ + name: 'db_user', + label: 'Username', + value: connectionUsername, + onChange: (value, state) => { + connectionUsername.val = value; + validityPerField['db_user'] = state.valid; + isValid.val = Object.values(validityPerField).every(v => v); + }, + validators: [ + 
requiredIf(() => !connectWithIdentity.val), + maxLength(50), + ], + }), + Input({ + name: 'password', + label: 'Password', + value: connectionPassword, + type: 'password', + passwordSuggestions: false, + placeholder: (originalConnection?.connection_id && originalConnection?.project_pw_encrypted) ? secretsPlaceholder : '', + onChange: (value, state) => { + connectionPassword.val = value; + validityPerField['password'] = state.valid; + isValid.val = Object.values(validityPerField).every(v => v); + }, + }), + ) + }, + ), + ); +}; const SynapseMSSQLForm = RedshiftForm; @@ -1110,11 +1302,19 @@ const BigqueryForm = ( }; function extractPrefix(url) { - const parts = (url ?? '').split('@'); - if (!parts[0]) { + if (!url) { return ''; } - return `${parts[0]}@`; + + if (url.includes('@')) { + const parts = url.split('@'); + if (!parts[0]) { + return ''; + } + return `${parts[0]}@`; + } + + return url.slice(0, url.indexOf('://') + 3); } function shouldRefreshUrl(previous, current) { @@ -1122,7 +1322,7 @@ function shouldRefreshUrl(previous, current) { return false; } - const fields = ['sql_flavor', 'project_host', 'project_port', 'project_db', 'project_user', 'connect_by_key', 'http_path', 'warehouse']; + const fields = ['sql_flavor', 'project_host', 'project_port', 'project_db', 'project_user', 'connect_by_key', 'http_path', 'warehouse', 'connect_with_identity']; return fields.some((fieldName) => previous[fieldName] !== current[fieldName]); } diff --git a/testgen/ui/views/connections.py b/testgen/ui/views/connections.py index dfb5dc4..fc7938f 100644 --- a/testgen/ui/views/connections.py +++ b/testgen/ui/views/connections.py @@ -119,6 +119,10 @@ def on_save_connection_clicked(updated_connection): elif updated_connection.get("project_pw_encrypted") == CLEAR_SENTINEL: updated_connection["project_pw_encrypted"] = "" + if updated_connection.get("connect_with_identity"): + updated_connection["project_user"] = "" + updated_connection["project_pw_encrypted"] = "" + updated_connection["sql_flavor"] = self._get_sql_flavor_from_value(updated_connection["sql_flavor_code"]).flavor set_save(True) @@ -143,6 +147,10 @@ def on_test_connection_clicked(updated_connection: dict) -> None: elif updated_connection.get("private_key_passphrase") == CLEAR_SENTINEL: updated_connection["private_key_passphrase"] = "" + if updated_connection.get("connect_with_identity"): + updated_connection["project_user"] = "" + updated_connection["project_pw_encrypted"] = "" + updated_connection["sql_flavor"] = self._get_sql_flavor_from_value(updated_connection["sql_flavor_code"]).flavor set_check_status(True) From 433177bcd15f487c5c8aeecf923ae7633218d4bb Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Fri, 21 Nov 2025 12:51:21 -0500 Subject: [PATCH 19/28] fix: address review comments --- testgen/common/database/flavor/mssql_flavor_service.py | 2 +- testgen/ui/components/frontend/js/components/connection_form.js | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/testgen/common/database/flavor/mssql_flavor_service.py b/testgen/common/database/flavor/mssql_flavor_service.py index 3956ee8..088c11e 100644 --- a/testgen/common/database/flavor/mssql_flavor_service.py +++ b/testgen/common/database/flavor/mssql_flavor_service.py @@ -35,7 +35,7 @@ def get_connection_string_from_fields(self): }) if self.sql_flavor_code == "synapse_mssql": - connection_url = connection_url.update_query_dict({"autocommit": True}) + connection_url = connection_url.update_query_dict({"autocommit": "True"}) return 
connection_url.render_as_string(hide_password=False) diff --git a/testgen/ui/components/frontend/js/components/connection_form.js b/testgen/ui/components/frontend/js/components/connection_form.js index b52924c..679338f 100644 --- a/testgen/ui/components/frontend/js/components/connection_form.js +++ b/testgen/ui/components/frontend/js/components/connection_form.js @@ -705,7 +705,7 @@ const AzureMSSQLForm = ( if (_connectWithIdentity) { return div( {class: 'flex-row p-4 fx-justify-center text-secondary'}, - 'Configured Microsoft Entra ID credentials will be used', + 'Microsoft Entra ID credentials configured on host machine will be used', ); } From 4c4fe4fb35664e320abcf6b40362300b570b63e5 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Mon, 24 Nov 2025 15:02:26 -0500 Subject: [PATCH 20/28] fix: update default max query chars --- testgen/template/dbupgrade/0160_incremental_upgrade.sql | 5 +++++ .../ui/components/frontend/js/components/connection_form.js | 6 +++--- 2 files changed, 8 insertions(+), 3 deletions(-) create mode 100644 testgen/template/dbupgrade/0160_incremental_upgrade.sql diff --git a/testgen/template/dbupgrade/0160_incremental_upgrade.sql b/testgen/template/dbupgrade/0160_incremental_upgrade.sql new file mode 100644 index 0000000..8026a5b --- /dev/null +++ b/testgen/template/dbupgrade/0160_incremental_upgrade.sql @@ -0,0 +1,5 @@ +SET SEARCH_PATH TO {SCHEMA_NAME}; + +UPDATE connections + SET max_query_chars = 20000 + WHERE max_query_chars = 9000; diff --git a/testgen/ui/components/frontend/js/components/connection_form.js b/testgen/ui/components/frontend/js/components/connection_form.js index 679338f..011e425 100644 --- a/testgen/ui/components/frontend/js/components/connection_form.js +++ b/testgen/ui/components/frontend/js/components/connection_form.js @@ -106,7 +106,7 @@ const ConnectionForm = (props, saveButton) => { const connectionFlavor = van.state(connection?.sql_flavor_code); const connectionName = van.state(connection?.connection_name ?? ''); const connectionMaxThreads = van.state(connection?.max_threads ?? 4); - const connectionQueryChars = van.state(connection?.max_query_chars ?? 9000); + const connectionQueryChars = van.state(connection?.max_query_chars ?? 20000); const privateKeyFile = van.state(getValue(props.cachedPrivateKeyFile) ?? null); const serviceAccountKeyFile = van.state(getValue(props.cachedServiceAccountKeyFile) ?? null); @@ -131,7 +131,7 @@ const ConnectionForm = (props, saveButton) => { sql_flavor_code: connectionFlavor.rawVal ?? '', connection_name: connectionName.rawVal ?? '', max_threads: connectionMaxThreads.rawVal ?? 4, - max_query_chars: connectionQueryChars.rawVal ?? 9000, + max_query_chars: connectionQueryChars.rawVal ?? 20000, }); const dynamicConnectionUrl = van.state(props.dynamicConnectionUrl?.rawVal ?? ''); @@ -337,7 +337,7 @@ const ConnectionForm = (props, saveButton) => { hint: 'Some tests are consolidated into queries for maximum performance. 
Default values should be retained unless test queries are failing.', value: connectionQueryChars.rawVal, min: 500, - max: 14000, + max: 50000, onChange: (value) => connectionQueryChars.val = value, }), ), From 458d77dceb03e041d21d75f4b1444195253f191c Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Tue, 25 Nov 2025 18:17:03 -0500 Subject: [PATCH 21/28] fix(ui): profiling dialog error --- testgen/ui/queries/profiling_queries.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testgen/ui/queries/profiling_queries.py b/testgen/ui/queries/profiling_queries.py index 7216fd1..a128a24 100644 --- a/testgen/ui/queries/profiling_queries.py +++ b/testgen/ui/queries/profiling_queries.py @@ -400,10 +400,10 @@ def get_columns_by_condition( table_chars.approx_record_ct, {COLUMN_PROFILING_FIELDS} FROM data_column_chars column_chars - {""" LEFT JOIN data_table_chars table_chars ON ( column_chars.table_id = table_chars.table_id ) + {""" LEFT JOIN table_groups ON ( column_chars.table_groups_id = table_groups.id ) From b1cb52598d95a230676648fb27f04911633d1ead Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Mon, 1 Dec 2025 19:12:58 -0500 Subject: [PATCH 22/28] fix(test-results): error on refresh score --- .../commands/queries/rollup_scores_query.py | 18 ++++++++++++------ testgen/commands/run_rollup_scores.py | 14 +++----------- testgen/commands/run_test_execution.py | 3 ++- 3 files changed, 17 insertions(+), 18 deletions(-) diff --git a/testgen/commands/queries/rollup_scores_query.py b/testgen/commands/queries/rollup_scores_query.py index 90d76e9..0d6bfc4 100644 --- a/testgen/commands/queries/rollup_scores_query.py +++ b/testgen/commands/queries/rollup_scores_query.py @@ -35,10 +35,16 @@ def rollup_profiling_scores(self) -> list[tuple[str, dict]]: queries.append(self._get_query("rollup_scores_profile_table_group.sql")) return queries - def rollup_test_scores(self) -> list[tuple[str, dict]]: + def rollup_test_scores(self, update_prevalence: bool = False, update_table_group: bool = False) -> list[tuple[str, dict]]: # Runs on App database - return [ - self._get_query("calc_prevalence_test_results.sql", no_bind=True), - self._get_query("rollup_scores_test_run.sql"), - self._get_query("rollup_scores_test_table_group.sql"), - ] + queries = [] + + if update_prevalence: + queries.append(self._get_query("calc_prevalence_test_results.sql", no_bind=True)) + + queries.append(self._get_query("rollup_scores_test_run.sql")) + + if update_table_group: + queries.append(self._get_query("rollup_scores_test_table_group.sql")) + + return queries diff --git a/testgen/commands/run_rollup_scores.py b/testgen/commands/run_rollup_scores.py index 45b0393..1676504 100644 --- a/testgen/commands/run_rollup_scores.py +++ b/testgen/commands/run_rollup_scores.py @@ -8,22 +8,14 @@ def run_profile_rollup_scoring_queries(project_code: str, run_id: str, table_group_id: str | None = None): - LOG.info("CurrentStep: Initializing Profiling Scores Rollup") sql_generator = RollupScoresSQL(run_id, table_group_id) - - LOG.info("CurrentStep: Rolling up profiling scores") execute_db_queries(sql_generator.rollup_profiling_scores()) run_refresh_score_cards_results(project_code=project_code) def run_test_rollup_scoring_queries(project_code: str, run_id: str, table_group_id: str | None = None): - LOG.info("CurrentStep: Initializing Testing Scores Rollup") sql_generator = RollupScoresSQL(run_id, table_group_id) - - queries = [sql_generator.GetRollupScoresTestRunQuery()] - if table_group_id: - 
queries.append(sql_generator.GetRollupScoresTestTableGroupQuery()) - - LOG.info("CurrentStep: Rolling up testing scores") - execute_db_queries(queries) + execute_db_queries( + sql_generator.rollup_test_scores(update_table_group=table_group_id is not None) + ) run_refresh_score_cards_results(project_code=project_code) diff --git a/testgen/commands/run_test_execution.py b/testgen/commands/run_test_execution.py index 0a0614f..75ec6db 100644 --- a/testgen/commands/run_test_execution.py +++ b/testgen/commands/run_test_execution.py @@ -308,8 +308,9 @@ def update_single_progress(progress: ThreadedProgress) -> None: def _rollup_test_scores(test_run: TestRun, table_group: TableGroup) -> None: try: LOG.info("Rolling up test scores") + sql_generator = RollupScoresSQL(test_run.id, table_group.id) execute_db_queries( - RollupScoresSQL(test_run.id, table_group.id).rollup_test_scores(), + sql_generator.rollup_test_scores(update_prevalence=True, update_table_group=True), ) run_refresh_score_cards_results( project_code=table_group.project_code, From a38786f74c411faa37453c9995d703b8c70647d1 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Mon, 1 Dec 2025 23:04:46 -0500 Subject: [PATCH 23/28] fix: error on download dialog --- testgen/common/date_service.py | 5 ----- testgen/ui/components/widgets/download_dialog.py | 7 ++++++- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/testgen/common/date_service.py b/testgen/common/date_service.py index fc3ae5f..000f065 100644 --- a/testgen/common/date_service.py +++ b/testgen/common/date_service.py @@ -38,8 +38,3 @@ def get_timezoned_timestamp(streamlit_session, value, dateformat="%b %-d, %-I:%M df["value"] = df["value"].dt.tz_localize("UTC").dt.tz_convert(timezone).dt.strftime(dateformat) ret = df.iloc[0, 0] return ret - - -def get_timezoned_now(streamlit_session): - value = datetime.now(UTC) - return get_timezoned_timestamp(streamlit_session, value) diff --git a/testgen/ui/components/widgets/download_dialog.py b/testgen/ui/components/widgets/download_dialog.py index 0a43a74..712eeaa 100644 --- a/testgen/ui/components/widgets/download_dialog.py +++ b/testgen/ui/components/widgets/download_dialog.py @@ -1,5 +1,6 @@ import tempfile from collections.abc import Callable, Iterable +from datetime import datetime from io import BytesIO from typing import TypedDict from zipfile import ZipFile @@ -54,7 +55,11 @@ def get_excel_file_data( # Timestamp worksheet.write("A3", "Exported on", details_key_format) - worksheet.write("B3", date_service.get_timezoned_now(st.session_state), details_value_format) + worksheet.write( + "B3", + date_service.get_timezoned_timestamp(st.session_state, datetime.utcnow()), + details_value_format, + ) # Details if details: From 57c8447dfe6841b593b765924e35b32df386cbd2 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Mon, 1 Dec 2025 22:36:47 -0500 Subject: [PATCH 24/28] fix(profiling-runs): error on cancel and improve display --- testgen/common/models/profiling_run.py | 4 ++-- testgen/ui/components/frontend/js/pages/profiling_runs.js | 4 ++-- testgen/ui/views/profiling_runs.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/testgen/common/models/profiling_run.py b/testgen/common/models/profiling_run.py index 713a06e..0343b99 100644 --- a/testgen/common/models/profiling_run.py +++ b/testgen/common/models/profiling_run.py @@ -244,8 +244,8 @@ def cancel_all_running(cls) -> None: cls.clear_cache() @classmethod - def update_status(cls, run_id: str | UUID, status: ProfilingRunStatus) -> None: - query = 
update(cls).where(cls.id == run_id).values(status=status) + def cancel_run(cls, run_id: str | UUID) -> None: + query = update(cls).where(cls.id == run_id).values(status="Cancelled", profiling_endtime=datetime.now(UTC)) db_session = get_current_session() db_session.execute(query) db_session.commit() diff --git a/testgen/ui/components/frontend/js/pages/profiling_runs.js b/testgen/ui/components/frontend/js/pages/profiling_runs.js index e5fcaab..4fc62ba 100644 --- a/testgen/ui/components/frontend/js/pages/profiling_runs.js +++ b/testgen/ui/components/frontend/js/pages/profiling_runs.js @@ -314,7 +314,7 @@ const ProfilingRunItem = ( ) : div( { class: 'text-caption mt-1' }, - runningStep + item.status === 'Running' && runningStep ? [ div( runningStep.label, @@ -352,7 +352,7 @@ const ProfilingRunItem = ( ) : null, ), - item.status !== 'Running' && item.column_ct ? Link({ + item.status === 'Complete' && item.column_ct ? Link({ label: 'View results', href: 'profiling-runs:results', params: { 'run_id': item.id }, diff --git a/testgen/ui/views/profiling_runs.py b/testgen/ui/views/profiling_runs.py index 59ad015..f0b442b 100644 --- a/testgen/ui/views/profiling_runs.py +++ b/testgen/ui/views/profiling_runs.py @@ -122,7 +122,7 @@ def get_job_arguments(self, arg_value: str) -> tuple[list[typing.Any], dict[str, def on_cancel_run(profiling_run: dict) -> None: process_status, process_message = process_service.kill_profile_run(to_int(profiling_run["process_id"])) if process_status: - ProfilingRun.update_status(profiling_run["profiling_run_id"], "Cancelled") + ProfilingRun.cancel_run(profiling_run["id"]) fm.reset_post_updates(str_message=f":{'green' if process_status else 'red'}[{process_message}]", as_toast=True) @@ -171,7 +171,7 @@ def on_delete_confirmed(*_args) -> None: if profiling_run.status == "Running": process_status, _ = process_service.kill_profile_run(to_int(profiling_run.process_id)) if process_status: - ProfilingRun.update_status(profiling_run.id, "Cancelled") + ProfilingRun.cancel_run(profiling_run.id) ProfilingRun.cascade_delete(profiling_run_ids) st.rerun() except Exception: From 6a17e3ffa676b504c5b3c7c9b226fc1cd0869f6c Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Mon, 1 Dec 2025 23:50:20 -0500 Subject: [PATCH 25/28] fix(test-runs): improve display for canceled runs --- testgen/common/models/test_run.py | 4 ++-- testgen/ui/components/frontend/js/pages/test_runs.js | 2 +- testgen/ui/views/test_runs.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/testgen/common/models/test_run.py b/testgen/common/models/test_run.py index 4c2d464..4451fcf 100644 --- a/testgen/common/models/test_run.py +++ b/testgen/common/models/test_run.py @@ -245,8 +245,8 @@ def cancel_all_running(cls) -> None: cls.clear_cache() @classmethod - def update_status(cls, run_id: str | UUID, status: TestRunStatus) -> None: - query = update(cls).where(cls.id == run_id).values(status=status) + def cancel_run(cls, run_id: str | UUID) -> None: + query = update(cls).where(cls.id == run_id).values(status="Cancelled", test_endtime=datetime.now(UTC)) db_session = get_current_session() db_session.execute(query) db_session.commit() diff --git a/testgen/ui/components/frontend/js/pages/test_runs.js b/testgen/ui/components/frontend/js/pages/test_runs.js index 1904f52..b979646 100644 --- a/testgen/ui/components/frontend/js/pages/test_runs.js +++ b/testgen/ui/components/frontend/js/pages/test_runs.js @@ -325,7 +325,7 @@ const TestRunItem = ( ) : div( { class: 'text-caption mt-1' }, - runningStep + item.status === 
'Running' && runningStep ? [ div( runningStep.label, diff --git a/testgen/ui/views/test_runs.py b/testgen/ui/views/test_runs.py index 765a2f6..a1a802e 100644 --- a/testgen/ui/views/test_runs.py +++ b/testgen/ui/views/test_runs.py @@ -133,7 +133,7 @@ def get_job_arguments(self, arg_value: str) -> tuple[list[typing.Any], dict[str, def on_cancel_run(test_run: dict) -> None: process_status, process_message = process_service.kill_test_run(to_int(test_run["process_id"])) if process_status: - TestRun.update_status(test_run["test_run_id"], "Cancelled") + TestRun.cancel_run(test_run["test_run_id"]) fm.reset_post_updates(str_message=f":{'green' if process_status else 'red'}[{process_message}]", as_toast=True) @@ -181,7 +181,7 @@ def on_delete_confirmed(*_args) -> None: if test_run.status == "Running": process_status, _ = process_service.kill_test_run(to_int(test_run.process_id)) if process_status: - TestRun.update_status(test_run.test_run_id, "Cancelled") + TestRun.cancel_run(test_run.test_run_id) TestRun.cascade_delete(test_run_ids) st.rerun() except Exception: From 808542f0ccbbf613b133adae5f682a8ab8eb929a Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Tue, 2 Dec 2025 01:29:20 -0500 Subject: [PATCH 26/28] fix(quality-dashboard): scorecard display and sorting --- .../frontend/js/pages/quality_dashboard.js | 13 ++++++++++++- testgen/ui/queries/scoring_queries.py | 2 +- testgen/ui/views/score_details.py | 2 +- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/testgen/ui/components/frontend/js/pages/quality_dashboard.js b/testgen/ui/components/frontend/js/pages/quality_dashboard.js index 0607015..371c9ce 100644 --- a/testgen/ui/components/frontend/js/pages/quality_dashboard.js +++ b/testgen/ui/components/frontend/js/pages/quality_dashboard.js @@ -38,12 +38,23 @@ const QualityDashboard = (/** @type {Properties} */ props) => { const sortedBy = van.state('name'); const filterTerm = van.state(''); + + const scoreToNumber = (score) => score ? (score.startsWith('>') ? 99.99 : Number(score)) : 101; + const sortFunctions = { + name: (a, b) => caseInsensitiveSort(a.name, b.name), + score: (a, b) => { + const scoreA = Math.min(scoreToNumber(a.score), scoreToNumber(a.cde_score)); + const scoreB = Math.min(scoreToNumber(b.score), scoreToNumber(b.cde_score)); + return scoreA - scoreB; + }, + }; + const scores = van.derive(() => { const sort = getValue(sortedBy) ?? 'name'; const filter = getValue(filterTerm) ?? 
''; return getValue(props.scores) .filter(score => caseInsensitiveIncludes(score.name, filter)) - .sort((a, b) => caseInsensitiveSort(a[sort], b[sort])); + .sort(sortFunctions[sort]); }); return div( diff --git a/testgen/ui/queries/scoring_queries.py b/testgen/ui/queries/scoring_queries.py index 64ecbf7..d16243a 100644 --- a/testgen/ui/queries/scoring_queries.py +++ b/testgen/ui/queries/scoring_queries.py @@ -9,7 +9,7 @@ @st.cache_data(show_spinner="Loading data :gray[:small[(This might take a few minutes)]] ...") def get_all_score_cards(project_code: str) -> list["ScoreCard"]: results = [ - definition.as_cached_score_card() + definition.as_cached_score_card(include_definition=True) for definition in ScoreDefinition.all(project_code=project_code, last_history_items=50) ] return results diff --git a/testgen/ui/views/score_details.py b/testgen/ui/views/score_details.py index 25a19c2..9148128 100644 --- a/testgen/ui/views/score_details.py +++ b/testgen/ui/views/score_details.py @@ -80,7 +80,7 @@ def render( issues = None with st.spinner(text="Loading data :gray[:small[(This might take a few minutes)]] ..."): user_can_edit = session.auth.user_has_permission("edit") - score_card = format_score_card(score_definition.as_cached_score_card()) + score_card = format_score_card(score_definition.as_cached_score_card(include_definition=True)) if score_type not in typing.get_args(ScoreTypes): score_type = None if not score_type: From 9b54583a5432e78054b28418be2457284a36402a Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Tue, 2 Dec 2025 01:57:02 -0500 Subject: [PATCH 27/28] fix(test-execution): improve progress tooltip --- testgen/commands/run_test_execution.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/testgen/commands/run_test_execution.py b/testgen/commands/run_test_execution.py index 75ec6db..bb91f70 100644 --- a/testgen/commands/run_test_execution.py +++ b/testgen/commands/run_test_execution.py @@ -95,7 +95,7 @@ def run_test_execution(test_suite_id: str | UUID, username: str | None = None, r test_run.set_progress( "validation", "Warning" if invalid_count else "Completed", - error=f"{invalid_count} test{'s' if invalid_count > 1 else ''} had errors" if invalid_count else None, + error=f"{invalid_count} test{'s' if invalid_count > 1 else ''} had errors. See details in results." if invalid_count else None, ) if valid_test_defs: @@ -118,6 +118,7 @@ def run_test_execution(test_suite_id: str | UUID, username: str | None = None, r if (run_test_defs := [td for td in valid_test_defs if td.run_type == run_type]): run_functions[run_type](run_test_defs) else: + test_run.set_progress(run_type, "Completed") LOG.info(f"No {run_type} tests to run") else: LOG.info("No valid tests to run") @@ -174,7 +175,7 @@ def update_test_progress(progress: ThreadedProgress) -> None: run_type, "Running", detail=f"{progress['processed']} of {progress['total']}", - error=f"{progress['errors']} test{'s' if progress['errors'] > 1 else ''} had errors" + error=f"{progress['errors']} test{'s' if progress['errors'] > 1 else ''} had errors. See details in results." if progress["errors"] else None, ) @@ -299,7 +300,7 @@ def update_single_progress(progress: ThreadedProgress) -> None: test_run.set_progress( "CAT", "Warning" if error_count else "Completed", - error=f"{error_count} test{'s' if error_count > 1 else ''} had errors" + error=f"{error_count} test{'s' if error_count > 1 else ''} had errors. See details in results." 
if error_count else None, ) From 006fc4f15fbe8179541501bc2e60fb0ca121649b Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Tue, 2 Dec 2025 16:04:18 -0500 Subject: [PATCH 28/28] release: 4.32.5 -> 4.38.3 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 877dbba..b7714a6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ build-backend = "setuptools.build_meta" [project] name = "dataops-testgen" -version = "4.32.5" +version = "4.38.3" description = "DataKitchen's Data Quality DataOps TestGen" authors = [ { "name" = "DataKitchen, Inc.", "email" = "info@datakitchen.io" },