diff --git a/deploy/charts/testgen-app/Chart.yaml b/deploy/charts/testgen-app/Chart.yaml index 8a0824a7..01b2c072 100644 --- a/deploy/charts/testgen-app/Chart.yaml +++ b/deploy/charts/testgen-app/Chart.yaml @@ -15,7 +15,7 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 1.0.0 +version: 1.0.1 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to diff --git a/deploy/charts/testgen-app/templates/_environment.yaml b/deploy/charts/testgen-app/templates/_environment.yaml index a630b1ee..c329f75c 100644 --- a/deploy/charts/testgen-app/templates/_environment.yaml +++ b/deploy/charts/testgen-app/templates/_environment.yaml @@ -31,8 +31,6 @@ value: {{ .Values.testgen.trustTargetDatabaseCertificate | ternary "yes" "no" | quote }} - name: TG_EXPORT_TO_OBSERVABILITY_VERIFY_SSL value: {{ .Values.testgen.observabilityVerifySsl | ternary "yes" "no" | quote }} -- name: TG_DOCKER_RELEASE_CHECK_ENABLED - value: {{ .Values.testgen.releaseCheck | ternary "yes" "no" | quote }} {{- end -}} {{- define "testgen.hookEnvironment" -}} diff --git a/deploy/charts/testgen-app/values.yaml b/deploy/charts/testgen-app/values.yaml index f97987cb..d958ae09 100644 --- a/deploy/charts/testgen-app/values.yaml +++ b/deploy/charts/testgen-app/values.yaml @@ -15,7 +15,6 @@ testgen: uiPassword: trustTargetDatabaseCertificate: false observabilityVerifySsl: true - releaseCheck: true labels: cliHooks: diff --git a/deploy/docker-bake.hcl b/deploy/docker-bake.hcl index 2518cfc4..35efb2b4 100644 --- a/deploy/docker-bake.hcl +++ b/deploy/docker-bake.hcl @@ -4,12 +4,16 @@ variable "TESTGEN_VERSION" {} variable "TESTGEN_DOCKER_HUB_REPO" { default = 
"datakitchen/dataops-testgen" } +variable "TESTGEN_SUPPORT_EMAIL" { + default = "open-source-support@datakitchen.io" +} target "testgen-release" { args = { TESTGEN_VERSION = "${TESTGEN_VERSION}" TESTGEN_BASE_LABEL = "${TESTGEN_BASE_LABEL}" TESTGEN_DOCKER_HUB_REPO = "${TESTGEN_DOCKER_HUB_REPO}" + TESTGEN_SUPPORT_EMAIL = "${TESTGEN_SUPPORT_EMAIL}" } context = "." dockerfile = "deploy/testgen.dockerfile" @@ -31,6 +35,7 @@ target "testgen-qa" { TESTGEN_VERSION = "${TESTGEN_VERSION}" TESTGEN_BASE_LABEL = "${TESTGEN_BASE_LABEL}" TESTGEN_DOCKER_HUB_REPO = "${TESTGEN_DOCKER_HUB_REPO}" + TESTGEN_SUPPORT_EMAIL = "${TESTGEN_SUPPORT_EMAIL}" } context = "." dockerfile = "deploy/testgen.dockerfile" diff --git a/deploy/testgen.dockerfile b/deploy/testgen.dockerfile index 318a3add..58e15db3 100644 --- a/deploy/testgen.dockerfile +++ b/deploy/testgen.dockerfile @@ -1,10 +1,11 @@ -ARG TESTGEN_BASE_LABEL=v6 +ARG TESTGEN_BASE_LABEL=v7 FROM datakitchen/dataops-testgen-base:${TESTGEN_BASE_LABEL} AS release-image # Args have to be set in current build stage: https://github.com/moby/moby/issues/37345 ARG TESTGEN_VERSION ARG TESTGEN_DOCKER_HUB_REPO +ARG TESTGEN_SUPPORT_EMAIL ENV PYTHONPATH=/dk/lib/python3.12/site-packages ENV PATH=$PATH:/dk/bin @@ -24,6 +25,7 @@ RUN chown -R testgen:testgen /var/lib/testgen /dk/lib/python3.12/site-packages/s ENV TESTGEN_VERSION=${TESTGEN_VERSION} ENV TESTGEN_DOCKER_HUB_REPO=${TESTGEN_DOCKER_HUB_REPO} +ENV TESTGEN_SUPPORT_EMAIL=${TESTGEN_SUPPORT_EMAIL} ENV TG_RELEASE_CHECK=docker USER testgen diff --git a/docker-compose.yml b/docker-compose.yml index 929bb50a..a8903ddb 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -9,7 +9,6 @@ x-common-variables: &common-variables TG_METADATA_DB_HOST: postgres TG_TARGET_DB_TRUST_SERVER_CERTIFICATE: yes TG_EXPORT_TO_OBSERVABILITY_VERIFY_SSL: no - TG_DOCKER_RELEASE_CHECK_ENABLED: yes services: diff --git a/docs/configuration.md b/docs/configuration.md index d5a10358..1c3c9177 100644 --- a/docs/configuration.md 
+++ b/docs/configuration.md @@ -282,9 +282,3 @@ default: `dataset` When exporting to your instance of Observabilty, the key sent to the events API to identify the components. default: `default` - -#### `TG_DOCKER_RELEASE_CHECK_ENABLED` - -Enables calling Docker Hub API to fetch the latest released image tag. The fetched tag is displayed in the UI menu. - -default: `yes` diff --git a/pyproject.toml b/pyproject.toml index 9b0879ba..0320b0c9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ build-backend = "setuptools.build_meta" [project] name = "dataops-testgen" -version = "4.1.3" +version = "4.12.6" description = "DataKitchen's Data Quality DataOps TestGen" authors = [ { "name" = "DataKitchen, Inc.", "email" = "info@datakitchen.io" }, @@ -40,7 +40,7 @@ dependencies = [ "requests_extensions==1.1.3", "numpy==1.26.4", "pandas==2.1.4", - "streamlit==1.44.1", + "streamlit==1.46.1", "streamlit-extras==0.3.0", "streamlit-aggrid==0.3.4.post3", "plotly_express==0.4.1", @@ -61,7 +61,6 @@ dependencies = [ # Pinned to match the manually compiled libs or for security "pyarrow==18.1.0", - "snowflake-connector-python==3.13.1", "matplotlib==3.9.2", "scipy==1.14.1", "jinja2==3.1.6", diff --git a/testgen/__main__.py b/testgen/__main__.py index 2f09b169..74541d76 100644 --- a/testgen/__main__.py +++ b/testgen/__main__.py @@ -48,6 +48,7 @@ LOG = logging.getLogger("testgen") APP_MODULES = ["ui", "scheduler"] +VERSION_DATA = version_service.get_version() @dataclass @@ -69,7 +70,13 @@ def invoke(self, ctx: Context): @click.group( cls=CliGroup, - help=f"This version: {settings.VERSION} \n\nLatest version: {version_service.get_latest_version()} \n\nSchema revision: {get_schema_revision()}" + help=f""" + {VERSION_DATA.edition} {VERSION_DATA.current or ""} + + {f"New version available! 
{VERSION_DATA.latest}" if VERSION_DATA.latest != VERSION_DATA.current else ""} + + Schema revision: {get_schema_revision()} + """ ) @click.option( "-v", diff --git a/testgen/commands/queries/execute_tests_query.py b/testgen/commands/queries/execute_tests_query.py index 20b0cf2d..00b9d4d4 100644 --- a/testgen/commands/queries/execute_tests_query.py +++ b/testgen/commands/queries/execute_tests_query.py @@ -29,12 +29,30 @@ def __init__(self, strProjectCode, strFlavor, strTestSuiteId, strTestSuite, minu def _AssembleDisplayParameters(self): - lst_parms = ["column_name", "skip_errors", "baseline_ct", "baseline_unique_ct", "baseline_value", - "baseline_value_ct", "baseline_sum", "baseline_avg", "baseline_sd", "subset_condition", - "groupby_names", "having_condition", "window_date_column", "window_days", - "match_column_names", "match_subset_condition", "match_schema_name", "match_table_name", - "match_groupby_names", "match_having_condition", - ] + lst_parms = [ + "column_name", + "skip_errors", + "baseline_ct", + "baseline_unique_ct", + "baseline_value", + "baseline_value_ct", + "baseline_sum", + "baseline_avg", + "baseline_sd", + "lower_tolerance", + "upper_tolerance", + "subset_condition", + "groupby_names", + "having_condition", + "window_date_column", + "window_days", + "match_column_names", + "match_subset_condition", + "match_schema_name", + "match_table_name", + "match_groupby_names", + "match_having_condition", + ] str_parms = "; ".join(f"{key}={self.dctTestParms[key]}" for key in lst_parms if key.lower() in self.dctTestParms and self.dctTestParms[key] not in [None, ""]) @@ -107,11 +125,6 @@ def GetTestsNonCAT(self, booClean): return strQ - def AddTestRecordtoTestRunTable(self): - strQ = self._ReplaceParms(read_template_sql_file("ex_write_test_record_to_testrun_table.sql", "execution")) - - return strQ - def PushTestRunStatusUpdateSQL(self): # Runs on DK DB strQ = self._ReplaceParms(read_template_sql_file("ex_update_test_record_in_testrun_table.sql", 
"execution")) diff --git a/testgen/commands/queries/generate_tests_query.py b/testgen/commands/queries/generate_tests_query.py index 73fbb325..374e50f5 100644 --- a/testgen/commands/queries/generate_tests_query.py +++ b/testgen/commands/queries/generate_tests_query.py @@ -60,11 +60,11 @@ def GetTestTypesSQL(self, booClean): return strQuery - def GetTestDerivationQueriesAsList(self, booClean): + def GetTestDerivationQueriesAsList(self, template_directory, booClean): # This assumes the queries run in no particular order, # and will order them alphabetically by file name lstQueries = sorted( - get_template_files(mask=r"^.*sql$", sub_directory="gen_funny_cat_tests"), key=lambda key: str(key) + get_template_files(mask=r"^.*sql$", sub_directory=template_directory), key=lambda key: str(key) ) lstTemplate = [] diff --git a/testgen/commands/queries/profiling_query.py b/testgen/commands/queries/profiling_query.py index 6aba1c8c..155cc98a 100644 --- a/testgen/commands/queries/profiling_query.py +++ b/testgen/commands/queries/profiling_query.py @@ -125,6 +125,7 @@ def ReplaceParms(self, strInputString): strInputString = strInputString.replace("{CONTINGENCY_COLUMNS}", self.contingency_columns) strInputString = strInputString.replace("{CONTINGENCY_MAX_VALUES}", self.contingency_max_values) strInputString = strInputString.replace("{PROCESS_ID}", str(self.process_id)) + strInputString = strInputString.replace("{SQL_FLAVOR}", self.flavor) strInputString = replace_templated_functions(strInputString, self.flavor) return strInputString diff --git a/testgen/commands/queries/refresh_data_chars_query.py b/testgen/commands/queries/refresh_data_chars_query.py index 4934601f..694eeefb 100644 --- a/testgen/commands/queries/refresh_data_chars_query.py +++ b/testgen/commands/queries/refresh_data_chars_query.py @@ -10,7 +10,7 @@ class CRefreshDataCharsSQL: sql_flavor: str table_group_schema: str table_group_id: str - + max_query_chars: int profiling_table_set: str profiling_include_mask: str @@ 
-37,20 +37,31 @@ def _replace_params(self, sql_query: str) -> str: sql_query = sql_query.replace("{RUN_DATE}", self.run_date) sql_query = sql_query.replace("{SOURCE_TABLE}", self.source_table) return sql_query - + def _get_mask_query(self, mask: str, is_include: bool) -> str: sub_query = "" if mask: sub_query += " AND (" if is_include else " AND NOT (" is_first = True + escape = "" + if self.sql_flavor.startswith("mssql"): + escaped_underscore = "[_]" + elif self.sql_flavor == "snowflake": + escaped_underscore = "\\\\_" + escape = "ESCAPE '\\\\'" + elif self.sql_flavor == "redshift": + escaped_underscore = "\\\\_" + else: + escaped_underscore = "\\_" for item in mask.split(","): if not is_first: sub_query += " OR " - sub_query += "(c.table_name LIKE '" + item.strip() + "')" + item = item.strip().replace("_", escaped_underscore) + sub_query += f"(c.table_name LIKE '{item}' {escape})" is_first = False sub_query += ")" return sub_query - + def GetDDFQuery(self) -> str: # Runs on Project DB sql_query = self._replace_params( @@ -67,18 +78,18 @@ def GetDDFQuery(self) -> str: sql_query = sql_query.replace("{TABLE_CRITERIA}", table_criteria) return sql_query - + def GetRecordCountQueries(self, schema_tables: list[str]) -> list[str]: count_queries = [ f"SELECT '{item}', COUNT(*) FROM {item}" for item in schema_tables ] return chunk_queries(count_queries, " UNION ALL ", self.max_query_chars) - + def GetDataCharsUpdateQuery(self) -> str: # Runs on DK Postgres Server return self._replace_params(read_template_sql_file("data_chars_update.sql", sub_directory="data_chars")) - + def GetStagingDeleteQuery(self) -> str: # Runs on DK Postgres Server return self._replace_params(read_template_sql_file("data_chars_staging_delete.sql", sub_directory="data_chars")) diff --git a/testgen/commands/run_execute_tests.py b/testgen/commands/run_execute_tests.py index dc8028d1..a5799006 100644 --- a/testgen/commands/run_execute_tests.py +++ b/testgen/commands/run_execute_tests.py @@ -17,7 +17,7 @@ 
WriteListToDB, date_service, ) -from testgen.common.database.database_service import empty_cache +from testgen.common.database.database_service import ExecuteDBQuery, empty_cache from .run_execute_cat_tests import run_cat_test_queries from .run_refresh_data_chars import run_refresh_data_chars_queries @@ -26,6 +26,17 @@ LOG = logging.getLogger("testgen") +def add_test_run_record(test_run_id, test_suite_id, test_time, process_id): + query = f""" + INSERT INTO test_runs(id, test_suite_id, test_starttime, process_id) + (SELECT '{test_run_id}':: UUID as id, + '{test_suite_id}' as test_suite_id, + '{test_time}' as test_starttime, + '{process_id}' as process_id); + """ + ExecuteDBQuery("DKTG", query) + + def run_test_queries(dctParms, strTestRunID, strTestTime, strProjectCode, strTestSuite, minutes_offset=0, spinner=None): booErrors = False error_msg = "" @@ -38,11 +49,6 @@ def run_test_queries(dctParms, strTestRunID, strTestTime, strProjectCode, strTes clsExecute.process_id = process_service.get_current_process_id() booClean = False - # Add a record in Test Run table for the new Test Run - strTestRunQuery = clsExecute.AddTestRecordtoTestRunTable() - lstTestRunQuery = [strTestRunQuery] - RunActionQueryList("DKTG", lstTestRunQuery) - try: # Retrieve non-CAT Queries LOG.info("CurrentStep: Retrieve Non-CAT Queries") @@ -131,6 +137,11 @@ def run_execution_steps( LOG.info("CurrentStep: Retrieving TestExec Parameters") test_exec_params = RetrieveTestExecParms(project_code, test_suite) + # Add a record in Test Run table for the new Test Run + add_test_run_record( + test_run_id, test_exec_params["test_suite_id"], test_time, process_service.get_current_process_id() + ) + LOG.info("CurrentStep: Assigning Connection Parms") AssignConnectParms( test_exec_params["project_code"], diff --git a/testgen/commands/run_generate_tests.py b/testgen/commands/run_generate_tests.py index 17d97266..01fd8fe0 100644 --- a/testgen/commands/run_generate_tests.py +++ 
b/testgen/commands/run_generate_tests.py @@ -65,7 +65,8 @@ def run_test_gen_queries(strTableGroupsID, strTestSuite, strGenerationSet=None): LOG.info("CurrentStep: Compiling Test Gen Queries") - lstFunnyTemplateQueries = clsTests.GetTestDerivationQueriesAsList(booClean) + lstFunnyTemplateQueries = clsTests.GetTestDerivationQueriesAsList("gen_funny_cat_tests", booClean) + lstQueryTemplateQueries = clsTests.GetTestDerivationQueriesAsList("gen_query_tests", booClean) lstGenericTemplateQueries = [] # Delete old Tests @@ -102,7 +103,7 @@ def run_test_gen_queries(strTableGroupsID, strTestSuite, strGenerationSet=None): LOG.info("TestGen CAT Queries were compiled") # Make sure delete, then generic templates run before the funny templates - lstQueries = [strDeleteQuery, *lstGenericTemplateQueries, *lstFunnyTemplateQueries] + lstQueries = [strDeleteQuery, *lstGenericTemplateQueries, *lstFunnyTemplateQueries, *lstQueryTemplateQueries] if lstQueries: LOG.info("Running Test Generation Template Queries") @@ -110,7 +111,7 @@ def run_test_gen_queries(strTableGroupsID, strTestSuite, strGenerationSet=None): message = "Test generation completed successfully." else: message = "No TestGen Queries were compiled." 
- + MixpanelService().send_event( "generate-tests", source=settings.ANALYTICS_JOB_SOURCE, diff --git a/testgen/commands/run_launch_db_config.py b/testgen/commands/run_launch_db_config.py index 2a50126a..bdfa6ab1 100644 --- a/testgen/commands/run_launch_db_config.py +++ b/testgen/commands/run_launch_db_config.py @@ -38,6 +38,8 @@ def _get_params_mapping() -> dict: "PROJECT_USER": settings.PROJECT_DATABASE_USER, "PROJECT_PORT": settings.PROJECT_DATABASE_PORT, "PROJECT_HOST": settings.PROJECT_DATABASE_HOST, + "PROJECT_PW_ENCRYPTED": EncryptText(settings.PROJECT_DATABASE_PASSWORD), + "PROJECT_HTTP_PATH": "", "PROJECT_SCHEMA": settings.PROJECT_DATABASE_SCHEMA, "PROFILING_TABLE_SET": settings.DEFAULT_PROFILING_TABLE_SET, "PROFILING_INCLUDE_MASK": settings.DEFAULT_PROFILING_INCLUDE_MASK, @@ -54,7 +56,6 @@ def _get_params_mapping() -> dict: "TEST_SUITE_DESCRIPTION": settings.DEFAULT_TEST_SUITE_DESCRIPTION, "MAX_THREADS": settings.PROJECT_CONNECTION_MAX_THREADS, "MAX_QUERY_CHARS": settings.PROJECT_CONNECTION_MAX_QUERY_CHAR, - "PROJECT_PW_ENCRYPTED": EncryptText(settings.PROJECT_DATABASE_PASSWORD), "OBSERVABILITY_API_URL": settings.OBSERVABILITY_API_URL, "OBSERVABILITY_API_KEY": settings.OBSERVABILITY_API_KEY, "OBSERVABILITY_COMPONENT_KEY": settings.OBSERVABILITY_DEFAULT_COMPONENT_KEY, diff --git a/testgen/commands/run_test_parameter_validation.py b/testgen/commands/run_test_parameter_validation.py index b2b98936..71668bcd 100644 --- a/testgen/commands/run_test_parameter_validation.py +++ b/testgen/commands/run_test_parameter_validation.py @@ -29,6 +29,10 @@ def run_parameter_validation_queries( strColumnList = clsExecute.GetTestValidationColumns(booClean) test_columns, _ = RetrieveDBResultsToList("DKTG", strColumnList) + invalid_tests = [ test_ids for col, test_ids in test_columns if not col ] + invalid_tests = { item for sublist in invalid_tests for item in sublist } + test_columns = [ item for item in test_columns if item[0] ] + if not test_columns: LOG.warning(f"No test 
columns are present to validate in Test Suite {strTestSuite}") missing_columns = [] @@ -71,7 +75,7 @@ def run_parameter_validation_queries( if missing_tables: LOG.info("Missing tables: %s", ", ".join(missing_tables)) - if missing_columns or missing_tables: + if missing_columns or missing_tables or invalid_tests: # Flag test_definitions tests with missing tables or columns LOG.info("CurrentStep: Flagging Tests That Failed Validation") @@ -86,7 +90,7 @@ def run_parameter_validation_queries( tests_missing_columns[column_name].extend(test_ids) clsExecute.flag_val = "D" - clsExecute.test_ids = list(set(chain(*tests_missing_tables.values(), *tests_missing_columns.values()))) + clsExecute.test_ids = list(set(chain(*tests_missing_tables.values(), *tests_missing_columns.values(), invalid_tests))) strPrepFlagTests = clsExecute.PrepFlagTestsWithFailedValidation() RunActionQueryList("DKTG", [strPrepFlagTests]) @@ -101,6 +105,12 @@ def run_parameter_validation_queries( clsExecute.test_ids = test_ids strFlagTests = clsExecute.FlagTestsWithFailedValidation() RunActionQueryList("DKTG", [strFlagTests]) + + if invalid_tests: + clsExecute.message = "Invalid test: schema, table, or column not defined" + clsExecute.test_ids = invalid_tests + strFlagTests = clsExecute.FlagTestsWithFailedValidation() + RunActionQueryList("DKTG", [strFlagTests]) # Copy test results to DK DB, using temporary flagged D value to identify LOG.info("CurrentStep: Saving error results for invalid tests") diff --git a/testgen/common/database/database_service.py b/testgen/common/database/database_service.py index 714aae01..643217ec 100644 --- a/testgen/common/database/database_service.py +++ b/testgen/common/database/database_service.py @@ -313,8 +313,7 @@ def _InitDBConnection_target_db(flavor_service, strCredentialSet, strRaw="N", us is_password_overwritten = pwd_override is not None strConnect = flavor_service.get_connection_string(strPW, is_password_overwritten) - connect_args = {"connect_timeout": 3600} - 
connect_args.update(flavor_service.get_connect_args(is_password_overwritten)) + connect_args = flavor_service.get_connect_args(is_password_overwritten) try: # Timeout in seconds: 1 hour = 60 * 60 second = 3600 diff --git a/testgen/common/database/flavor/databricks_flavor_service.py b/testgen/common/database/flavor/databricks_flavor_service.py index da451e9b..a31367f5 100644 --- a/testgen/common/database/flavor/databricks_flavor_service.py +++ b/testgen/common/database/flavor/databricks_flavor_service.py @@ -4,9 +4,6 @@ class DatabricksFlavorService(FlavorService): - def __init__(self): - self.http_path = None - def get_connection_string_head(self, strPW): strConnect = f"{self.flavor}://{self.username}:{quote_plus(strPW)}@" return strConnect @@ -17,9 +14,3 @@ def get_connection_string_from_fields(self, strPW, is_password_overwritten: bool f"?http_path={self.http_path}" ) return strConnect - - def get_pre_connection_queries(self): - return [] - - def get_connect_args(self, is_password_overwritten: bool = False): # NOQA ARG002 - return {} diff --git a/testgen/common/database/flavor/flavor_service.py b/testgen/common/database/flavor/flavor_service.py index 06c539ac..7b7f7246 100644 --- a/testgen/common/database/flavor/flavor_service.py +++ b/testgen/common/database/flavor/flavor_service.py @@ -1,6 +1,5 @@ from abc import abstractmethod -from testgen import settings from testgen.common.encrypt import DecryptText @@ -51,13 +50,14 @@ def get_db_name(self) -> str: def is_connect_by_key(self) -> str: return self.connect_by_key + + def get_pre_connection_queries(self) -> list[str]: + return [] + + def get_connect_args(self, _is_password_overwritten: bool = False) -> dict: + return {"connect_timeout": 3600} - def get_connect_args(self, is_password_overwritten: bool = False): # NOQA ARG002 - if settings.SKIP_DATABASE_CERTIFICATE_VERIFICATION: - return {"TrustServerCertificate": "yes"} - return {} - - def get_concat_operator(self): + def get_concat_operator(self) -> str: return 
"||" def get_connection_string(self, strPW, is_password_overwritten: bool = False): diff --git a/testgen/common/database/flavor/mssql_flavor_service.py b/testgen/common/database/flavor/mssql_flavor_service.py index cfbb9c55..d472f3cd 100644 --- a/testgen/common/database/flavor/mssql_flavor_service.py +++ b/testgen/common/database/flavor/mssql_flavor_service.py @@ -1,9 +1,10 @@ from urllib.parse import quote_plus +from testgen import settings from testgen.common.database.flavor.flavor_service import FlavorService -class MssqlFlavorService(FlavorService): +class MssqlFlavorService(FlavorService): def get_connection_string_head(self, strPW): username = self.username password = quote_plus(strPW) @@ -29,6 +30,12 @@ def get_pre_connection_queries(self): # ARG002 "SET ANSI_DEFAULTS ON;", "SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;", ] + + def get_connect_args(self, is_password_overwritten: bool = False): + connect_args = super().get_connect_args(is_password_overwritten) + if settings.SKIP_DATABASE_CERTIFICATE_VERIFICATION: + connect_args["TrustServerCertificate"] = "yes" + return connect_args def get_concat_operator(self): return "+" diff --git a/testgen/common/database/flavor/redshift_flavor_service.py b/testgen/common/database/flavor/redshift_flavor_service.py index e3ed1a21..1d29e3f2 100644 --- a/testgen/common/database/flavor/redshift_flavor_service.py +++ b/testgen/common/database/flavor/redshift_flavor_service.py @@ -17,6 +17,3 @@ def get_pre_connection_queries(self): return [ "SET SEARCH_PATH = '" + self.dbschema + "'", ] - - def get_connect_args(self, is_password_overwritten: bool = False): # NOQA ARG002 - return {} diff --git a/testgen/common/database/flavor/trino_flavor_service.py b/testgen/common/database/flavor/trino_flavor_service.py index 788fcaeb..12db762b 100644 --- a/testgen/common/database/flavor/trino_flavor_service.py +++ b/testgen/common/database/flavor/trino_flavor_service.py @@ -16,6 +16,3 @@ def get_pre_connection_queries(self): return [ 
"USE " + self.catalog + "." + self.dbschema, ] - - def get_connect_args(self, is_password_overwritten: bool = False): # NOQA ARG002 - return {} diff --git a/testgen/common/get_pipeline_parms.py b/testgen/common/get_pipeline_parms.py index cade94cf..d8d2e213 100644 --- a/testgen/common/get_pipeline_parms.py +++ b/testgen/common/get_pipeline_parms.py @@ -13,20 +13,6 @@ def RetrieveProfilingParms(strTableGroupsID): if lstParms is None: raise ValueError("Project Connection Parameters not found") - required_params = ( - "project_code", - "connection_id", - "sql_flavor", - "project_user", - "profile_use_sampling", - "profile_sample_percent", - "profile_sample_min_count", - "table_group_schema", - ) - - if missing := [param for param in required_params if not lstParms[0][param]]: - raise ValueError(f"Project Connection parameters are missing: {', '.join(missing)}.") - return lstParms[0] diff --git a/testgen/common/mixpanel_service.py b/testgen/common/mixpanel_service.py index dd6608b3..fd3908f8 100644 --- a/testgen/common/mixpanel_service.py +++ b/testgen/common/mixpanel_service.py @@ -1,3 +1,4 @@ +import functools import json import logging import ssl @@ -8,7 +9,12 @@ from urllib.parse import urlencode from urllib.request import Request, urlopen +import streamlit as st + +import testgen.ui.services.database_service as db from testgen import settings +from testgen.common.models import with_database_session +from testgen.common.models.settings import PersistedSetting, SettingNotFound from testgen.ui.session import session from testgen.utils.singleton import Singleton @@ -30,25 +36,33 @@ def wrapped(*args, **kwargs): class MixpanelService(Singleton): @cached_property + @with_database_session def instance_id(self): - return settings.INSTANCE_ID or blake2b(uuid.getnode().to_bytes(8), digest_size=8).hexdigest() + try: + instance_id = PersistedSetting.get("INSTANCE_ID") + except SettingNotFound: + instance_id = settings.INSTANCE_ID or blake2b(uuid.getnode().to_bytes(8), 
digest_size=8).hexdigest() + PersistedSetting.set("INSTANCE_ID", instance_id) + return instance_id - @cached_property - def distinct_id(self): - return self._hash_value(session.username or "") + def get_distinct_id(self, username): + return self._hash_value(username or "") + @functools.cache # noqa: B019 def _hash_value(self, value: bytes | str, digest_size: int = 8) -> str: if isinstance(value, str): value = value.encode() return blake2b(value, salt=self.instance_id.encode(), digest_size=digest_size).hexdigest() @safe_method - def send_event(self, event_name, **properties): + def send_event(self, event_name, include_usage=False, **properties): properties.setdefault("instance_id", self.instance_id) properties.setdefault("edition", settings.DOCKER_HUB_REPOSITORY) properties.setdefault("version", settings.VERSION) - properties.setdefault("distinct_id", self.distinct_id) properties.setdefault("username", session.username) + properties.setdefault("distinct_id", self.get_distinct_id(properties["username"])) + if include_usage: + properties.update(self.get_usage()) track_payload = { "event": event_name, @@ -77,3 +91,15 @@ def send_mp_request(self, endpoint, payload): urlopen(req, context=self.get_ssl_context(), timeout=settings.MIXPANEL_TIMEOUT) # noqa: S310 except Exception: LOG.exception("Failed to send analytics data") + + def get_usage(self): + schema: str = st.session_state["dbschema"] + query = f""" + SELECT + (SELECT COUNT(*) FROM {schema}.auth_users) AS user_count, + (SELECT COUNT(*) FROM {schema}.projects) AS project_count, + (SELECT COUNT(*) FROM {schema}.connections) AS connection_count, + (SELECT COUNT(*) FROM {schema}.table_groups) AS table_group_count, + (SELECT COUNT(*) FROM {schema}.test_suites) AS test_suite_count; + """ + return db.retrieve_data(query).iloc[0].to_dict() diff --git a/testgen/common/models/settings.py b/testgen/common/models/settings.py new file mode 100644 index 00000000..4d9d67c9 --- /dev/null +++ b/testgen/common/models/settings.py @@ 
-0,0 +1,43 @@ +from typing import Any + +from sqlalchemy import Column, String +from sqlalchemy.dialects.postgresql import JSONB + +from testgen.common.models import Base, get_current_session + +NO_DEFAULT = type("NoDefaultSentinel", (), {})() + + +class SettingNotFound(ValueError): + pass + + +class PersistedSetting(Base): + __tablename__ = "settings" + + key: str = Column(String, primary_key=True) + value: Any = Column(JSONB, nullable=False) + + @classmethod + def get(cls, key: str, default=NO_DEFAULT) -> Any: + # This caches all the settings in the session, so it hits the database only once + get_current_session().query(cls).all() + + if ps := get_current_session().query(cls).filter_by(key=key).first(): + return ps.value + elif default is NO_DEFAULT: + raise SettingNotFound(f"Setting '{key}' not found") + else: + return default + + @classmethod + def set(cls, key: str, value: Any): + session = get_current_session() + if ps := session.query(cls).filter_by(key=key).first(): + ps.value = value + else: + session.add(cls(key=key, value=value)) + session.commit() + + def __repr__(self): + return f"{self.__class__.__name__}(key={self.key!r} value={self.value!r})" diff --git a/testgen/common/version_service.py b/testgen/common/version_service.py index 8e03cb17..355c5d9a 100644 --- a/testgen/common/version_service.py +++ b/testgen/common/version_service.py @@ -1,79 +1,56 @@ import logging +from dataclasses import dataclass import requests from testgen import settings +from testgen.ui.session import session LOG = logging.getLogger("testgen") +LATEST_VERSIONS_URL = "https://dk-support-external.s3.us-east-1.amazonaws.com/testgen-observability/testgen-latest-versions.json" -def get_latest_version() -> str: - try: - return { - "pypi": _get_last_pypi_release, - "docker": _get_last_docker_release, - "yes": _get_last_docker_release, # NOTE: kept for retrocompatibility - }.get(settings.CHECK_FOR_LATEST_VERSION, lambda: "unknown")() - except: - return "unknown" - - -def 
_get_last_pypi_release() -> str: - response = requests.get("https://pypi.org/pypi/dataops-testgen/json", timeout=3) - if response.status_code != 200: - LOG.warning(f"version_service: Failed to fetch PyPi releases. Status code: {response.status_code}") - return "unknown" - - package_data = response.json() - package_releases = list((package_data.get("releases") or {}).keys()) - - return _sorted_tags(package_releases)[0] +@dataclass +class Version: + edition: str + current: str + latest: str -def _get_last_docker_release() -> str: - headers = {} - if settings.DOCKER_HUB_USERNAME and settings.DOCKER_HUB_PASSWORD: - auth_response = requests.post( - "https://hub.docker.com/v2/users/login", - json={"username": settings.DOCKER_HUB_USERNAME, "password": settings.DOCKER_HUB_PASSWORD}, - timeout=5, +def get_version() -> Version: + if not session.version: + session.version = Version( + edition=_get_app_edition(), + current=settings.VERSION, + latest=_get_latest_version(), ) - if auth_response.status_code != 200: - LOG.warning( - "version_service: unable to login against https://hub.docker.com." - f" Status code: {auth_response.status_code}" - ) - return "unknown" - headers["Authorization"] = f"Bearer {auth_response.json()['token']}" + return session.version - response = requests.get( - f"https://hub.docker.com/v2/repositories/{settings.DOCKER_HUB_REPOSITORY}/tags", - headers=headers, - params={"page_size": 25, "page": 1, "ordering": "last_updated"}, - timeout=3, - ) - - if response.status_code != 200: - LOG.debug(f"version_service: Failed to fetch docker tags. 
Status code: {response.status_code}") - return "unknown" - tags_to_return = [] - tags_data = response.json() - results = tags_data.get("results", []) - for result in results: - tag_name = result["name"] - if tag_name.count(".") >= 2 and "experimental" not in tag_name: - tags_to_return.append(tag_name) +def _get_app_edition() -> str: + edition = ( + settings.DOCKER_HUB_REPOSITORY + .replace("datakitchen/dataops-testgen", "") + .replace("-", " ") + .strip() + .title() + .replace("Qa", "QA") + ) + return f"TestGen{' ' + edition if edition else ''}" - if len(tags_to_return) <= 0: - return "unkown" - return _sorted_tags(tags_to_return)[0] +def _get_latest_version() -> str | None: + try: + response = requests.get(LATEST_VERSIONS_URL, timeout=3) + if response.status_code != 200: + LOG.warning(f"Failed to fetch latest versions from S3. Status code: {response.status_code}") + return None + + latest_versions = response.json() + if settings.CHECK_FOR_LATEST_VERSION == "pypi": + return latest_versions.get("pypi") -def _sorted_tags(tags: list[str]) -> list[str]: - sorted_tags_as_tuples = sorted( - [tuple([ int(i) for i in tag.replace("v", "").split(".") ]) for tag in tags], - reverse=True, - ) - return [".".join([str(i) for i in tag_tuple]) for tag_tuple in sorted_tags_as_tuples] + return latest_versions.get("docker", {}).get(settings.DOCKER_HUB_REPOSITORY) + except: + return None diff --git a/testgen/settings.py b/testgen/settings.py index 2d2c91c7..07f044fa 100644 --- a/testgen/settings.py +++ b/testgen/settings.py @@ -408,17 +408,12 @@ defaults to: `default` """ -CHECK_FOR_LATEST_VERSION: typing.Literal["pypi", "docker", "no"] = typing.cast( - typing.Literal["pypi", "docker", "no"], - os.getenv("TG_RELEASE_CHECK", os.getenv("TG_DOCKER_RELEASE_CHECK_ENABLED", "pypi")).lower(), +CHECK_FOR_LATEST_VERSION: typing.Literal["pypi", "docker"] = typing.cast( + typing.Literal["pypi", "docker"], + os.getenv("TG_RELEASE_CHECK", "pypi").lower(), ) """ -When set to, enables calling Docker 
Hub API to fetch the latest released -image tag. The fetched tag is displayed in the UI menu. - -from env variable: `TG_DOCKER_RELEASE_CHECK_ENABLED` -choices: `pypi`, `docker`, `no` -defaults to: `pypi` +Specifies whether the latest version check should be based on PyPI or DockerHub. """ DOCKER_HUB_REPOSITORY: str = os.getenv( @@ -429,35 +424,16 @@ URL to the docker hub repository containing the dataops testgen image. Used to check for new releases when `CHECK_FOR_LATEST_VERSION` is set to `docker`. - -from env variable: `TESTGEN_DOCKER_HUB_URL` -defaults to: datakitchen/dataops-testgen -""" - -DOCKER_HUB_USERNAME: str | None = os.getenv("TESTGEN_DOCKER_HUB_USERNAME", None) """ -Username to authenticate against Docker Hub API before fetching the list -of tags. Required if `DOCKER_HUB_REPOSITORY` is a private repository. -from env variable: `TESTGEN_DOCKER_HUB_USERNAME` -defaults to: None +VERSION: str = os.getenv("TESTGEN_VERSION", None) """ - -DOCKER_HUB_PASSWORD: str | None = os.getenv("TESTGEN_DOCKER_HUB_PASSWORD", None) -""" -Password to authenticate against Docker Hub API before fetching the list -of tags. Required if `DOCKER_HUB_REPOSITORY` is a private repository. - -from env variable: `TESTGEN_DOCKER_HUB_PASSWORD` -defaults to: None +Current deployed version. The value is displayed in the UI menu. """ -VERSION: str = os.getenv("TESTGEN_VERSION", "unknown") +SUPPORT_EMAIL: str = os.getenv("TESTGEN_SUPPORT_EMAIL", "open-source-support@datakitchen.io") """ -Current deployed version. The value is displayed in the UI menu. - -from env variable: `TESTGEN_VERSION` -defaults to: `unknown` +Email for contacting DataKitchen support. 
""" SSL_CERT_FILE: str = os.getenv("SSL_CERT_FILE", "") @@ -494,3 +470,8 @@ """ Random key used to sign/verify the authentication token """ + +ISSUE_REPORT_SOURCE_DATA_LOOKUP_LIMIT: int = os.getenv("TG_ISSUE_REPORT_SOURCE_DATA_LOOKUP_LIMIT", 50) +""" +Limit the number of records used to generate the PDF with test results and hygiene issue reports. +""" diff --git a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql index 897cb7c5..2caa893a 100644 --- a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql +++ b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql @@ -186,8 +186,10 @@ CREATE TABLE test_definitions ( baseline_sum VARCHAR(1000), baseline_avg VARCHAR(1000), baseline_sd VARCHAR(1000), + lower_tolerance VARCHAR(1000), + upper_tolerance VARCHAR(1000), subset_condition VARCHAR(500), - groupby_names VARCHAR(200), + groupby_names VARCHAR, having_condition VARCHAR(500), window_date_column VARCHAR(100), window_days INTEGER, @@ -195,7 +197,7 @@ CREATE TABLE test_definitions ( match_table_name VARCHAR(100), match_column_names VARCHAR(200), match_subset_condition VARCHAR(500), - match_groupby_names VARCHAR(200), + match_groupby_names VARCHAR, match_having_condition VARCHAR(500), test_mode VARCHAR(20), custom_query VARCHAR, @@ -255,6 +257,7 @@ CREATE TABLE profile_results ( upper_case_ct BIGINT, lower_case_ct BIGINT, non_alpha_ct BIGINT, + non_printing_ct BIGINT, mixed_case_ct BIGINT GENERATED ALWAYS AS ( value_ct - upper_case_ct - lower_case_ct - non_alpha_ct ) STORED, numeric_ct BIGINT, date_ct BIGINT, @@ -529,7 +532,7 @@ CREATE TABLE test_results ( table_name VARCHAR(100), column_names VARCHAR(500), skip_errors INTEGER, - input_parameters VARCHAR(1000), + input_parameters VARCHAR, result_code INTEGER, severity VARCHAR(10), result_status VARCHAR(10), @@ -894,6 +897,10 @@ CREATE TABLE job_schedules ( CREATE INDEX job_schedules_idx ON job_schedules 
(project_code, key); +CREATE TABLE settings ( + key VARCHAR(50) NOT NULL PRIMARY KEY, + value JSONB NOT NULL +); INSERT INTO tg_revision (component, revision) VALUES ('metadata_db', 0); diff --git a/testgen/template/dbsetup/040_populate_new_schema_project.sql b/testgen/template/dbsetup/040_populate_new_schema_project.sql index 84d4d961..3e8adfdf 100644 --- a/testgen/template/dbsetup/040_populate_new_schema_project.sql +++ b/testgen/template/dbsetup/040_populate_new_schema_project.sql @@ -12,18 +12,18 @@ INSERT INTO connections (project_code, sql_flavor, sql_flavor_code, project_host, project_port, project_user, project_db, connection_name, project_pw_encrypted, http_path, max_threads, max_query_chars) -SELECT '{PROJECT_CODE}' as project_code, - '{SQL_FLAVOR}' as sql_flavor, - '{SQL_FLAVOR}' as sql_flavor_code, - '{PROJECT_HOST}' as project_host, - '{PROJECT_PORT}' as project_port, - '{PROJECT_USER}' as project_user, - '{PROJECT_DB}' as project_db, - '{CONNECTION_NAME}' as connection_name, - '{PROJECT_PW_ENCRYPTED}' as project_pw_encrypted, - '{PROJECT_HTTP_PATH}' as http_path, - '{MAX_THREADS}'::INTEGER as max_threads, - '{MAX_QUERY_CHARS}'::INTEGER as max_query_chars; +SELECT '{PROJECT_CODE}' as project_code, + '{SQL_FLAVOR}' as sql_flavor, + '{SQL_FLAVOR}' as sql_flavor_code, + NULLIF('{PROJECT_HOST}', '') as project_host, + NULLIF('{PROJECT_PORT}', '') as project_port, + NULLIF('{PROJECT_USER}', '') as project_user, + NULLIF('{PROJECT_DB}', '') as project_db, + '{CONNECTION_NAME}' as connection_name, + NULLIF('{PROJECT_PW_ENCRYPTED}', ''::BYTEA) as project_pw_encrypted, + NULLIF('{PROJECT_HTTP_PATH}', '') as http_path, + '{MAX_THREADS}'::INTEGER as max_threads, + '{MAX_QUERY_CHARS}'::INTEGER as max_query_chars; INSERT INTO table_groups (id, project_code, connection_id, table_groups_name, table_group_schema, profiling_table_set, profiling_include_mask, profiling_exclude_mask, diff --git a/testgen/template/dbsetup/050_populate_new_schema_metadata.sql 
b/testgen/template/dbsetup/050_populate_new_schema_metadata.sql index f7dfed09..f0a8b8ab 100644 --- a/testgen/template/dbsetup/050_populate_new_schema_metadata.sql +++ b/testgen/template/dbsetup/050_populate_new_schema_metadata.sql @@ -17,12 +17,12 @@ INSERT INTO profile_anomaly_types VALUES ('1001', 'Suggested_Type', 'Column', 'Suggested Data Type', 'Data stored as text all meets criteria for a more suitable type. ', '(functional_data_type NOT IN (''Boolean'', ''Flag'') ) AND (column_type ILIKE ''%ch ar%'' OR column_type ILIKE ''text'') AND NOT (datatype_suggestion ILIKE ''%char%'' OR datatype_suggestion ILIKE ''text'')', 'p.datatype_suggestion::VARCHAR(200)', 'Likely', 'Consider changing the column data type to tighte n controls over data ingested and to make values more efficient, consistent and suitable for downstream analysis.', NULL, NULL, NULL), - ('1002', 'Non_Standard_Blanks', 'Column', 'Non-Standard Blank Values', 'Values representing missing data may be unexpected or inconsistent. 
Non-standard values may include empty strings as opposed to nulls, dummy entries such as "MISSING" or repeated characters that may have been used to bypass entry requirements, processing artifacts such as "NULL", or spreadsheet artifacts such as "NA", "ERROR".', '(p.filled_value_ct > 0 OR p.zero_length_ct > 0)', '''Filled Values: '' || p.filled_value_ct::VARCHAR || '', Empty String: '' || p.zero_length_ct::VARCHAR || '', Null: '' || p.null_value_ct::VARCHAR || '', Records: '' || p.record_ct::VARCHAR', 'Definite', 'Consider cleansing the column upon ingestion to replace all variants of missing data with a standard designation, like Null.', 'p.filled_value_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '1.0', 'Completeness'), - ('1003', 'Invalid_Zip_USA', 'Column', 'Invalid USA Zip Code Format', 'Some values present do not conform with the expected format of USA Zip Codes.', 'p.functional_data_type = ''ZIP_USA'' AND (p.general_type <> ''A'' OR p.filled_value_ct > 0 OR p.min_length >= 1 AND p.min_length <= 4 OR p.max_length > 10)', 'CASE WHEN p.general_type = ''N'' THEN ''Type: '' || p.column_type || '', '' ELSE '''' END || ''Min Length: '' || p.min_length::VARCHAR || '', Max Length: '' || p.max_length::VARCHAR || '', Dummy Values: '' || p.filled_value_ct::VARCHAR', 'Definite', 'Consider correcting invalid column values or changing them to indicate a missing value if corrections cannot be made.', NULL, '1.0', 'Validity'), + ('1002', 'Non_Standard_Blanks', 'Column', 'Non-Standard Blank Values', 'Values representing missing data may be unexpected or inconsistent. 
Non-standard values may include empty strings as opposed to nulls, dummy entries such as "MISSING" or repeated characters that may have been used to bypass entry requirements, processing artifacts such as "NULL", or spreadsheet artifacts such as "NA", "ERROR".', '(p.filled_value_ct > 0 OR p.zero_length_ct > 0)', '''Dummy Values: '' || p.filled_value_ct::VARCHAR || '', Empty String: '' || p.zero_length_ct::VARCHAR || '', Null: '' || p.null_value_ct::VARCHAR || '', Records: '' || p.record_ct::VARCHAR', 'Definite', 'Consider cleansing the column upon ingestion to replace all variants of missing data with a standard designation, like Null.', 'p.filled_value_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '1.0', 'Completeness'), + ('1003', 'Invalid_Zip_USA', 'Column', 'Invalid USA Zip Code Format', 'Some values present do not conform with the expected format of USA Zip Codes.', 'p.functional_data_type = ''Zip'' AND (p.general_type <> ''A'' OR p.filled_value_ct > 0 OR EXISTS (SELECT 1 FROM UNNEST(STRING_TO_ARRAY(p.top_patterns, '' | '')) WITH ORDINALITY AS u(val, idx) WHERE idx % 2 = 0 AND val NOT IN (''NNNNN'',''NNNNN-NNNN'',''NNNNNNNNN'')))', 'CASE WHEN p.general_type = ''N'' THEN ''Type: '' || p.column_type ELSE '''' END || CASE WHEN p.general_type = ''A'' THEN ''Patterns: '' || (SELECT string_agg(val, '','') FROM UNNEST(STRING_TO_ARRAY(top_patterns, '' | '')) WITH ORDINALITY AS u(val, idx) WHERE idx % 2 = 0) || '', Dummy Values: '' || p.filled_value_ct::VARCHAR ELSE '''' END', 'Definite', 'Consider correcting invalid column values or changing them to indicate a missing value if corrections cannot be made.', NULL, '1.0', 'Validity'), ('1004', 'Multiple_Types_Minor', 'Multi-Col', 'Multiple Data Types per Column Name - Minor', 'Columns with the same name have the same general type across tables, but the types do not exactly match. 
Truncation issues may result if columns are commingled and assumed to be the same format.', 'm.general_type_ct = 1 AND m.type_ct > 1', '''Found '' || m.column_ct::VARCHAR || '' columns, '' || m.type_ct::VARCHAR(10) || '' types, '' || m.min_type || '' to '' || m.max_type', 'Possible', 'Consider changing the column data types to be fully consistent. This will tighten your standards at ingestion and assure that data is consistent between tables.', NULL, NULL, 'Consistency'), ('1005', 'Multiple_Types_Major', 'Multi-Col', 'Multiple Data Types per Column Name - Major', 'Columns with the same name have broadly different types across tables. Differences could be significant enough to cause errors in downstream analysis, extra steps resulting in divergent business logic and inconsistencies in results.', 'm.general_type_ct > 1', '''Found '' || m.column_ct::VARCHAR || '' columns, '' || m.type_ct::VARCHAR(10) || '' types, '' || m.min_type || '' to '' || m.max_type', 'Likely', 'Ideally, you should change the column data types to be fully consistent. If the data is meant to be different, you should change column names so downstream users aren''t led astray.', NULL, NULL, 'Consistency'), ('1006', 'No_Values', 'Column', 'No Column Values Present', 'This column is present in the table, but no values have been ingested or assigned in any records. This could indicate missing data or a processing error. Note that this considers dummy values and zero-length values as missing data. 
', '(p.null_value_ct + p.filled_value_ct + p.zero_length_ct) = p.record_ct', '''Null: '' || p.null_value_ct::VARCHAR(10) || '', Dummy: '' || p.filled_value_ct::VARCHAR(10) || '', Zero Len: '' || p.zero_length_ct::VARCHAR(10)', 'Possible', 'Review your source data, ingestion process, and any processing steps that update this column.', '1.0', '0.33', 'Completeness'), - ('1007', 'Column_Pattern_Mismatch', 'Column', 'Pattern Inconsistency Within Column', 'Alpha-numeric string data within this column conforms to 2-4 different patterns, with 95% matching the first pattern. This could indicate data errors in the remaining values. ', 'p.general_type = ''A'' + ('1007', 'Column_Pattern_Mismatch', 'Column', 'Pattern Inconsistency Within Column', 'Alpha-numeric string data within this column conforms to 2-4 different patterns, with 95% matching the first pattern. This could indicate data errors in the remaining values. ', 'p.general_type = ''A'' AND functional_data_type NOT ILIKE ''Measurement%'' AND p.max_length > 3 AND p.value_ct > (p.numeric_ct + p.filled_value_ct + p.zero_length_ct) AND p.distinct_pattern_ct BETWEEN 2 AND 4 @@ -45,6 +45,7 @@ n controls over data ingested and to make values more efficient, consistent and ('1011', 'Char_Column_Number_Values', 'Column', 'Character Column with Mostly Numeric Values', 'This column is defined as alpha, but more than 95% of its values are numeric. Numbers in alpha columns won''t sort correctly, and might contradict user expectations downstream. 
It''s also possible that more than one type of information is stored in the column, making it harder to retrieve.', 'p.general_type = ''A'' AND p.column_name NOT ILIKE ''%zip%'' AND p.functional_data_type NOT ILIKE ''id%'' + AND p.functional_data_type NOT ILIKE ''Period%'' AND p.value_ct > p.numeric_ct AND p.numeric_ct::NUMERIC > (0.95 * p.value_ct::NUMERIC)', '''Numeric Ct: '' || p.numeric_ct || '' of '' || p.value_ct || '' (Numeric Percent: '' || ROUND(100.0 * p.numeric_ct::NUMERIC(18, 5) / p.value_ct::NUMERIC(18, 5), 2) || '' )''::VARCHAR(200)', 'Likely', 'Review your source data and ingestion process. Consider whether it might be better to store the numeric data in a numeric column. If the alpha data is significant, you could store it in a different column.', 'p.numeric_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.66', 'Validity'), ('1012', 'Char_Column_Date_Values', 'Column', 'Character Column with Mostly Date Values', 'This column is defined as alpha, but more than 95% of its values are dates. Dates in alpha columns might not sort correctly, and might contradict user expectations downstream. It''s also possible that more than one type of information is stored in the column, making it harder to retrieve. 
', 'p.general_type = ''A'' @@ -55,7 +56,7 @@ n controls over data ingested and to make values more efficient, consistent and '' of '' || p.record_ct::VARCHAR(20) || '' blank values: '' || ROUND(100.0 * (p.record_ct - (p.value_ct - p.zero_length_ct - p.filled_value_ct))::NUMERIC(18, 5) / NULLIF(p.value_ct, 0)::NUMERIC(18, 5), 2)::VARCHAR(40) || ''%''', 'Possible', 'Review your source data and follow-up with data owners to determine whether this data needs to be corrected, supplemented or excluded.', '(p.null_value_ct + filled_value_ct + zero_length_ct)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.33', 'Completeness'), - ('1014', 'Small Divergent Value Ct', 'Column', 'Small Percentage of Divergent Values Found', 'Under 3% of values in this column were found to be different from the most common value. This could indicate a data error.', '(100.0 * fn_parsefreq(p.top_freq_values, 1, 2)::FLOAT / + ('1014', 'Small Divergent Value Ct', 'Column', 'Small Percentage of Divergent Values Found', 'Under 3% of values in this column were found to be different from the most common value. This could indicate a data error.', 'functional_data_type <> ''Boolean'' AND (100.0 * fn_parsefreq(p.top_freq_values, 1, 2)::FLOAT / p.value_ct::FLOAT) > 97::FLOAT AND (100.0 * fn_parsefreq(p.top_freq_values, 1, 2)::FLOAT / NULLIF(p.value_ct, 0)::FLOAT) < 100::FLOAT', '''Single Value Pct: '' || ROUND(100.0 * fn_parsefreq(p.top_freq_values, 1, 2)::FLOAT @@ -76,11 +77,11 @@ n controls over data ingested and to make values more efficient, consistent and ('1020', 'Recency_Six_Months', 'Dates', 'Recency - No Table Dates within 6 Months', 'Among all date columns present in the table, the most recent date falls 6 months to 1 year back from Profile date. 
', 'MAX(p.max_date) >= CURRENT_DATE - INTERVAL ''1 year'' AND MAX(p.max_date) < CURRENT_DATE - INTERVAL ''6 months''', '''Most Recent Date: '' || MAX(p.max_date)::VARCHAR', 'Possible', 'Review your source data and follow-up with data owners to determine whether dates in table should be more recent.', NULL, NULL, 'Timeliness'), ('1021', 'Unexpected US States', 'Column', 'Unexpected Column Contains US States', 'This column is not labeled as a state, but contains mostly US State abbreviations. This could indicate shifted or switched source data columns.', 'p.std_pattern_match = ''STATE_USA'' AND p.distinct_value_ct > 5 - AND NOT (p.column_name ILIKE ''%state%'' OR p.column_name ILIKE ''%_st'')', '''Value Range: '' || p.min_text || '' thru '' || max_text || CASE WHEN p.top_freq_values > '''' THEN ''Top Freq Values: '' || REPLACE(p.top_freq_values, CHR(10), '' ; '') ELSE '''' END ', 'Possible', 'Review your source data and follow-up with data owners to determine whether column should be populated with US states.', NULL, '0.33', 'Consistency'), + AND NOT (p.column_name = ''st'' OR p.column_name ILIKE ''%state%'' OR p.column_name ILIKE ''%_st'')', '''Value Range: '' || p.min_text || '' thru '' || max_text || CASE WHEN p.top_freq_values > '''' THEN ''Top Freq Values: '' || REPLACE(p.top_freq_values, CHR(10), '' ; '') ELSE '''' END ', 'Possible', 'Review your source data and follow-up with data owners to determine whether column should be populated with US states.', NULL, '0.33', 'Consistency'), ('1022', 'Unexpected Emails', 'Column', 'Unexpected Column Contains Emails', 'This column is not labeled as email, but contains mostly email addresses. 
This could indicate shifted or switched source data columns.', 'p.std_pattern_match = ''EMAIL'' AND NOT (p.column_name ILIKE ''%email%'' OR p.column_name ILIKE ''%addr%'')', '''Value Range: '' || p.min_text || '' thru '' || max_text', 'Possible', 'Review your source data and follow-up with data owners to determine whether column should be populated with email addresses.', NULL, '0.33', 'Consistency'), - ('1023', 'Small_Numeric_Value_Ct', 'Column', 'Unexpected Numeric Values Found', 'Under 3% of values in this column were found to be numeric. This could indicate a data error.', 'p.general_type = ''A'' - AND p.numeric_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT < 0.03 + ('1023', 'Small_Numeric_Value_Ct', 'Column', 'Unexpected Numeric Values Found', 'A small fraction (under 3%) of values in this column were found to be numeric. They could be erroneous.', 'p.general_type = ''A'' + AND p.numeric_ct::FLOAT/NULLIF(p.value_ct, 0)::FLOAT < 0.03 AND p.numeric_ct > 0', '''Numeric Ct: '' || p.numeric_ct || '' of '' || p.value_ct || '' (Numeric Percent: '' || ROUND(100.0 * p.numeric_ct::NUMERIC(18, 5)/NULLIF(p.value_ct, 0)::NUMERIC(18, 5), 2) || '' )''::VARCHAR(200)', 'Likely', 'Review your source data and follow-up with data owners to determine whether numeric values are invalid entries here.', 'p.numeric_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.66', 'Validity'), ('1024', 'Invalid_Zip3_USA', 'Column', 'Invalid USA ZIP-3 Format', 'The majority of values in this column are 3-digit zips, but divergent patterns were found. 
This could indicate an incorrect roll-up category or a PII concern.', 'p.distinct_pattern_ct > 1 AND (p.column_name ilike ''%zip%'' OR p.column_name ILIKE ''%postal%'') @@ -89,7 +90,11 @@ n controls over data ingested and to make values more efficient, consistent and ('1025', 'Delimited_Data_Embedded', 'Column', 'Delimited Data Embedded in Column', 'Delimited data, separated by a common delimiter (comma, tab, pipe or caret) is present in over 80% of column values. This could indicate data that was incorrectly ingested, or data that would be better represented in parsed form.', 'p.std_pattern_match = ''DELIMITED_DATA''', 'CASE WHEN p.top_freq_values IS NULL THEN ''Min: '' || p.min_text || '', Max: '' || p.max_text ELSE ''Top Freq: '' || p.top_freq_values END', 'Likely', 'Review your source data and follow-up with data consumers to determine the most useful representation of this data.', NULL, '0.66', 'Validity'), ('1026', 'Char_Column_Number_Units', 'Column', 'Character Column with Numbers and Units', 'This column is defined as alpha, but values include numbers with percents or common units. Embedded measures in alpha columns are harder to access, won''t sort correctly, and might contradict user expectations downstream. Consider parsing into numeric and UOM columns to improve usability.', 'p.includes_digit_ct::FLOAT/NULLIF(p.value_ct, 0)::FLOAT > 0.5 AND TRIM(fn_parsefreq(p.top_freq_values, 1, 1)) ~ ''(?i)^[0-9]+(\.[0-9]+)? ?(%|lb|oz|kg|g|mg|km|m|cm|mm|mi|ft|in)$''', '''Top Freq: '' || p.top_freq_values', 'Possible', 'Review your source data and ingestion process. Consider whether it might be better to parse the numeric and unit data and store in separate columns.', NULL, '0.33', 'Consistency'), ('1027', 'Variant_Coded_Values', 'Variant', 'Variant Codings for Same Values', 'This column contains more than one common variants that represent a single value or state. 
This can occur when data is integrated from multiple sources with different standards, or when free entry is permitted without validation. The variations can cause confusion and error for downstream data users and multiple versions of the truth. ', 'p.distinct_value_ct <= 20', '''Variants Found: '' || intersect_list', 'Definite', 'Review your source data and ingestion process. Consider cleansing this data to standardize on a single set of definitive codes.', NULL, NULL, 'Consistency'), - ('1100', 'Potential_PII', 'Column', 'Personally Identifiable Information', 'This column contains data that could be Personally Identifiable Information (PII)', 'p.pii_flag > ''''', '''Risk: '' || CASE LEFT(p.pii_flag, 1) WHEN ''A'' THEN ''HIGH'' WHEN ''B'' THEN ''MODERATE'' WHEN ''C'' THEN ''LOW'' END || '', PII Type: '' || SUBSTRING(p.pii_flag, 3)', 'Potential PII', 'PII may require steps to ensure data security and compliance with relevant privacy regulations and legal requirements. You may have to classify and inventory PII, implement appropriate access controls, encrypt data, and monitor for unauthorized access. Your organization might be required to update privacy policies and train staff on data protection practices. Note that PII that is lower-risk in isolation might be high-risk in conjunction with other data.', NULL, 'CASE LEFT(p.pii_flag, 1) WHEN ''A'' THEN 1 WHEN ''B'' THEN 0.66 WHEN ''C'' THEN 0.33 END', 'Validity') + ('1100', 'Potential_PII', 'Column', 'Personally Identifiable Information', 'This column contains data that could be Personally Identifiable Information (PII)', 'p.pii_flag > ''''', '''Risk: '' || CASE LEFT(p.pii_flag, 1) WHEN ''A'' THEN ''HIGH'' WHEN ''B'' THEN ''MODERATE'' WHEN ''C'' THEN ''LOW'' END || '', PII Type: '' || SUBSTRING(p.pii_flag, 3)', 'Potential PII', 'PII may require steps to ensure data security and compliance with relevant privacy regulations and legal requirements. 
You may have to classify and inventory PII, implement appropriate access controls, encrypt data, and monitor for unauthorized access. Your organization might be required to update privacy policies and train staff on data protection practices. Note that PII that is lower-risk in isolation might be high-risk in conjunction with other data.', NULL, 'CASE LEFT(p.pii_flag, 1) WHEN ''A'' THEN 1 WHEN ''B'' THEN 0.66 WHEN ''C'' THEN 0.33 END', 'Validity'), + ('1028', 'Inconsistent_Casing', 'Column', 'Inconsistent Casing', 'Casing is inconsistent for a column representing an entity name or address elements. Mixed-Case and All-Upper-Case values were found in the same column.', 'mixed_case_ct > 0 AND upper_case_ct > 0 AND functional_data_type IN (''Address'', ''City'', ''Entity Name'', ''Person Given Name'', ''Person Last Name'', ''Person Full Name'')', '''Mixed-Case: '' || p.mixed_case_ct::VARCHAR || '', All-Upper-Case: '' || p.upper_case_ct::VARCHAR || '' for Semantic Data Type: '' || p.functional_data_type || '', Records: '' || p.record_ct::VARCHAR', 'Definite', 'Review your source data and follow-up with data owners to determine whether consistent casing should be applied at the source. 
If source data corrections are not possible, consider standardizing the column upon ingestion to ensure consistent casing.', 'LEAST(p.mixed_case_ct, p.upper_case_ct)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '1.0', 'Validity'), + ('1029', 'Non_Alpha_Name_Address', 'Column', 'Non-Alpha Name or Address', 'Entirely non-alphabetic values were found in a column representing an entity name or address element.', 'non_alpha_ct - zero_length_ct > 0 AND functional_data_type IN (''Address'', ''City'', ''Entity Name'', ''Person Given Name'', ''Person Last Name'', ''Person Full Name'')', '''Non-Alpha Values: '' || (non_alpha_ct - zero_length_ct)::VARCHAR || '', Semantic Type: '' || p.functional_data_type || '', Records: '' || p.record_ct::VARCHAR', 'Definite', 'Non-alphabetic values are highly likely to be invalid for this kind of column. This may indicate a file format change, error in an ingestion process, or incorrect source data. Review your pipeline process and source data to determine the root-cause. If this data accurately reflects source data, and upstream corrections are not possible, consider assigning the processed value to null to reflect that data is missing.', '(non_alpha_ct - zero_length_ct)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '1.0', 'Validity'), + ('1030', 'Non_Alpha_Prefixed_Name', 'Column', 'Non-Alpha Prefixed Name', 'Non-alphabetic characters were found at the start of a column representing an entity name.', 'min_text < ''A'' AND LEFT(min_text, 1) NOT IN (''"'', '' '') AND RIGHT(min_text, 1) <> '''''''' AND functional_data_type IN (''City'', ''Person Given Name'', ''Person Last Name'', ''Person Full Name'')', '''Minimum Value: '' || min_text', 'Definite', 'Values starting with a non-alphabetic character are highly likely to be invalid for this kind of column. This may indicate a file format change, error in an ingestion process, or incorrect source data. 
It could also indicate flagging or coding of some kind that can be broken out in a separate column in processed data. Review your pipeline process and source data to determine the root-cause. If this data accurately reflects source data, and upstream corrections are not possible, consider applying corrections directly to processed data where possible.', '0.25', '1.0', 'Validity'), + ('1031', 'Non_Printing_Chars', 'Column', 'Non-Printing Characters', 'Non-printing characters were found embedded in a text column.', 'non_printing_ct > 0', '''Non-Printing Chars: '' || non_printing_ct::VARCHAR || '', Records: '' || p.record_ct::VARCHAR', 'Definite', 'Embedded non-printing characters are typically stripped from data. They affect filters and aggregations, and may cause problems for downstream users who don''t recognize their presence. Review your source data and follow-up with data owners to determine whether this data can be corrected upstream. If not, strip these characters from processed data.', 'non_printing_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '1.0', 'Validity') ; @@ -97,17 +102,17 @@ TRUNCATE TABLE test_types; INSERT INTO test_types (id, test_type, test_name_short, test_name_long, test_description, except_message, measure_uom, measure_uom_description, selection_criteria, dq_score_prevalence_formula, dq_score_risk_factor, column_name_prompt, column_name_help, default_parm_columns, default_parm_values, default_parm_prompts, default_parm_help, default_severity, run_type, test_scope, dq_dimension, health_dimension, threshold_description, usage_notes, active) -VALUES ('1004', 'Alpha_Trunc', 'Alpha Truncation', 'Maximum character count consistent', 'Tests that the maximum count of characters in a column value has not dropped vs. 
baseline data', 'Maximum length of values has dropped from prior expected length.', 'Values over max', NULL, 'general_type =''A'' AND max_length > 0 AND ( (min_length = avg_length AND max_length = avg_length) OR (numeric_ct <> value_ct ) ) AND functional_table_type NOT LIKE ''%window%'' /* The conditions below are to eliminate overlap with : LOV_Match (excluded selection criteria for this test_type), Pattern_Match (excluded selection criteria for this test_type), Constant (excluded functional_data_type Constant and Boolean) */ AND ( (distinct_value_ct NOT BETWEEN 2 AND 10 AND functional_data_type NOT IN ( ''Constant'', ''Boolean'') ) AND NOT ( fn_charcount(top_patterns, E'' \| '' ) = 1 AND fn_charcount(top_patterns, E'' \| '' ) IS NOT NULL AND REPLACE(SPLIT_PART(top_patterns, ''|'' , 2), ''N'' , '''' ) > ''''))', '{VALUE_CT}::FLOAT * (FN_NORMAL_CDF(({MAX_LENGTH}::FLOAT - {AVG_LENGTH}::FLOAT) / (NULLIF({MAX_LENGTH}::FLOAT, 0) / 3)) - FN_NORMAL_CDF(({RESULT_MEASURE}::FLOAT - {AVG_LENGTH}::FLOAT) / (NULLIF({MAX_LENGTH}::FLOAT, 0) / 3)) ) /NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value', 'max_length', 'Maximum String Length at Baseline', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Maximum length expected', 'Alpha Truncation tests that the longest text value in a column hasn''t become shorter than the longest value at baseline. This could indicate a problem in a cumulative dataset, where prior values should still exist unchanged. A failure here would suggest that some process changed data that you would still expect to be present and matching its value when the column was profiled. 
This test would not be appropriate for an incremental or windowed dataset.', 'Y'), - ('1005', 'Avg_Shift', 'Average Shift', 'Column mean is consistent with reference', 'Tests for statistically-significant shift in mean value for column from average calculated at baseline.', 'Standardized difference between averages is over the selected threshold level.', 'Difference Measure', 'Cohen''s D Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge)', 'general_type=''N'' AND distinct_value_ct > 10 AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%''', '2.0 * (1.0 - fn_normal_cdf(ABS({RESULT_MEASURE}::FLOAT) / 2.0))', '0.75', NULL, NULL, 'baseline_value_ct,baseline_avg,baseline_sd,threshold_value', 'value_ct,avg_value,stdev_value,0.5::VARCHAR', 'Value Ct at Baseline,Mean at Baseline,Std Deviation at Baseline,Threshold Difference Measure ', NULL, 'Warning', 'CAT', 'column', 'Consistency', 'Data Drift', 'Standardized Difference Measure', 'Average Shift tests that the average of a numeric column has not significantly changed since baseline, when profiling was done. A significant shift may indicate errors in processing, differences in source data, or valid changes that may nevertheless impact assumptions in downstream data products. The test uses Cohen''s D, a statistical technique to identify significant shifts in a value. Cohen''s D measures the difference between the two averages, reporting results on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. Depending on your data, some difference may be expected, so it''s reasonable to adjust the threshold value that triggers test failure. This test works well for measures, or even for identifiers if you expect them to increment consistently. You may want to periodically adjust the expected threshold, or even the expected average value if you expect shifting over time. 
Consider this test along with Variability Increase. If variability rises too, process or measurement flaws could be at work. If variability remains consistent, the issue is more likely to be with the source data itself. ', 'Y'), +VALUES ('1004', 'Alpha_Trunc', 'Alpha Truncation', 'Maximum character count consistent', 'Tests that the maximum count of characters in a column value has not dropped vs. baseline data', 'Maximum length of values has dropped from prior expected length.', 'Values over max', NULL, 'general_type =''A'' AND max_length > 0 AND ( (min_length = avg_length AND max_length = avg_length) OR (numeric_ct <> value_ct ) ) AND functional_table_type NOT LIKE ''%window%'' /* The conditions below are to eliminate overlap with : LOV_Match (excluded selection criteria for this test_type), Pattern_Match (excluded selection criteria for this test_type), Constant (excluded functional_data_type Constant and Boolean) */ AND ( (distinct_value_ct NOT BETWEEN 2 AND 10 AND functional_data_type NOT IN ( ''Constant'', ''Boolean'') ) AND NOT ( fn_charcount(top_patterns, E'' \| '' ) = 1 AND fn_charcount(top_patterns, E'' \| '' ) IS NOT NULL AND REPLACE(SPLIT_PART(top_patterns, ''|'' , 2), ''N'' , '''' ) > ''''))', '{VALUE_CT}::FLOAT * (FN_NORMAL_CDF(({MAX_LENGTH}::FLOAT - {AVG_LENGTH}::FLOAT) / (NULLIF({MAX_LENGTH}::FLOAT, 0) / 3)) - FN_NORMAL_CDF(({RESULT_MEASURE}::FLOAT - {AVG_LENGTH}::FLOAT) / (NULLIF({MAX_LENGTH}::FLOAT, 0) / 3)) ) /NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value', 'FLOOR(0.95 * max_length::FLOAT)', 'Maximum String Length at Baseline', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Maximum length expected', 'Alpha Truncation tests that the longest text value in a column hasn''t become shorter than the defined threshold, initially 95% of the longest value at baseline. This could indicate a problem in a cumulative dataset, where prior values should still exist unchanged. 
A failure here would suggest that some process changed data that you would still expect to be present and matching its value when the column was profiled. This test would not be appropriate for an incremental or windowed dataset.', 'Y'), + ('1005', 'Avg_Shift', 'Average Shift', 'Column mean is consistent with reference', 'Tests for statistically-significant shift in mean value for column from average calculated at baseline.', 'Standardized difference between averages is over the selected threshold level.', 'Difference Measure', 'Cohen''s D Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge)', 'general_type=''N'' AND distinct_value_ct > 10 AND functional_data_type ilike ''Measure%'' AND functional_data_type <> ''Measurement Spike'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%''', '2.0 * (1.0 - fn_normal_cdf(ABS({RESULT_MEASURE}::FLOAT) / 2.0))', '0.75', NULL, NULL, 'baseline_value_ct,baseline_avg,baseline_sd,threshold_value', 'value_ct,avg_value,stdev_value,0.5::VARCHAR', 'Value Ct at Baseline,Mean at Baseline,Std Deviation at Baseline,Threshold Difference Measure ', NULL, 'Warning', 'CAT', 'column', 'Consistency', 'Data Drift', 'Standardized Difference Measure', 'Average Shift tests that the average of a numeric column has not significantly changed since baseline, when profiling was done. A significant shift may indicate errors in processing, differences in source data, or valid changes that may nevertheless impact assumptions in downstream data products. The test uses Cohen''s D, a statistical technique to identify significant shifts in a value. Cohen''s D measures the difference between the two averages, reporting results on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. Depending on your data, some difference may be expected, so it''s reasonable to adjust the threshold value that triggers test failure. 
This test works well for measures, or even for identifiers if you expect them to increment consistently. You may want to periodically adjust the expected threshold, or even the expected average value if you expect shifting over time. Consider this test along with Variability Increase. If variability rises too, process or measurement flaws could be at work. If variability remains consistent, the issue is more likely to be with the source data itself. ', 'Y'), ('1007', 'Constant', 'Constant Match', 'All column values match constant value', 'Tests that all values in the column match the constant value identified in baseline data', 'A constant value is expected for this column.', 'Mismatched values', NULL, 'TEMPLATE', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'baseline_value,threshold_value', NULL, 'Constant Value at Baseline,Threshold Error Count', 'The single, unchanging value of the column, per baseline|The number of errors that are acceptable before test fails.', 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Count of records with unexpected values', 'Constant Match tests that a single value determined to be a constant in baseline profiling is still the only value for the column that appears in subsequent versions of the dataset. Sometimes new data or business knowledge may reveal that the value is not a constant at all, even though only one value was present at profiling. In this case, you will want to disable this test. 
Alternatively, you can use the Value Match test to provide a limited number of valid values for the column.', 'Y'), ('1009', 'Daily_Record_Ct', 'Daily Records', 'All dates present within date range', 'Tests for presence of every calendar date within min/max date range, per baseline data', 'Not every date value between min and max dates is present, unlike at baseline.', 'Missing dates', NULL, 'general_type= ''D'' AND date_days_present > 21 AND date_days_present - (DATEDIFF(''day'', ''1800-01-05''::DATE, max_date) - DATEDIFF(''day'', ''1800-01-05''::DATE, min_date) + 1) = 0 AND future_date_ct::FLOAT / NULLIF(value_ct, 0) <= 0.75', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT*{PRO_RECORD_CT}::FLOAT/NULLIF({DATE_DAYS_PRESENT}::FLOAT, 0)/NULLIF({RECORD_CT}::FLOAT, 0)', '0.75', NULL, NULL, 'threshold_value', '0', 'Threshold Missing Calendar Days', NULL, 'Warning', 'CAT', 'column', 'Completeness', 'Volume', 'Missing calendar days within min/max range', 'Daily Records tests that at least one record is present for every day within the minimum and maximum date range for the column. The test is relevant for transactional data, where you would expect at least one transaction to be recorded each day. A failure here would suggest missing records for the number of days identified without data. You can adjust the threshold to accept a number of days that you know legitimately have no records. 
', 'Y'), - ('1011', 'Dec_Trunc', 'Decimal Truncation', 'Sum of fractional values at or above reference', 'Tests for decimal truncation by confirming that the sum of fractional values in data is no less than the sum at baseline', 'The sum of fractional values is under baseline, which may indicate decimal truncation', 'Fractional sum', 'The sum of all decimal values from all data for this column', 'fractional_sum IS NOT NULL AND functional_table_type LIKE''%cumulative%''', '1', '1.0', NULL, NULL, 'threshold_value', 'ROUND(fractional_sum, 0)', 'Sum of Fractional Values at Baseline', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Minimum expected sum of all fractional values', 'Decimal Truncation tests that the fractional (decimal) part of a numeric column has not been truncated since Baseline. This works by summing all the fractional values after the decimal point and confirming that the total is at least equal to the fractional total at baseline. This could indicate a problem in a cumulative dataset, where prior values should still exist unchanged. A failure here would suggest that some process changed data that you would still expect to be present and matching its value when the column was profiled. 
This test would not be appropriate for an incremental or windowed dataset.', 'Y'), + ('1011', 'Dec_Trunc', 'Decimal Truncation', 'Sum of fractional values at or above reference', 'Tests for decimal truncation by confirming that the sum of fractional values in data is no less than the sum at baseline', 'The sum of fractional values is under baseline, which may indicate decimal truncation', 'Fractional sum', 'The sum of all decimal values from all data for this column', 'fractional_sum > 0 AND functional_table_type LIKE''%cumulative%''', '1', '1.0', NULL, NULL, 'threshold_value', 'ROUND(fractional_sum, 0)', 'Sum of Fractional Values at Baseline', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Minimum expected sum of all fractional values', 'Decimal Truncation tests that the fractional (decimal) part of a numeric column has not been truncated since Baseline. This works by summing all the fractional values after the decimal point and confirming that the total is at least equal to the fractional total at baseline. This could indicate a problem in a cumulative dataset, where prior values should still exist unchanged. A failure here would suggest that some process changed data that you would still expect to be present and matching its value when the column was profiled. This test would not be appropriate for an incremental or windowed dataset.', 'Y'), ('1012', 'Distinct_Date_Ct', 'Date Count', 'Count of distinct dates at or above reference', 'Tests that the count of distinct dates referenced in the column has not dropped vs. 
baseline data', 'Drop in count of unique dates recorded in column.', 'Unique dates', 'Count of unique dates in transactional date column', 'functional_data_type ILIKE ''Transactional Date%'' AND date_days_present > 1 AND functional_table_type ILIKE ''%cumulative%''', '(({RECORD_CT}-{PRO_RECORD_CT})::FLOAT*{DISTINCT_VALUE_CT}::FLOAT/NULLIF({PRO_RECORD_CT}::FLOAT, 0))/NULLIF({PRO_RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'baseline_value,threshold_value', 'date_days_present,date_days_present', 'Distinct Date Count at Baseline,Min Expected Date Count', NULL, 'Fail', 'CAT', 'column', 'Timeliness', 'Recency', 'Minimum distinct date count expected', 'Date Count tests that the count of distinct dates present in the column has not dropped since baseline. The test is relevant for cumulative datasets, where old records are retained. A failure here would indicate missing records, which could be caused by a processing error or changed upstream data sources.', 'Y'), ('1013', 'Distinct_Value_Ct', 'Value Count', 'Count of distinct values has not dropped', 'Tests that the count of unique values in the column has not changed from baseline.', 'Count of unique values in column has changed from baseline.', 'Unique Values', NULL, 'distinct_value_ct between 2 and 10 AND value_ct > 50 AND functional_data_type IN (''Code'', ''Category'', ''Attribute'', ''Description'') AND NOT coalesce(top_freq_values,'''') > ''''', 'ABS({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT*{PRO_RECORD_CT}::FLOAT/NULLIF({DISTINCT_VALUE_CT}::FLOAT, 0)/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'baseline_value_ct,threshold_value', 'distinct_value_ct,distinct_value_ct', 'Distinct Value Count at Baseline,Min Expected Value Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected distinct value count', 'Value Count tests that the count of unique values present in the column has not dropped since baseline. 
The test is relevant for cumulative datasets, where old records are retained, or for any dataset where you would expect a set number of distinct values should be present. A failure here would indicate missing records or a change in categories or value assignment.', 'Y'), ('1014', 'Email_Format', 'Email Format', 'Email is correctly formatted', 'Tests that non-blank, non-empty email addresses match the standard format', 'Invalid email address formats found.', 'Invalid emails', 'Number of emails that do not match standard format', 'std_pattern_match=''EMAIL''', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value', '0', 'Maximum Invalid Email Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of invalid email addresses', NULL, 'Y'), ('1015', 'Future_Date', 'Past Dates', 'Latest date is prior to test run date', 'Tests that the maximum date referenced in the column is no greater than the test date, consistent with baseline data', 'Future date found when absent in baseline data.', 'Future dates', NULL, 'general_type=''D''AND future_date_ct = 0', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value', '0', 'Maximum Future Date Count', NULL, 'Fail', 'CAT', 'column', 'Timeliness', 'Recency', 'Expected count of future dates', NULL, 'Y'), ('1016', 'Future_Date_1Y', 'Future Year', 'Future dates within year of test run date', 'Tests that the maximum date referenced in the column is no greater than one year beyond the test date, consistent with baseline data', 'Future date beyond one-year found when absent in baseline.', 'Future dates post 1 year', NULL, 'general_type=''D''AND future_date_ct > 0 AND max_date <=''{AS_OF_DATE}''::DATE + INTERVAL''365 DAYS''', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value', '0', 'Maximum Post 1-Year Future Date Count', NULL, 'Fail', 
'CAT', 'column', 'Timeliness', 'Recency', 'Expected count of future dates beyond one year', 'Future Year looks for date values in the column that extend beyond one year after the test date. This would be appropriate for transactional dates where you would expect to find dates in the near future, but not beyond one year ahead. Errors could indicate invalid entries or possibly dummy dates representing blank values.', 'Y'), - ('1017', 'Incr_Avg_Shift', 'New Shift', 'New record mean is consistent with reference', 'Tests for statistically-significant shift in mean of new values for column compared to average calculated at baseline.', 'Significant shift in average of new values vs. baseline avg', 'Z-score of mean shift', 'Absolute Z-score (number of SD''s outside mean) of prior avg - incremental avg', 'general_type=''N'' AND distinct_value_ct > 10 AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%''', '{RECORD_CT}::FLOAT*(1-FN_NORMAL_CDF({RESULT_MEASURE}::FLOAT))/NULLIF({RECORD_CT}::FLOAT, 0)', '0.75', NULL, NULL, 'baseline_value_ct,baseline_sum,baseline_avg,baseline_sd,threshold_value', 'value_ct,(avg_value * value_ct)::FLOAT,avg_value,stdev_value,2', 'Value Count at Baseline,Sum at Baseline,Mean Value at Baseline,Std Deviation at Baseline,Threshold Max Z-Score', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Maximum Z-Score (number of SD''s beyond mean) expected', 'This is a more sensitive test than Average Shift, because it calculates an incremental difference in the average of new values compared to the average of values at baseline. This is appropriate for a cumulative dataset only, because it calculates the average of new entries based on the assumption that the count and average of records present at baseline are still present at the time of the test. This test compares the mean of new values with the standard deviation of the baseline average to calculate a Z-score. 
If the new mean falls outside the Z-score threshold, a shift is detected. Potential Z-score thresholds may range from 0 to 3, depending on the sensitivity you prefer. A failed test could indicate a quality issue or a legitimate shift in new data that should be noted and assessed by business users. Consider this test along with Variability Increase. If variability rises too, process, methodology or measurement flaws could be at issue. If variability remains consistent, the problem is more likely to be with the source data itself.', 'Y'), + ('1017', 'Incr_Avg_Shift', 'New Shift', 'New record mean is consistent with reference', 'Tests for statistically-significant shift in mean of new values for column compared to average calculated at baseline.', 'Significant shift in average of new values vs. baseline avg', 'Z-score of mean shift', 'Absolute Z-score (number of SD''s outside mean) of prior avg - incremental avg', 'general_type=''N'' AND distinct_value_ct > 10 AND functional_data_type ilike ''Measure%'' AND functional_data_type <> ''Measurement Spike'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%''', '{RECORD_CT}::FLOAT*(1-FN_NORMAL_CDF({RESULT_MEASURE}::FLOAT))/NULLIF({RECORD_CT}::FLOAT, 0)', '0.75', NULL, NULL, 'baseline_value_ct,baseline_sum,baseline_avg,baseline_sd,threshold_value', 'value_ct,(avg_value * value_ct)::FLOAT,avg_value,stdev_value,2', 'Value Count at Baseline,Sum at Baseline,Mean Value at Baseline,Std Deviation at Baseline,Threshold Max Z-Score', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Maximum Z-Score (number of SD''s beyond mean) expected', 'This is a more sensitive test than Average Shift, because it calculates an incremental difference in the average of new values compared to the average of values at baseline. 
This is appropriate for a cumulative dataset only, because it calculates the average of new entries based on the assumption that the count and average of records present at baseline are still present at the time of the test. This test compares the mean of new values with the standard deviation of the baseline average to calculate a Z-score. If the new mean falls outside the Z-score threshold, a shift is detected. Potential Z-score thresholds may range from 0 to 3, depending on the sensitivity you prefer. A failed test could indicate a quality issue or a legitimate shift in new data that should be noted and assessed by business users. Consider this test along with Variability Increase. If variability rises too, process, methodology or measurement flaws could be at issue. If variability remains consistent, the problem is more likely to be with the source data itself.', 'Y'), ('1018', 'LOV_All', 'Value Match All', 'List of expected values all present in column', 'Tests that all values match a pipe-delimited list of expected values and that all expected values are present', 'Column values found don''t exactly match the expected list of values', 'Values found', NULL, NULL, '1', '1.0', NULL, NULL, 'threshold_value', NULL, 'List of Expected Values', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'List of values expected, in form (''Val1'',''Val2)', 'This is a more restrictive form of Value Match, testing that all values in the dataset match the list provided, and also that all values present in the list appear at least once in the dataset. 
This would be appropriate for tables where all category values in the column are represented at least once.', 'Y'), ('1019', 'LOV_Match', 'Value Match', 'All column values present in expected list', 'Tests that all values in the column match the list-of-values identified in baseline data.', 'Values not matching expected List-of-Values from baseline.', 'Non-matching records', NULL, 'functional_data_type IN (''Boolean'', ''Code'', ''Category'') AND top_freq_values > '''' AND distinct_value_ct BETWEEN 2 and 10 AND value_ct > 5', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'baseline_value,threshold_value', '''('' || SUBSTRING( CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 2) > '''' THEN '','''''' || TRIM( REPLACE ( SPLIT_PART(top_freq_values, ''|'' , 2), '''''''' , '''''''''''' ) ) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 4) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 4), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 6) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 6), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 8) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 8), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 10) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 10), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 12) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 12), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 14) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 14), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN 
SPLIT_PART(top_freq_values, ''|'' , 16) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 16), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 18) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 18), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 20) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 20), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END, 2, 999) || '')'',0', 'List of Expected Values,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'List of values expected, in form (''Val1'',''Val2)', 'This tests that all values in the column match the hard-coded list provided. This is relevant when the list of allowable values is small and not expected to change often. Even if new values might occasionally be added, this test is useful for downstream data products to provide warning that assumptions and logic may need to change.', 'Y'), ('1020', 'Min_Date', 'Minimum Date', 'All dates on or after set minimum', 'Tests that the earliest date referenced in the column is no earlier than baseline data', 'The earliest date value found is before the earliest value at baseline.', 'Dates prior to limit', NULL, 'general_type=''D''and min_date IS NOT NULL AND distinct_value_ct > 1', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'baseline_value,threshold_value', 'min_date,0', 'Minimum Date at Baseline,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of dates prior to minimum', 'This test is appropriate for a cumulative dataset only, because it assumes all prior values are still present. 
It''s appropriate where new records are added with more recent dates, but old dates dates do not change.', 'Y'), @@ -118,14 +123,14 @@ VALUES ('1004', 'Alpha_Trunc', 'Alpha Truncation', 'Maximum character count con ('1025', 'Outlier_Pct_Below', 'Outliers Below', 'Consistent outlier counts under 2 SD below mean', 'Tests that percent of outliers over 2 SD below Mean doesn''t exceed threshold', 'Percent of outliers exceeding 2 SD below the mean is greater than expected threshold.', 'Pct records under limit', NULL, 'functional_data_type = ''Measurement'' AND distinct_value_ct > 30 AND NOT distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct::FLOAT/value_ct::FLOAT > 0.1 AND stdev_value::FLOAT/avg_value::FLOAT > 0.01 AND column_name NOT ILIKE ''%latitude%'' AND column_name NOT ilike ''%longitude%''', 'GREATEST(0, {RESULT_MEASURE}::FLOAT-{THRESHOLD_VALUE}::FLOAT)', '0.75', NULL, NULL, 'baseline_avg,baseline_sd,threshold_value', 'avg_value,stdev_value,0.05', 'Baseline Mean, Baseline Std Deviation, Pct Records over 2 SD', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected maximum pct records over lower 2 SD limit', 'This test counts the number of data points that may be considered as outliers, determined by whether their value exceeds 2 standard deviations below the mean at baseline. Assuming a normal distribution, a small percentage (defaulted to 5%) of outliers is expected. The actual number may vary for different distributions. The expected threshold reflects the maximum percentage of outliers you expect to see. This test uses the baseline mean rather than the mean for the latest dataset to capture systemic shift as well as individual outliers. 
', 'Y'), ('1026', 'Pattern_Match', 'Pattern Match', 'Column values match alpha-numeric pattern', 'Tests that all values in the column match the same alpha-numeric pattern identified in baseline data', 'Alpha values do not match consistent pattern in baseline.', 'Pattern Mismatches', NULL, '(functional_data_type IN (''Attribute'', ''DateTime Stamp'', ''Phone'') OR functional_data_type ILIKE ''ID%'' OR functional_data_type ILIKE ''Period%'') AND fn_charcount(top_patterns, E'' \| '' ) = 1 AND REPLACE(SPLIT_PART(top_patterns, ''|'' , 2), ''N'' , '''' ) > '''' AND distinct_value_ct > 10', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'baseline_value,threshold_value', 'TRIM(REPLACE(REPLACE(REPLACE(REGEXP_REPLACE(SPLIT_PART(top_patterns, '' | '', 2), ''([*+\-%_])'', ''[\1]'', ''g''), ''A'', ''[A-Z]''), ''N'', ''[0-9]''), ''a'', ''[a-z]'')),0', 'Pattern at Baseline,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of pattern mismatches', 'This test is appropriate for character fields that are expected to appear in a consistent format. It uses pattern matching syntax as appropriate for your database: REGEX matching if available, otherwise LIKE expressions. 
The expected threshold is the number of records that fail to match the defined pattern.', 'Y'), ('1028', 'Recency', 'Recency', 'Latest date within expected range of test date', 'Tests that the latest date in column is within a set number of days of the test date', 'Most recent date value not within expected days of test date.', 'Days before test', 'Number of days that most recent date precedes the date of test', 'general_type= ''D'' AND max_date <= run_date AND NOT column_name IN ( ''filedate'' , ''file_date'' ) AND NOT functional_data_type IN (''Future Date'', ''Schedule Date'') AND DATEDIFF( ''DAY'' , max_date, run_date) <= 62', '(ABS({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT*{PRO_RECORD_CT}::FLOAT/(1.0+DATEDIFF(''DAY'', ''{MIN_DATE}'', ''{MAX_DATE}''))::FLOAT)/NULLIF({RECORD_CT}::FLOAT, 0)', '0.75', NULL, NULL, 'threshold_value', 'CASE WHEN DATEDIFF( ''DAY'' , max_date, run_date) <= 3 THEN DATEDIFF(''DAY'', max_date, run_date) + 3 WHEN DATEDIFF(''DAY'', max_date, run_date) <= 7 then DATEDIFF(''DAY'', max_date, run_date) + 7 WHEN DATEDIFF( ''DAY'' , max_date, run_date) <= 31 THEN CEILING( DATEDIFF( ''DAY'' , max_date, run_date)::FLOAT / 7.0) * 7 WHEN DATEDIFF( ''DAY'' , max_date, run_date) > 31 THEN CEILING( DATEDIFF( ''DAY'' , max_date, run_date)::FLOAT / 30.0) * 30 END', 'Threshold Maximum Days before Test', NULL, 'Warning', 'CAT', 'column', 'Timeliness', 'Recency', 'Expected maximum count of days preceding test date', 'This test evaluates recency based on the latest referenced dates in the column. The test is appropriate for transactional dates and timestamps. The test can be especially valuable because timely data deliveries themselves may not assure that the most recent data is present. You can adjust the expected threshold to the maximum number of days that you expect the data to age before the dataset is refreshed. 
', 'Y'), - ('1030', 'Required', 'Required Entry', 'Required non-null value present', 'Tests that a non-null value is present in each record for the column, consistent with baseline data', 'Every record for this column is expected to be filled, but some are missing.', 'Missing values', NULL, 'record_ct = value_ct', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value', '0', 'Threshold Missing Value Count', NULL, 'Fail', 'CAT', 'column', 'Completeness', 'Schema Drift', 'Expected count of missing values', NULL, 'Y'), + ('1030', 'Required', 'Required Entry', 'Required non-null value present', 'Tests that a non-null value is present in each record for the column, consistent with baseline data', 'Every record for this column is expected to be filled, but some are missing.', 'Missing values', NULL, 'record_ct = value_ct AND record_ct > 10', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value', '0', 'Threshold Missing Value Count', NULL, 'Fail', 'CAT', 'column', 'Completeness', 'Schema Drift', 'Expected count of missing values', NULL, 'Y'), ('1033', 'Street_Addr_Pattern', 'Street Address', 'Enough street address entries match defined pattern', 'Tests for percent of records matching standard street address pattern.', 'Percent of values matching standard street address format is under expected threshold.', 'Percent matches', 'Percent of records that match street address pattern', '(std_pattern_match=''STREET_ADDR'') AND (avg_length <> round(avg_length)) AND (avg_embedded_spaces BETWEEN 2 AND 6) AND (avg_length < 35)', '({VALUE_CT}::FLOAT * ({RESULT_MEASURE}::FLOAT - {THRESHOLD_VALUE}::FLOAT)/100.0)/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value', '75', 'Threshold Pct that Match Address Pattern', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected percent of records that match standard street address pattern', 'The 
street address pattern used in this test should match the vast majority of USA addresses. You can adjust the threshold percent of matches based on the results you are getting -- you may well want to tighten it to make the test more sensitive to invalid entries.', 'Y'), ('1034', 'Unique', 'Unique Values', 'Each column value is unique', 'Tests that no values for the column are repeated in multiple records.', 'Column values should be unique per row.', 'Duplicate values', 'Count of non-unique values', 'record_ct > 500 and record_ct = distinct_value_ct and value_ct > 0', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value', '0', 'Threshold Duplicate Value Count', NULL, 'Fail', 'CAT', 'column', 'Uniqueness', 'Schema Drift', 'Expected count of duplicate values', 'This test is ideal when the database itself does not enforce a primary key constraint on the table. It serves as an independent check on uniqueness. If''s also useful when there are a small number of exceptions to uniqueness, which can be reflected in the expected threshold count of duplicates.', 'Y'), - ('1035', 'Unique_Pct', 'Percent Unique', 'Consistent ratio of unique values', 'Tests for statistically-significant shift in percentage of unique values vs. baseline data.', 'Significant shift in percent of unique values vs. 
baseline.', 'Difference measure', 'Cohen''s H Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge)', 'distinct_value_ct > 10', '2.0 * (1.0 - fn_normal_cdf(ABS({RESULT_MEASURE}::FLOAT) / 2.0))', '0.75', NULL, NULL, 'baseline_value_ct,baseline_unique_ct,threshold_value', 'value_ct,distinct_value_ct,0.5', 'Value Count at Baseline,Distinct Value Count at Baseline,Standardized Difference Measure (0 to 1)', NULL, 'Warning', 'CAT', 'column', 'Uniqueness', 'Data Drift', 'Expected maximum Cohen''s H Difference', 'You can think of this as a test of similarity that measures whether the percentage of unique values is consistent with the percentage at baseline. A significant change might indicate duplication or a telling shift in cardinality between entities. The test uses Cohen''s H, a statistical test to identify a significant difference between two ratios. Results are reported on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. You can refine the expected threshold value as you view legitimate results of the measure over time.', 'Y'), + ('1035', 'Unique_Pct', 'Percent Unique', 'Consistent ratio of unique values', 'Tests for statistically-significant shift in percentage of unique values vs. baseline data.', 'Significant shift in percent of unique values vs. 
baseline.', 'Difference measure', 'Cohen''s H Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge)', 'distinct_value_ct > 10 AND functional_data_type NOT ILIKE ''Measurement%''', '2.0 * (1.0 - fn_normal_cdf(ABS({RESULT_MEASURE}::FLOAT) / 2.0))', '0.75', NULL, NULL, 'baseline_value_ct,baseline_unique_ct,threshold_value', 'value_ct,distinct_value_ct,0.5', 'Value Count at Baseline,Distinct Value Count at Baseline,Standardized Difference Measure (0 to 1)', NULL, 'Warning', 'CAT', 'column', 'Uniqueness', 'Data Drift', 'Expected maximum Cohen''s H Difference', 'You can think of this as a test of similarity that measures whether the percentage of unique values is consistent with the percentage at baseline. A significant change might indicate duplication or a telling shift in cardinality between entities. The test uses Cohen''s H, a statistical test to identify a significant difference between two ratios. Results are reported on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. 
You can refine the expected threshold value as you view legitimate results of the measure over time.', 'Y'), ('1036', 'US_State', 'US State', 'Column value is two-letter US state code', 'Tests that the recorded column value is a valid US state.', 'Column Value is not a valid US state.', 'Not US States', 'Values that doo not match 2-character US state abbreviations.', 'general_type= ''A'' AND column_name ILIKE ''%state%'' AND distinct_value_ct < 70 AND max_length = 2', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value', '0', 'Threshold Count not Matching State Abbreviations', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of values that are not US state abbreviations', 'This test validates entries against a fixed list of two-character US state codes and related Armed Forces codes.', 'Y'), ('1037', 'Weekly_Rec_Ct', 'Weekly Records', 'At least one date per week present within date range', 'Tests for presence of at least one date per calendar week within min/max date range, per baseline data', 'At least one date per week expected in min/max date range.', 'Missing weeks', 'Calendar weeks without date values present', 'functional_data_type ILIKE ''Transactional Date%'' AND date_days_present > 1 AND functional_table_type ILIKE ''%cumulative%'' AND date_weeks_present > 3 AND date_weeks_present - (DATEDIFF(''week'', ''1800-01-05''::DATE, max_date) - DATEDIFF(''week'', ''1800-01-05''::DATE, min_date) + 1) = 0 AND future_date_ct::FLOAT / NULLIF(value_ct, 0) <= 0.75', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT*{PRO_RECORD_CT}::FLOAT/NULLIF({DATE_WEEKS_PRESENT}::FLOAT, 0)/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value', '0', 'Threshold Weeks without Dates', NULL, 'Fail', 'CAT', 'column', 'Completeness', 'Volume', 'Expected maximum count of calendar weeks without dates present', 'Weekly Records tests that at least one record is present for every calendar week within 
the minimum and maximum date range for the column. The test is relevant for transactional data, where you would expect at least one transaction to be recorded each week. A failure here would suggest missing records for the number of weeks identified without data. You can adjust the threshold to accept a number of weeks that you know legitimately have no records.', 'Y'), - ('1040', 'Variability_Increase', 'Variability Increase', 'Variability has increased above threshold', 'Tests that the spread or dispersion of column values has increased significantly over baseline, indicating a drop in stability of the measure.', 'The Standard Deviation of the measure has increased beyond the defined threshold. This could signal a change in a process or a data quality issue.', 'Pct SD shift', 'Percent of baseline Standard Deviation', 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', '1', '0.75', NULL, NULL, 'baseline_sd,threshold_value', 'stdev_value,120', 'Std Deviation at Baseline,Expected Maximum Percent', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected maximum pct of baseline Standard Deviation (SD)', 'This test looks for percent shifts in standard deviation as a measure of the stability of a measure over time. A significant change could indicate that new values are erroneous, or that the cohort being evaluated is significantly different from baseline. An increase in particular could mark new problems in measurement, a more heterogeneous cohort, or that significant outliers have been introduced. Consider this test along with Average Shift and New Shift. 
If the average shifts as well, there may be a fundamental shift in the dataset or process used to collect the data point. This might suggest a data shift that should be noted and assessed by business users. If the average does not shift, this may point to a data quality or data collection problem. ', 'Y'), - ('1041', 'Variability_Decrease', 'Variability Decrease', 'Variability has decreased below threshold', 'Tests that the spread or dispersion of column values has decreased significantly over baseline, indicating a shift in stability of the measure. This could signal a change in a process or a data quality issue.', 'The Standard Deviation of the measure has decreased below the defined threshold. This could signal a change in a process or a data quality issue.', 'Pct SD shift', 'Percent of baseline Standard Deviation', 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', '1', '0.75', NULL, NULL, 'baseline_sd,threshold_value', 'stdev_value, 80', 'Std Deviation at Baseline,Expected Minimum Percent', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected minimum pct of baseline Standard Deviation (SD)', 'This test looks for percent shifts in standard deviation as a measure of the stability of a measure over time. A significant change could indicate that new values are erroneous, or that the cohort being evaluated is significantly different from baseline. A decrease in particular could indicate an improved process, better precision in measurement, the elimination of outliers, or a more homogeneous cohort. 
', 'Y'), + ('1040', 'Variability_Increase', 'Variability Increase', 'Variability has increased above threshold', 'Tests that the spread or dispersion of column values has increased significantly over baseline, indicating a drop in stability of the measure.', 'The Standard Deviation of the measure has increased beyond the defined threshold. This could signal a change in a process or a data quality issue.', 'Pct SD shift', 'Percent of baseline Standard Deviation', 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND functional_data_type <> ''Measurement Spike'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', '1', '0.75', NULL, NULL, 'baseline_sd,threshold_value', 'stdev_value,120', 'Std Deviation at Baseline,Expected Maximum Percent', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected maximum pct of baseline Standard Deviation (SD)', 'This test looks for percent shifts in standard deviation as a measure of the stability of a measure over time. A significant change could indicate that new values are erroneous, or that the cohort being evaluated is significantly different from baseline. An increase in particular could mark new problems in measurement, a more heterogeneous cohort, or that significant outliers have been introduced. Consider this test along with Average Shift and New Shift. If the average shifts as well, there may be a fundamental shift in the dataset or process used to collect the data point. This might suggest a data shift that should be noted and assessed by business users. If the average does not shift, this may point to a data quality or data collection problem. 
', 'Y'), + ('1041', 'Variability_Decrease', 'Variability Decrease', 'Variability has decreased below threshold', 'Tests that the spread or dispersion of column values has decreased significantly over baseline, indicating a shift in stability of the measure. This could signal a change in a process or a data quality issue.', 'The Standard Deviation of the measure has decreased below the defined threshold. This could signal a change in a process or a data quality issue.', 'Pct SD shift', 'Percent of baseline Standard Deviation', 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND functional_data_type <> ''Measurement Spike'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', '1', '0.75', NULL, NULL, 'baseline_sd,threshold_value', 'stdev_value, 80', 'Std Deviation at Baseline,Expected Minimum Percent', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected minimum pct of baseline Standard Deviation (SD)', 'This test looks for percent shifts in standard deviation as a measure of the stability of a measure over time. A significant change could indicate that new values are erroneous, or that the cohort being evaluated is significantly different from baseline. A decrease in particular could indicate an improved process, better precision in measurement, the elimination of outliers, or a more homogeneous cohort. 
', 'Y'), ('1042', 'Valid_Month', 'Valid Month', 'Valid calendar month in expected format', 'Tests for the presence of a valid representation of a calendar month consistent with the format at baseline.', 'Column values are not a valid representation of a calendar month consistent with the format at baseline.', 'Invalid months', NULL, 'functional_data_type = ''Period Month''', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value,baseline_value', '0,CASE WHEN max_length > 3 AND initcap(min_text) = min_text THEN ''''''January'''',''''February'''',''''March'''',''''April'''',''''May'''',''''June'''',''''July'''',''''August'''',''''September'''',''''October'''',''''November'''',''''December'''''' WHEN max_length > 3 AND upper(min_text) = min_text THEN ''''''JANUARY'''',''''FEBRUARY'''',''''MARCH'''',''''APRIL'''',''''MAY'''',''''JUNE'''',''''JULY'''',''''AUGUST'''',''''SEPTEMBER'''',''''OCTOBER'''',''''NOVEMBER'''',''''DECEMBER'''''' WHEN max_length > 3 AND lower(min_text) = min_text THEN ''''''january'''',''''february'''',''''march'''',''''april'''',''''may'''',''''june'''',''''july'''',''''august'''',''''september'''',''''october'''',''''november'''',''''december'''''' WHEN max_length = 3 AND initcap(min_text) = min_text THEN ''''''Jan'''',''''Feb'''',''''Mar'''',''''Apr'''',''''May'''',''''Jun'''',''''Jul'''',''''Aug'''',''''Sep'''',''''Oct'''',''''Nov'''',''''Dec'''''' WHEN max_length = 3 AND upper(min_text) = min_text THEN ''''''JAN'''',''''FEB'''',''''MAR'''',''''APR'''',''''MAY'''',''''JUN'''',''''JUL'''',''''AUG'''',''''SEP'''',''''OCT'''',''''NOV'''',''''DEC'''''' WHEN max_length = 3 AND lower(min_text) = min_text THEN ''''''jan'''',''''feb'''',''''mar'''',''''apr'''',''''may'''',''''jun'''',''''jul'''',''''aug'''',''''sep'''',''''oct'''',''''nov'''',''''dec'''''' WHEN max_length = 2 AND min_text = ''01'' THEN 
''''''01'''',''''02'''',''''03'''',''''04'''',''''05'''',''''06'''',''''07'''',''''08'''',''''09'''',''''10'''',''''11'''',''''12'''''' WHEN max_length = 2 AND min_text = ''1'' THEN ''''''1'''',''''2'''',''''3'''',''''4'''',''''5'''',''''6'''',''''7'''',''''8'''',''''9'''',''''10'''',''''11'''',''''12'''''' WHEN min_value = 1 THEN ''1,2,3,4,5,6,7,8,9,10,11,12'' ELSE ''NULL'' END', 'Threshold Invalid Months,Valid Month List', 'The acceptable number of records with invalid months present.|List of valid month values for this field, in quotes if field is numeric, separated by commas.', 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of invalid months', NULL, 'N'), ('1043', 'Valid_Characters', 'Valid Characters', 'Column contains no invalid characters', 'Tests for the presence of non-printing characters, leading spaces, or surrounding quotes.', 'Invalid characters, such as non-printing characters, leading spaces, or surrounding quotes, were found.', 'Invalid records', 'Expected count of values with invalid characters', 'general_type = ''A''', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '0.75', NULL, NULL, 'threshold_value', '0', NULL, 'The acceptable number of records with invalid character values present.', 'Warning', 'CAT', 'column', 'Validity', 'Schema Drift', 'Threshold Invalid Value Count', 'This test looks for the presence of non-printing ASCII characters that are considered non-standard in basic text processing. It also identifies leading spaces and values enclosed in quotes. 
Values that fail this test may be artifacts of data conversion, or just more difficult to process or analyze downstream.', 'N'), ('1044', 'Valid_US_Zip', 'Valid US Zip', 'Valid USA Postal Codes', 'Tests that postal codes match the 5 or 9 digit standard US format', 'Invalid US Zip Code formats found.', 'Invalid Zip Codes', 'Expected count of values with invalid Zip Codes', 'functional_data_type = ''Zip''', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '0.75', NULL, NULL, 'threshold_value', '0', NULL, NULL, 'Warning', 'CAT', 'column', 'Validity', 'Schema Drift', 'Threshold Invalid Value Count', NULL, 'Y'), @@ -141,12 +146,11 @@ VALUES ('1004', 'Alpha_Trunc', 'Alpha Truncation', 'Maximum character count con ('1501', 'Aggregate_Minimum', 'Aggregate Minimum', 'Aggregate values per group are at or above reference', 'Tests that aggregate values for each set of column values are at least the same as reference dataset', 'Aggregate measure per set of column values is not at least the same as reference dataset.', 'Mismatched measures', NULL, NULL, '1', '1.0', 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_schema_name,match_table_name,match_column_names,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'Record Subset Condition,Grouping Columns,Group Subset Condition,Matching Schema Name,Matching Table Name,Matching Aggregate Expression,Matching Record Subset Condition,Matching Grouping Columns,Matching Group Subset Condition', 'Condition defining a subset of records in main table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in main table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in main table (e.g. 
HAVING clause) - OPTIONAL|Schema location of reference table|Reference table name|Aggregate column expression in reference table (e.g. `SUM(sales)`)|Condition defining a subset of records in reference table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in reference table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in reference table (e.g. HAVING clause) - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Accuracy', 'Data Drift', 'Expected count of group totals below aggregate value', 'This test compares sums or counts of a column rolled up to one or more category combinations, but requires a match or increase in the aggregate value, rather than an exact match, across two different tables. Both tables must be accessible at the same time. Use this to confirm that aggregate values have not dropped for any set of categories, even if some values may rise. This test is useful to compare an older and newer version of a cumulative dataset. An error here means that one or more values per category set fail to match or exceed the prior dataset. New categories or combinations are allowed (but can be restricted independently with a Combo_Match test). Both tables must be present to run this test.', 'Y'), ('1502', 'Combo_Match', 'Reference Match', 'Column values or combinations found in reference', 'Tests for the presence of one or a set of column values in a reference table', 'Column value combinations are not found in reference table values.', 'Missing values', NULL, NULL, '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. \n\nDo not use continuous mesurements here. 
Do not use numeric values unless they represent discrete categories.', 'subset_condition,having_condition,match_schema_name,match_table_name,match_groupby_names,match_subset_condition,match_having_condition', NULL, 'Record Subset Condition,Group Subset Condition,Reference Schema Name,Reference Table Name,Matching Columns,Matching Record Subset Condition,Matching Group Subset Condition', 'Condition defining a subset of records in main table to evaluate, written like a condition within a SQL WHERE clause - OPTIONAL|Condition based on aggregate expression used to exclude value combinations in source table, written like a condition within a SQL HAVING clause (e.g. `SUM(sales) < 100`) - OPTIONAL|Schema location of matching table|Matching table name|Column Names in reference table used to validate source table values (separated by commas)|Condition defining a subset of records in reference table to match against, written like a condition within a SQL WHERE clause - OPTIONAL|Condition based on aggregate expression used to exclude value combinations in reference table, written like a condition within a SQL HAVING clause (e.g. `SUM(sales) < 100`) - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Validity', 'Schema Drift', 'Expected count of non-matching value combinations', 'This test verifies that values, or combinations of values, that are present in the main table are also found in a reference table. This is a useful test for referential integrity between fact and dimension tables. You can also use it to confirm the validity of a code or category, or of combinations of values that should only be found together within each record, such as product/size/color. An error here means that one or more category combinations in the main table are not found in the reference table. 
Both tables must be present to run this test.', 'Y'), ('1503', 'Distribution_Shift', 'Distribution Shift', 'Probability distribution consistent with reference', 'Tests the closeness of match between two distributions of aggregate measures across combinations of column values, using Jensen-Shannon Divergence test', 'Divergence between two distributions exceeds specified threshold.', 'Divergence level (0-1)', 'Jensen-Shannon Divergence, from 0 (identical distributions), to 1.0 (max divergence)', NULL, '1', '0.75', 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. Do not use continuous mesurements here. Do not use numeric values unless they represent discrete categories.', 'subset_condition,match_schema_name,match_table_name,match_groupby_names,match_subset_condition', NULL, 'Record Subset Condition,Reference Schema Name,Reference Table Name,Matching Columns to Compare,Matching Record Subset Condition', 'Condition defining a subset of records in main table to evaluate, written like a condition within a SQL WHERE clause - OPTIONAL|Schema location of matching table|Matching table name|Column Names in reference table used to compare counts with source table values (separated by commas)|Condition defining a subset of records in reference table to match against, written like a condition within a SQL WHERE clause - OPTIONAL', 'Warning', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected maximum divergence level between 0 and 1', 'This test measures the similarity of two sets of counts per categories, by using their proportional counts as probability distributions. Using Jensen-Shannon divergence, a measure of relative entropy or difference between two distributions, the test assigns a score ranging from 0, meaning that the distributions are identical, to 1, meaning that the distributions are completely unrelated. This test can be used to compare datasets that may not match exactly, but should have similar distributions. 
For example, it is a useful sanity check for data from different sources that you would expect to have a consistent spread, such as shipment of building materials per state and construction projects by state. Scores can be compared over time even if the distributions are not identical -- a dataset can be expected to maintain a comparable divergence score with a reference dataset over time. Both tables must be present to run this test.', 'Y'), + ('1504', 'Aggregate_Balance_Percent', 'Aggregate Balance Percent', 'Aggregate measure per group within percent of reference', 'Tests that aggregate measure for each set of column values fall within a percent range above or below the measure for reference dataset', 'Aggregate measure per set of column values is outside percent range of reference dataset.', 'Mismatched measures', NULL, NULL, '1', '1.0', 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_schema_name,match_table_name,match_column_names,match_subset_condition,match_groupby_names,match_having_condition,lower_tolerance,upper_tolerance', NULL, 'Record Subset Condition,Grouping Columns,Group Subset Condition,Matching Schema Name,Matching Table Name,Matching Aggregate Expression,Matching Record Subset Condition,Matching Grouping Columns,Matching Group Subset Condition,Lower Tolerance Percent,Upper Tolerance Percent', 'Condition defining a subset of records in main table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in main table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in main table (e.g. 
HAVING clause) - OPTIONAL|Schema location of matching table|Matching table name|Aggregate column expression in matching table: one of `SUM([column_name])` or `COUNT([column_name])`|Condition defining a subset of records in matching table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in matching table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in matching table (e.g. HAVING clause) - OPTIONAL|Allowable tolerance below the reference measure expressed as a percent|Allowable tolerance above the reference measure expressed as a percent', 'Fail', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected count of group totals not matching aggregate value', 'This test compares sums or counts of a column rolled up to one or more category combinations across two different tables. Both tables must be accessible at the same time. Use it to confirm that two datasets closely match within the tolerance you set -- that the sum of a measure or count of a value remains sufficiently consistent between categories. You could use this test to compare sales per product within one month to another, when you want to be alerted if the difference for any product falls outside of the range defined as 5% below to 10% above the prior month. An error here means that one or more value combinations fail to match within the set tolerances. 
New categories or combinations will cause failure.', 'Y'), + ('1505', 'Aggregate_Balance_Range', 'Aggregate Balance Range', 'Aggregate measure per group within hard range of reference', 'Tests that aggregate measure for each set of column values fall within a hard range above or below the measure for reference dataset', 'Aggregate measure per set of column values is outside expected range of reference dataset.', 'Mismatched measures', NULL, NULL, '1', '1.0', 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_schema_name,match_table_name,match_column_names,match_subset_condition,match_groupby_names,match_having_condition,lower_tolerance,upper_tolerance', NULL, 'Record Subset Condition,Grouping Columns,Group Subset Condition,Matching Schema Name,Matching Table Name,Matching Aggregate Expression,Matching Record Subset Condition,Matching Grouping Columns,Matching Group Subset Condition,Lower Tolerance Constant,Upper Tolerance Constant', 'Condition defining a subset of records in main table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in main table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in main table (e.g. HAVING clause) - OPTIONAL|Schema location of matching table|Matching table name|Aggregate column expression in matching table: one of `SUM([column_name])` or `COUNT([column_name])`|Condition defining a subset of records in matching table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in matching table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in matching table (e.g. 
HAVING clause) - OPTIONAL|Allowable tolerance below the reference measure expressed as a constant value|Allowable tolerance above the reference measure expressed as a constant value', 'Fail', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected count of group totals not matching aggregate value', 'This test compares sums or counts of a column rolled up to one or more category combinations across two different tables. Both tables must be accessible at the same time. Use it to confirm that two datasets closely match within the tolerances you define as specific values above or below the aggregate measure for the same categories in the reference dataset -- that the sum of a measure or count of a value remains sufficiently consistent between categories. For instance, you can use this test to compare sales per product within one month to another, when you want to be alerted if the difference for any product falls outside of the range defined as 10000 dollars above or below the prior week. An error here means that one or more value combinations fail to match within the set tolerances. New categories or combinations will cause failure.', 'Y'), ('1508', 'Timeframe_Combo_Gain', 'Timeframe No Drops', 'Latest timeframe has at least all value combinations from prior period', 'Tests that column values in most recent time-window include at least same as prior time window', 'Column values in most recent time-window don''t include all values in prior window.', 'Mismatched values', NULL, NULL, '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. Make sure not to use continuous measurements here. Do not use numeric values unless they represent discrete categories.', 'window_date_column,window_days,subset_condition', NULL, 'Date Column for Time Windows,Time Window in Days,Record Subset Condition', 'The date column used to define the time windows. 
This must be a DATE or DATETIME type.|Length in days of the time window. The test will compare the most recent period of days to the prior period of the same duration.|Condition defining a subset of records in main table to evaluate, written like a condition within a SQL WHERE clause - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected count of missing value combinations', 'This test checks a single transactional table to verify that categorical values or combinations that are present in the most recent time window you define include at least all those found in the prior time window of the same duration. Missing values in the latest time window will trigger the test to fail. New values are permitted. Use this test to confirm that codes or categories are not lost across successive time periods in a transactional table.', 'Y'), ('1509', 'Timeframe_Combo_Match', 'Timeframe Match', 'Column value combinations from latest timeframe same as prior period', 'Tests for presence of same column values in most recent time-window vs. prior time window', 'Column values don''t match in most recent time-windows.', 'Mismatched values', NULL, NULL, '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. Do not use continuous measurements here. Do not use numeric values unless they represent discrete categories.', 'window_date_column,window_days,subset_condition', NULL, 'Date Column for Time Windows,Time Window in Days,Record Subset Condition', NULL, 'Fail', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected count of non-matching value combinations', 'This test checks a single transactional table (such as a fact table) to verify that categorical values or combinations that are present in the most recent time window you define match those found in the prior time window of the same duration. 
New or missing values in the latest time window will trigger the test to fail. Use this test to confirm the consistency in the occurrence of codes or categories across successive time periods in a transactional table.', 'Y'), - - ('1504', 'Aggregate_Pct_Above', 'Aggregate Pct Above', 'Aggregate values per group exceed reference', 'Tests that aggregate values for each set of column values exceed values for reference dataset', 'Aggregate measure per set of column values fails to exceed the reference dataset.', 'Mismatched measures', NULL, NULL, '1', '1.0', 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_column_names,match_schema_name,match_table_name,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'TODO Fill in default_parm_prompts match_schema_name,TODO Fill in default_parm_prompts match_table_name,TODO Fill in default_parm_prompts match_column_names,TODO Fill in default_parm_prompts match_subset_condition,TODO Fill in default_parm_prompts match_groupby_names,TODO Fill in default_parm_prompts match_having_condition,TODO Fill in default_parm_prompts subset_condition,TODO Fill in default_parm_prompts groupby_names,TODO Fill in default_parm_prompts having_condition', NULL, 'Fail', 'QUERY', 'referential', 'Accuracy', 'Data Drift', 'Expected count of group totals with not exceeding aggregate measure', NULL, 'N'), - ('1505', 'Aggregate_Pct_Within', 'Aggregate Pct Within', 'Aggregate values per group exceed reference', 'Tests that aggregate values for each set of column values exceed values for reference dataset', 'Aggregate measure per set of column values fails to exceed the reference dataset.', 'Mismatched measures', NULL, NULL, '1', '1.0', 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 
'subset_condition,groupby_names,having_condition,match_column_names,match_schema_name,match_table_name,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'TODO Fill in default_parm_prompts match_schema_name,TODO Fill in default_parm_prompts match_table_name,TODO Fill in default_parm_prompts match_column_names,TODO Fill in default_parm_prompts match_subset_condition,TODO Fill in default_parm_prompts match_groupby_names,TODO Fill in default_parm_prompts match_having_condition,TODO Fill in default_parm_prompts subset_condition,TODO Fill in default_parm_prompts groupby_names,TODO Fill in default_parm_prompts having_condition', NULL, 'Fail', 'QUERY', 'referential', 'Accuracy', 'Data Drift', 'Expected count of group totals with not exceeding aggregate measure', NULL, 'N'), - ('1506', 'Aggregate_Increase', 'Aggregate Increase', 'Aggregate values per group exceed reference', 'Tests that aggregate values for each set of column values exceed values for reference dataset', 'Aggregate measure per set of column values fails to exceed the reference dataset.', 'Mismatched measures', NULL, NULL, '1', '1.0', 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_column_names,match_schema_name,match_table_name,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'TODO Fill in default_parm_prompts match_schema_name,TODO Fill in default_parm_prompts match_table_name,TODO Fill in default_parm_prompts match_column_names,TODO Fill in default_parm_prompts match_subset_condition,TODO Fill in default_parm_prompts match_groupby_names,TODO Fill in default_parm_prompts match_having_condition,TODO Fill in default_parm_prompts subset_condition,TODO Fill in default_parm_prompts groupby_names,TODO Fill in default_parm_prompts having_condition', NULL, 'Fail', 'QUERY', 'referential', 'Accuracy', 'Data Drift', 'Expected count of group 
totals below reference value', NULL, 'N') + ('1510', 'Dupe_Rows', 'Duplicate Rows', 'Rows are not duplicated in table', 'Tests for the absence of duplicate rows based on unique combination of column values', 'Column value combinations are duplicated in the table.', 'Duplicate records', NULL, NULL, '(({RESULT_MEASURE}-{THRESHOLD_VALUE}))::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', 'null', 'null', 'groupby_names', NULL, 'Columns to Compare', 'List of columns in the table that define a duplicate record when the combination of values is repeated on multiple rows', 'Fail', 'QUERY', 'table', 'Uniqueness', 'Schema Drift', 'Expected count of duplicate value combinations', 'This test verifies that combinations of values are not repeated within the table. By default when auto-generated, the test considers all columns to protect against duplication of entire rows. If you know the minimum columns that should constitute a unique record, such as a set of ID''s, you should use those to make the test as sensitive as possible. 
Alternatively, if you know of columns you can always exclude, such as file_date or refresh_snapshot_id, remove them to tighten the test somewhat.', 'Y') ; @@ -171,7 +175,9 @@ VALUES ('2001', 'Combo_Match', 'redshift', 'ex_data_match_generic.sql'), ('2006', 'Aggregate_Balance', 'redshift', 'ex_aggregate_match_same_generic.sql'), ('2007', 'Timeframe_Combo_Gain', 'redshift', 'ex_window_match_no_drops_generic.sql'), ('2008', 'Timeframe_Combo_Match', 'redshift', 'ex_window_match_same_generic.sql'), - ('2009', 'Aggregate_Increase', 'redshift', 'ex_aggregate_match_num_incr_generic.sql'), + ('2009', 'Aggregate_Balance_Percent', 'redshift', 'ex_aggregate_match_percent_generic.sql'), + ('2010', 'Aggregate_Balance_Range', 'redshift', 'ex_aggregate_match_range_generic.sql'), + ('2011', 'Dupe_Rows', 'redshift', 'ex_dupe_rows_generic.sql'), ('2101', 'Combo_Match', 'snowflake', 'ex_data_match_generic.sql'), ('2102', 'Aggregate_Minimum', 'snowflake', 'ex_aggregate_match_no_drops_generic.sql'), @@ -180,7 +186,9 @@ VALUES ('2001', 'Combo_Match', 'redshift', 'ex_data_match_generic.sql'), ('2106', 'Aggregate_Balance', 'snowflake', 'ex_aggregate_match_same_generic.sql'), ('2107', 'Timeframe_Combo_Gain', 'snowflake', 'ex_window_match_no_drops_generic.sql'), ('2108', 'Timeframe_Combo_Match', 'snowflake', 'ex_window_match_same_generic.sql'), - ('2109', 'Aggregate_Increase', 'snowflake', 'ex_aggregate_match_num_incr_generic.sql'), + ('2109', 'Aggregate_Balance_Percent', 'snowflake', 'ex_aggregate_match_percent_generic.sql'), + ('2110', 'Aggregate_Balance_Range', 'snowflake', 'ex_aggregate_match_range_generic.sql'), + ('2111', 'Dupe_Rows', 'snowflake', 'ex_dupe_rows_generic.sql'), ('2201', 'Combo_Match', 'mssql', 'ex_data_match_generic.sql'), ('2202', 'Aggregate_Minimum', 'mssql', 'ex_aggregate_match_no_drops_generic.sql'), @@ -189,7 +197,9 @@ VALUES ('2001', 'Combo_Match', 'redshift', 'ex_data_match_generic.sql'), ('2206', 'Aggregate_Balance', 'mssql', 
'ex_aggregate_match_same_generic.sql'), ('2207', 'Timeframe_Combo_Gain', 'mssql', 'ex_window_match_no_drops_generic.sql'), ('2208', 'Timeframe_Combo_Match', 'mssql', 'ex_window_match_same_generic.sql'), - ('2209', 'Aggregate_Increase', 'mssql', 'ex_aggregate_match_num_incr_generic.sql'), + ('2209', 'Aggregate_Balance_Percent', 'mssql', 'ex_aggregate_match_percent_generic.sql'), + ('2210', 'Aggregate_Balance_Range', 'mssql', 'ex_aggregate_match_range_generic.sql'), + ('2211', 'Dupe_Rows', 'mssql', 'ex_dupe_rows_generic.sql'), ('2301', 'Combo_Match', 'postgresql', 'ex_data_match_generic.sql'), ('2302', 'Aggregate_Minimum', 'postgresql', 'ex_aggregate_match_no_drops_generic.sql'), @@ -198,7 +208,9 @@ VALUES ('2001', 'Combo_Match', 'redshift', 'ex_data_match_generic.sql'), ('2306', 'Aggregate_Balance', 'postgresql', 'ex_aggregate_match_same_generic.sql'), ('2307', 'Timeframe_Combo_Gain', 'postgresql', 'ex_window_match_no_drops_postgresql.sql'), ('2308', 'Timeframe_Combo_Match', 'postgresql', 'ex_window_match_same_postgresql.sql'), - ('2309', 'Aggregate_Increase', 'postgresql', 'ex_aggregate_match_num_incr_generic.sql'), + ('2309', 'Aggregate_Balance_Percent', 'postgresql', 'ex_aggregate_match_percent_generic.sql'), + ('2310', 'Aggregate_Balance_Range', 'postgresql', 'ex_aggregate_match_range_generic.sql'), + ('2311', 'Dupe_Rows', 'postgresql', 'ex_dupe_rows_generic.sql'), ('2401', 'Combo_Match', 'databricks', 'ex_data_match_generic.sql'), ('2402', 'Aggregate_Minimum', 'databricks', 'ex_aggregate_match_no_drops_generic.sql'), @@ -207,7 +219,10 @@ VALUES ('2001', 'Combo_Match', 'redshift', 'ex_data_match_generic.sql'), ('2406', 'Aggregate_Balance', 'databricks', 'ex_aggregate_match_same_generic.sql'), ('2407', 'Timeframe_Combo_Gain', 'databricks', 'ex_window_match_no_drops_databricks.sql'), ('2408', 'Timeframe_Combo_Match', 'databricks', 'ex_window_match_same_databricks.sql'), - ('2409', 'Aggregate_Increase', 'databricks', 'ex_aggregate_match_num_incr_generic.sql'); + 
('2409', 'Aggregate_Balance_Percent', 'databricks', 'ex_aggregate_match_percent_generic.sql'), + ('2410', 'Aggregate_Balance_Range', 'databricks', 'ex_aggregate_match_range_generic.sql'), + ('2411', 'Dupe_Rows', 'databricks', 'ex_dupe_rows_generic.sql') +; TRUNCATE TABLE cat_test_conditions; @@ -217,7 +232,7 @@ VALUES ('1001', 'Alpha_Trunc', 'redshift', 'MAX(LENGTH({COLUMN_NAME}))', '<', ' ('1003', 'Condition_Flag', 'redshift', 'SUM(CASE WHEN {CUSTOM_QUERY} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), ('1004', 'Constant', 'redshift', 'SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), ('1005', 'Daily_Record_Ct', 'redshift', 'DATEDIFF(''DAY'', MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT {COLUMN_NAME})', '>', '{THRESHOLD_VALUE}'), - ('1006', 'Dec_Trunc', 'redshift', 'ROUND(SUM(ABS({COLUMN_NAME})::DECIMAL(18,4) % 1), 0)', '<', '{THRESHOLD_VALUE}'), + ('1006', 'Dec_Trunc', 'redshift', 'SUM(ROUND(ABS(({COLUMN_NAME} % 1)), 5))+1', '<', '{THRESHOLD_VALUE}'), ('1007', 'Distinct_Date_Ct', 'redshift', 'COUNT(DISTINCT {COLUMN_NAME})', '<', '{THRESHOLD_VALUE}'), ('1008', 'Distinct_Value_Ct', 'redshift', 'COUNT(DISTINCT {COLUMN_NAME})', '<>', '{THRESHOLD_VALUE}'), ('1009', 'Email_Format', 'redshift', 'SUM(CASE WHEN {COLUMN_NAME} !~ ''^[A-Za-z0-9._''''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), @@ -227,7 +242,7 @@ VALUES ('1001', 'Alpha_Trunc', 'redshift', 'MAX(LENGTH({COLUMN_NAME}))', '<', ' ('1013', 'LOV_All', 'redshift', 'LISTAGG(DISTINCT {COLUMN_NAME}, ''|'') WITHIN GROUP (ORDER BY {COLUMN_NAME})', '<>', '{THRESHOLD_VALUE}'), ('1014', 'LOV_Match', 'redshift', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), ('1015', 'Min_Date', 'redshift', 'SUM(CASE WHEN {COLUMN_NAME} < ''{BASELINE_VALUE}'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('1016', 'Min_Val', 'redshift', 'SUM(CASE WHEN {COLUMN_NAME} < 
{BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), + ('1016', 'Min_Val', 'redshift', 'SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), ('1017', 'Missing_Pct', 'redshift', 'ABS( 2.0 * ASIN( SQRT( {BASELINE_VALUE_CT}::FLOAT / {BASELINE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( {COLUMN_NAME} )::FLOAT / NULLIF(COUNT(*), 0)::FLOAT )) )', '>=', '{THRESHOLD_VALUE}'), ('1018', 'Monthly_Rec_Ct', 'redshift', '(MAX(DATEDIFF(month, {COLUMN_NAME}, ''{RUN_DATE}''::DATE)) - MIN(DATEDIFF(month, {COLUMN_NAME}, ''{RUN_DATE}''::DATE)) + 1) - COUNT(DISTINCT DATEDIFF(month, {COLUMN_NAME}, ''{RUN_DATE}''::DATE))', '>', '{THRESHOLD_VALUE}'), ('1019', 'Outlier_Pct_Above', 'redshift', 'SUM(CASE WHEN {COLUMN_NAME}::FLOAT > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT', '>', '{THRESHOLD_VALUE}'), @@ -247,7 +262,7 @@ VALUES ('1001', 'Alpha_Trunc', 'redshift', 'MAX(LENGTH({COLUMN_NAME}))', '<', ' ('2003', 'Condition_Flag', 'snowflake', 'SUM(CASE WHEN {CUSTOM_QUERY} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), ('2004', 'Constant', 'snowflake', 'SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), ('2005', 'Daily_Record_Ct', 'snowflake', 'DATEDIFF(day, MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT {COLUMN_NAME})', '<', '{THRESHOLD_VALUE}'), - ('2006', 'Dec_Trunc', 'snowflake', 'ROUND(SUM(ABS({COLUMN_NAME})::DECIMAL(18,4) % 1), 0)', '<', '{THRESHOLD_VALUE}'), + ('2006', 'Dec_Trunc', 'snowflake', 'SUM(ROUND(ABS(({COLUMN_NAME} % 1)), 5))+1', '<', '{THRESHOLD_VALUE}'), ('2007', 'Distinct_Date_Ct', 'snowflake', 'COUNT(DISTINCT {COLUMN_NAME})', '<', '{THRESHOLD_VALUE}'), ('2008', 'Distinct_Value_Ct', 'snowflake', 'COUNT(DISTINCT {COLUMN_NAME})', '<>', '{THRESHOLD_VALUE}'), ('2009', 'Email_Format', 'snowflake', 'SUM(CASE WHEN NOT REGEXP_LIKE({COLUMN_NAME}::VARCHAR, ''^[A-Za-z0-9._''''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$'') THEN 
1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), @@ -257,7 +272,7 @@ VALUES ('1001', 'Alpha_Trunc', 'redshift', 'MAX(LENGTH({COLUMN_NAME}))', '<', ' ('2013', 'LOV_All', 'snowflake', 'LISTAGG(DISTINCT {COLUMN_NAME}, ''|'') WITHIN GROUP (ORDER BY {COLUMN_NAME})', '<>', '{THRESHOLD_VALUE}'), ('2014', 'LOV_Match', 'snowflake', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), ('2015', 'Min_Date', 'snowflake', 'SUM(CASE WHEN {COLUMN_NAME} < ''{BASELINE_VALUE}'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('2016', 'Min_Val', 'snowflake', 'SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), + ('2016', 'Min_Val', 'snowflake', 'SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), ('2017', 'Missing_Pct', 'snowflake', 'ABS( 2.0 * ASIN( SQRT( {BASELINE_VALUE_CT}::FLOAT / {BASELINE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( {COLUMN_NAME} )::FLOAT / NULLIF(COUNT(*), 0)::FLOAT )) )', '>=', '{THRESHOLD_VALUE}'), ('2018', 'Monthly_Rec_Ct', 'snowflake', '(MAX(DATEDIFF(month, {COLUMN_NAME}, ''{RUN_DATE}''::DATE)) - MIN(DATEDIFF(month, {COLUMN_NAME}, ''{RUN_DATE}''::DATE)) + 1) - COUNT(DISTINCT DATEDIFF(month, {COLUMN_NAME}, ''{RUN_DATE}''::DATE))', '>', '{THRESHOLD_VALUE}'), ('2019', 'Outlier_Pct_Above', 'snowflake', 'SUM(CASE WHEN {COLUMN_NAME}::FLOAT > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT', '>', '{THRESHOLD_VALUE}'), @@ -277,7 +292,7 @@ VALUES ('1001', 'Alpha_Trunc', 'redshift', 'MAX(LENGTH({COLUMN_NAME}))', '<', ' ('3003', 'Condition_Flag', 'mssql', 'SUM(CASE WHEN {CUSTOM_QUERY} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), ('3004', 'Constant', 'mssql', 'SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), ('3005', 'Daily_Record_Ct', 'mssql', 'DATEDIFF(day, MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT 
{COLUMN_NAME})', '<', '{THRESHOLD_VALUE}'), - ('3006', 'Dec_Trunc', 'mssql', 'ROUND(SUM(ABS(CAST({COLUMN_NAME} AS DECIMAL(18,4))) % 1), 0)', '<', '{THRESHOLD_VALUE}'), + ('3006', 'Dec_Trunc', 'mssql', 'SUM(ROUND(ABS(({COLUMN_NAME} % 1)), 5))+1', '<', '{THRESHOLD_VALUE}'), ('3007', 'Distinct_Date_Ct', 'mssql', 'COUNT(DISTINCT {COLUMN_NAME})', '<', '{THRESHOLD_VALUE}'), ('3008', 'Distinct_Value_Ct', 'mssql', 'COUNT(DISTINCT {COLUMN_NAME})', '<>', '{THRESHOLD_VALUE}'), ('3009', 'Email_Format', 'mssql', 'SUM(CASE WHEN {COLUMN_NAME} NOT LIKE ''[A-Za-z0-9._''''%+-]%@[A-Za-z0-9.-]%.[A-Za-z][A-Za-z]%'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), @@ -287,7 +302,7 @@ VALUES ('1001', 'Alpha_Trunc', 'redshift', 'MAX(LENGTH({COLUMN_NAME}))', '<', ' ('3013', 'LOV_All', 'mssql', 'STRING_AGG(DISTINCT {COLUMN_NAME}, ''|'') WITHIN GROUP (ORDER BY {COLUMN_NAME})', '<>', '{THRESHOLD_VALUE}'), ('3014', 'LOV_Match', 'mssql', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), ('3015', 'Min_Date', 'mssql', 'SUM(CASE WHEN {COLUMN_NAME} < ''{BASELINE_VALUE}'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('3016', 'Min_Val', 'mssql', 'SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), + ('3016', 'Min_Val', 'mssql', 'SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), ('3017', 'Missing_Pct', 'mssql', 'ABS( 2.0 * ASIN( SQRT( CAST({BASELINE_VALUE_CT} AS FLOAT) / CAST({BASELINE_CT} AS FLOAT) ) ) - 2 * ASIN( SQRT( CAST(COUNT( {COLUMN_NAME} ) AS FLOAT) / CAST(NULLIF(COUNT(*), 0) AS FLOAT) )) )', '>=', '{THRESHOLD_VALUE}'), ('3018', 'Monthly_Rec_Ct', 'mssql', '(MAX(DATEDIFF(month, {COLUMN_NAME}, CAST(''{RUN_DATE}''AS DATE))) - MIN(DATEDIFF(month, {COLUMN_NAME}, CAST(''{RUN_DATE}'' AS DATE))) + 1) - COUNT(DISTINCT DATEDIFF(month, {COLUMN_NAME}, CAST(''{RUN_DATE}''AS DATE)))', '>', '{THRESHOLD_VALUE}'), ('3019', 'Outlier_Pct_Above', 
'mssql', 'CAST(SUM(CASE WHEN CAST({COLUMN_NAME} AS FLOAT) > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END) AS FLOAT) / CAST(COUNT({COLUMN_NAME}) AS FLOAT)', '>', '{THRESHOLD_VALUE}'), @@ -307,7 +322,7 @@ VALUES ('1001', 'Alpha_Trunc', 'redshift', 'MAX(LENGTH({COLUMN_NAME}))', '<', ' ('4003', 'Condition_Flag', 'postgresql', 'SUM(CASE WHEN {CUSTOM_QUERY} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), ('4004', 'Constant', 'postgresql', 'SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), ('4005', 'Daily_Record_Ct', 'postgresql', '<%DATEDIFF_DAY;MIN({COLUMN_NAME});MAX({COLUMN_NAME})%>+1-COUNT(DISTINCT {COLUMN_NAME})', '>', '{THRESHOLD_VALUE}'), - ('4006', 'Dec_Trunc', 'postgresql', 'ROUND(SUM(ABS({COLUMN_NAME})::DECIMAL(18,4) % 1), 0)', '<', '{THRESHOLD_VALUE}'), + ('4006', 'Dec_Trunc', 'postgresql', 'SUM(ROUND(ABS(({COLUMN_NAME} % 1)), 5))+1', '<', '{THRESHOLD_VALUE}'), ('4007', 'Distinct_Date_Ct', 'postgresql', 'COUNT(DISTINCT {COLUMN_NAME})', '<', '{THRESHOLD_VALUE}'), ('4008', 'Distinct_Value_Ct', 'postgresql', 'COUNT(DISTINCT {COLUMN_NAME})', '<>', '{THRESHOLD_VALUE}'), ('4009', 'Email_Format', 'postgresql', 'SUM(CASE WHEN {COLUMN_NAME} !~ ''^[A-Za-z0-9._''''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), @@ -317,7 +332,7 @@ VALUES ('1001', 'Alpha_Trunc', 'redshift', 'MAX(LENGTH({COLUMN_NAME}))', '<', ' ('4013', 'LOV_All', 'postgresql', 'STRING_AGG(DISTINCT {COLUMN_NAME}, ''|'') WITHIN GROUP (ORDER BY {COLUMN_NAME})', '<>', '{THRESHOLD_VALUE}'), ('4014', 'LOV_Match', 'postgresql', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), ('4015', 'Min_Date', 'postgresql', 'SUM(CASE WHEN {COLUMN_NAME} < ''{BASELINE_VALUE}'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('4016', 'Min_Val', 'postgresql', 'SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), + ('4016', 
'Min_Val', 'postgresql', 'SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), ('4017', 'Missing_Pct', 'postgresql', 'ABS( 2.0 * ASIN( SQRT( {BASELINE_VALUE_CT}::FLOAT / {BASELINE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( {COLUMN_NAME} )::FLOAT / NULLIF(COUNT(*), 0)::FLOAT )) )', '>=', '{THRESHOLD_VALUE}'), ('4018', 'Monthly_Rec_Ct', 'postgresql', '(MAX(<%DATEDIFF_MONTH;{COLUMN_NAME};''{RUN_DATE}''::DATE%>) - MIN(<%DATEDIFF_MONTH;{COLUMN_NAME};''{RUN_DATE}''::DATE%>) + 1) - COUNT(DISTINCT <%DATEDIFF_MONTH;{COLUMN_NAME};''{RUN_DATE}''::DATE%>)', '>', '{THRESHOLD_VALUE}'), ('4019', 'Outlier_Pct_Above', 'postgresql', 'SUM(CASE WHEN {COLUMN_NAME}::FLOAT > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT', '>', '{THRESHOLD_VALUE}'), @@ -349,7 +364,7 @@ VALUES ('1001', 'Alpha_Trunc', 'redshift', 'MAX(LENGTH({COLUMN_NAME}))', '<', ' ('5003', 'Condition_Flag', 'trino', 'SUM(CASE WHEN {BASELINE_VALUE} IS NOT NULL THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), ('5004', 'Constant', 'trino', 'SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), ('5005', 'Daily_Record_Ct', 'trino', 'DATE_DIFF(''DAY'', MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT {COLUMN_NAME})', '>', '{THRESHOLD_VALUE}'), - ('5006', 'Dec_Trunc', 'trino', 'ROUND(SUM(ABS(CAST({COLUMN_NAME} AS DECIMAL(18,4))) % 1), 0)', '<', '{THRESHOLD_VALUE}'), + ('5006', 'Dec_Trunc', 'trino', 'SUM(ROUND(ABS(({COLUMN_NAME} % 1)), 5))+1', '<', '{THRESHOLD_VALUE}'), ('5007', 'Distinct_Date_Ct', 'trino', 'COUNT(DISTINCT {COLUMN_NAME})', '<', '{THRESHOLD_VALUE}'), ('5008', 'Distinct_Value_Ct', 'trino', 'COUNT(DISTINCT {COLUMN_NAME})', '<>', '{THRESHOLD_VALUE}'), ('5009', 'Email_Format', 'trino', 'SUM(CASE WHEN REGEXP_LIKE({COLUMN_NAME} , ''^[A-Za-z0-9._''''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$'') != TRUE THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), @@ -359,7 +374,7 @@ VALUES 
('1001', 'Alpha_Trunc', 'redshift', 'MAX(LENGTH({COLUMN_NAME}))', '<', ' ('5013', 'LOV_All', 'trino', 'LISTAGG(DISTINCT {COLUMN_NAME}, ''|'') WITHIN GROUP (ORDER BY {COLUMN_NAME})', '<>', '{THRESHOLD_VALUE}'), ('5014', 'LOV_Match', 'trino', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), ('5015', 'Min_Date', 'trino', 'SUM(CASE WHEN {COLUMN_NAME} < CAST(''{BASELINE_VALUE}'' AS DATE) THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('5016', 'Min_Val', 'trino', 'SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), + ('5016', 'Min_Val', 'trino', 'SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), ('5017', 'Missing_Pct', 'trino', 'ABS(2.0 * ASIN(SQRT(CAST({BASELINE_VALUE_CT} AS REAL) / CAST({BASELINE_CT} AS REAL))) - 2 * ASIN(SQRT(CAST(COUNT({COLUMN_NAME}) AS REAL) / CAST(NULLIF(COUNT(*), 0) AS REAL) )))', '>=', '{THRESHOLD_VALUE}'), ('5018', 'Monthly_Rec_Ct', 'trino', '(MAX(DATE_DIFF(''month'', {COLUMN_NAME}, CAST(''{RUN_DATE}'' AS DATE))) - MIN(DATE_DIFF(''month'', {COLUMN_NAME}, CAST(''{RUN_DATE}'' AS DATE))) + 1) - COUNT(DISTINCT DATE_DIFF(''month'', {COLUMN_NAME}, CAST(''{RUN_DATE}'' AS DATE)))', '>', '{THRESHOLD_VALUE}'), ('5019', 'Outlier_Pct_Above', 'trino', 'CAST(SUM(CASE WHEN CAST({COLUMN_NAME} AS REAL) > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END) AS REAL) / CAST(COUNT({COLUMN_NAME}) AS REAL)', '>', '{THRESHOLD_VALUE}'), @@ -382,7 +397,7 @@ VALUES ('1001', 'Alpha_Trunc', 'redshift', 'MAX(LENGTH({COLUMN_NAME}))', '<', ' ('6003', 'Condition_Flag', 'databricks', 'SUM(CASE WHEN {CUSTOM_QUERY} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), ('6004', 'Constant', 'databricks', 'SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), ('6005', 'Daily_Record_Ct', 'databricks', '<%DATEDIFF_DAY;MIN({COLUMN_NAME});MAX({COLUMN_NAME})%>+1-COUNT(DISTINCT 
{COLUMN_NAME})', '<', '{THRESHOLD_VALUE}'), - ('6006', 'Dec_Trunc', 'databricks', 'ROUND(SUM(ABS({COLUMN_NAME})::DECIMAL(18,4) % 1), 0)', '<', '{THRESHOLD_VALUE}'), + ('6006', 'Dec_Trunc', 'databricks', 'SUM(ROUND(ABS(({COLUMN_NAME} % 1)), 5))+1', '<', '{THRESHOLD_VALUE}'), ('6007', 'Distinct_Date_Ct', 'databricks', 'COUNT(DISTINCT {COLUMN_NAME})', '<', '{THRESHOLD_VALUE}'), ('6008', 'Distinct_Value_Ct', 'databricks', 'COUNT(DISTINCT {COLUMN_NAME})', '<>', '{THRESHOLD_VALUE}'), ('6009', 'Email_Format', 'databricks', 'SUM(CASE WHEN NOT REGEXP_LIKE({COLUMN_NAME}::STRING, ''^[A-Za-z0-9._''''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), @@ -392,7 +407,7 @@ VALUES ('1001', 'Alpha_Trunc', 'redshift', 'MAX(LENGTH({COLUMN_NAME}))', '<', ' ('6013', 'LOV_All', 'databricks', 'STRING_AGG(DISTINCT {COLUMN_NAME}, ''|'') WITHIN GROUP (ORDER BY {COLUMN_NAME})', '<>', '{THRESHOLD_VALUE}'), ('6014', 'LOV_Match', 'databricks', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), ('6015', 'Min_Date', 'databricks', 'SUM(CASE WHEN {COLUMN_NAME} < ''{BASELINE_VALUE}'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('6016', 'Min_Val', 'databricks', 'SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), + ('6016', 'Min_Val', 'databricks', 'SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), ('6017', 'Missing_Pct', 'databricks', 'ABS( 2.0 * ASIN( SQRT( {BASELINE_VALUE_CT}::FLOAT / {BASELINE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT({COLUMN_NAME})::FLOAT / NULLIF(COUNT(*), 0)::FLOAT )) )', '>=', '{THRESHOLD_VALUE}'), ('6018', 'Monthly_Rec_Ct', 'databricks', '(MAX(<%DATEDIFF_MONTH;{COLUMN_NAME};''{RUN_DATE}''::DATE%>) - MIN(<%DATEDIFF_MONTH;{COLUMN_NAME};''{RUN_DATE}''::DATE%>) + 1) - COUNT(DISTINCT <%DATEDIFF_MONTH;{COLUMN_NAME};''{RUN_DATE}''::DATE%>)', '>', '{THRESHOLD_VALUE}'), ('6019', 
'Outlier_Pct_Above', 'databricks', 'SUM(CASE WHEN {COLUMN_NAME}::FLOAT > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT', '>', '{THRESHOLD_VALUE}'), @@ -483,7 +498,7 @@ VALUES ('1039', '1007', 'Profile Anomaly' , 'Column_Pattern_Mismatch', 'redshift', NULL, 'SELECT A.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a''),''[A-Z]'', ''A''),''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) A UNION ALL SELECT B.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a''),''[A-Z]'', ''A''),''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) B UNION ALL SELECT C.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a''),''[A-Z]'', ''A''),''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) C UNION ALL SELECT D.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a''),''[A-Z]'', ''A''),''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) D ORDER BY top_pattern DESC, 
count DESC;' ), ('1040', '1008', 'Profile Anomaly' , 'Table_Pattern_Mismatch', 'redshift', NULL, 'SELECT column_name, table_name, data_type FROM information_schema.columns WHERE table_schema = ''{TARGET_SCHEMA}'' AND column_name = ''{COLUMN_NAME}'' ORDER BY data_type;' ), ('1041', '1009', 'Profile Anomaly' , 'Leading_Spaces', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN '' !'' AND ''!'' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), - ('1042', '1010', 'Profile Anomaly' , 'Quoted_Values', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE ''"%"'' OR "{COLUMN_NAME}" ILIKE ''''''%'''''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), + ('1042', '1010', 'Profile Anomaly' , 'Quoted_Values', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE ''"%"'' OR "{COLUMN_NAME}" ILIKE ''''''%'''''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;' ), ('1043', '1011', 'Profile Anomaly' , 'Char_Column_Number_Values', 'redshift', NULL, 'SELECT A.* FROM ( SELECT TOP 10 DISTINCT ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;' ), ('1044', '1012', 'Profile Anomaly' , 'Char_Column_Date_Values', 'redshift', NULL, 'SELECT A.* FROM ( SELECT TOP 10 DISTINCT ''Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM 
{TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT ''Non-Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;' ), ('1045', '1013', 'Profile Anomaly' , 'Small Missing Value Ct', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'', '' '') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO ''(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)'' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), @@ -509,7 +524,7 @@ VALUES ('1064', '1007', 'Profile Anomaly' , 'Column_Pattern_Mismatch', 'postgresql', NULL, 'SELECT A.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a'', ''g''), ''[A-Z]'', ''A'', ''g''), ''[0-9]'', ''N'', ''g'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5 ) A UNION ALL SELECT B.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', 
''|'', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a'', ''g''), ''[A-Z]'', ''A'', ''g''), ''[0-9]'', ''N'', ''g'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5 ) B UNION ALL SELECT C.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a'', ''g''), ''[A-Z]'', ''A'', ''g''), ''[0-9]'', ''N'', ''g'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5 ) C UNION ALL SELECT D.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a'', ''g''), ''[A-Z]'', ''A'', ''g''), ''[0-9]'', ''N'', ''g'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5) D ORDER BY top_pattern DESC, count DESC;' ), ('1065', '1008', 'Profile Anomaly' , 'Table_Pattern_Mismatch', 'postgresql', NULL, 'SELECT column_name, columns.table_name FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND UPPER(tables.table_type) = ''BASE TABLE'' ORDER BY columns.table_name;' ), ('1066', '1009', 'Profile Anomaly' , 'Leading_Spaces', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN '' !'' AND ''!'' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), - ('1067', '1010', 'Profile Anomaly' , 'Quoted_Values', 
'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE ''"%"'' OR "{COLUMN_NAME}" ILIKE ''''''%'''''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), + ('1067', '1010', 'Profile Anomaly' , 'Quoted_Values', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE ''"%"'' OR "{COLUMN_NAME}" ILIKE ''''''%'''''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;' ), ('1068', '1011', 'Profile Anomaly' , 'Char_Column_Number_Values', 'postgresql', NULL, 'SELECT A.* FROM ( SELECT DISTINCT ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS B ORDER BY data_type, count DESC;' ), ('1069', '1012', 'Profile Anomaly' , 'Char_Column_Date_Values', 'postgresql', NULL, 'SELECT A.* FROM ( SELECT DISTINCT ''Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT ''Non-Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10) AS B ORDER BY data_type, count DESC;' ), ('1070', '1013', 'Profile Anomaly' , 'Small Missing Value Ct', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" IN (''.'', 
''?'', '' '') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO ''(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)'' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), @@ -748,7 +763,7 @@ ORDER BY check_period DESC;'), ('1178', '1007', 'Profile Anomaly' , 'Column_Pattern_Mismatch', 'snowflake', NULL, 'SELECT A.* FROM (SELECT DISTINCT TOP 5 b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, ''[a-z]'', ''a''), ''[A-Z]'', ''A''), ''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC) A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 5 b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, ''[a-z]'', ''a''), ''[A-Z]'', ''A''), ''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC) B UNION ALL SELECT C.* FROM (SELECT DISTINCT TOP 5 b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, ''[a-z]'', ''a''), ''[A-Z]'', ''A''), ''[0-9]'', ''N'') = 
b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC) C UNION ALL SELECT D.* FROM (SELECT DISTINCT TOP 5 b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, ''[a-z]'', ''a''), ''[A-Z]'', ''A''), ''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC) D ORDER BY top_pattern DESC, count DESC;' ), ('1179', '1008', 'Profile Anomaly' , 'Table_Pattern_Mismatch', 'snowflake', NULL, 'SELECT DISTINCT column_name, columns.table_name FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND UPPER(tables.table_type) = ''BASE TABLE'' ORDER BY table_name; ' ), ('1180', '1009', 'Profile Anomaly' , 'Leading_Spaces', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN '' !'' AND ''!'' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), - ('1181', '1010', 'Profile Anomaly' , 'Quoted_Values', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE ''"%"'' OR "{COLUMN_NAME}" ILIKE ''''''%'''''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), + ('1181', '1010', 'Profile Anomaly' , 'Quoted_Values', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE ''"%"'' OR "{COLUMN_NAME}" ILIKE ''''''%'''''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;' ), ('1182', '1011', 'Profile Anomaly' , 
'Char_Column_Number_Values', 'snowflake', NULL, 'SELECT A.* FROM (SELECT DISTINCT TOP 10 ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 10 ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS B ORDER BY data_type, count DESC;' ), ('1183', '1012', 'Profile Anomaly' , 'Char_Column_Date_Values', 'snowflake', NULL, 'SELECT A.* FROM (SELECT DISTINCT TOP 10 ''Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 10 ''Non-Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS B ORDER BY data_type, count DESC;' ), ('1184', '1013', 'Profile Anomaly' , 'Small Missing Value Ct', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'', '' '') THEN 1 WHEN LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''-{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''0{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''9{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''x{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''z{2,}'' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', 
''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), @@ -1087,6 +1102,160 @@ FULL JOIN older_ver o ON (l.category = o.category) ORDER BY COALESCE(l.category, o.category)'), + ('1245', '1504', 'Test Results', 'Aggregate_Balance_Percent', 'redshift', NULL, 'SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM {TARGET_SCHEMA}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE (total IS NOT NULL AND match_total IS NULL) + OR (total IS NULL AND match_total IS NOT NULL) + OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0)) + ORDER BY {GROUPBY_NAMES};'), + ('1246', '1504', 'Test Results', 'Aggregate_Balance_Percent', 'snowflake', NULL, 'SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM {TARGET_SCHEMA}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE (total IS NOT NULL AND match_total IS NULL) + OR (total IS NULL AND match_total IS NOT NULL) + OR (total NOT BETWEEN 
match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0)) + ORDER BY {GROUPBY_NAMES};'), + ('1247', '1504', 'Test Results', 'Aggregate_Balance_Percent', 'mssql', NULL, 'SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM {TARGET_SCHEMA}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE (total IS NOT NULL AND match_total IS NULL) + OR (total IS NULL AND match_total IS NOT NULL) + OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0)) + ORDER BY {GROUPBY_NAMES};'), + ('1248', '1504', 'Test Results', 'Aggregate_Balance_Percent', 'postgresql', NULL, 'SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM {TARGET_SCHEMA}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE (total IS NOT NULL AND match_total IS NULL) + OR (total IS NULL AND match_total IS NOT NULL) + OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0)) + ORDER BY {GROUPBY_NAMES};'), + + ('1245', '1505', 'Test Results', 'Aggregate_Balance_Range', 'redshift', NULL, 
'SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM {TARGET_SCHEMA}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE (total IS NOT NULL AND match_total IS NULL) + OR (total IS NULL AND match_total IS NOT NULL) + OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE}) + ORDER BY {GROUPBY_NAMES};'), + ('1246', '1505', 'Test Results', 'Aggregate_Balance_Range', 'snowflake', NULL, 'SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM {TARGET_SCHEMA}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE (total IS NOT NULL AND match_total IS NULL) + OR (total IS NULL AND match_total IS NOT NULL) + OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE}) + ORDER BY {GROUPBY_NAMES};'), + ('1247', '1505', 'Test Results', 'Aggregate_Balance_Range', 'mssql', NULL, 'SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM {TARGET_SCHEMA}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + GROUP BY 
{GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE (total IS NOT NULL AND match_total IS NULL) + OR (total IS NULL AND match_total IS NOT NULL) + OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE}) + ORDER BY {GROUPBY_NAMES};'), + ('1248', '1505', 'Test Results', 'Aggregate_Balance_Range', 'postgresql', NULL, 'SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM {TARGET_SCHEMA}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE (total IS NOT NULL AND match_total IS NULL) + OR (total IS NULL AND match_total IS NOT NULL) + OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE}) + ORDER BY {GROUPBY_NAMES};'), + ('1261', '1508', 'Test Results', 'Timeframe_Combo_Gain', 'redshift', NULL, 'SELECT {COLUMN_NAME_NO_QUOTES} FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {SUBSET_CONDITION} @@ -1237,7 +1406,7 @@ WHERE {SUBSET_CONDITION} )'), ('1269', '1100', 'Profile Anomaly', 'Potential_PII', 'redshift', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;'), ('1270', '1100', 'Profile Anomaly', 'Potential_PII', 'snowflake', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY 
"{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;'), - ('1271', '1100', 'Profile Anomaly', 'Potential_PII', 'mssql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;'), + ('1271', '1100', 'Profile Anomaly', 'Potential_PII', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;'), ('1272', '1100', 'Profile Anomaly', 'Potential_PII', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;'), ('1273', '1001', 'Profile Anomaly' , 'Suggested_Type', 'databricks', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT 20;'), @@ -1249,7 +1418,7 @@ WHERE {SUBSET_CONDITION} ('1279', '1007', 'Profile Anomaly' , 'Column_Pattern_Mismatch', 'databricks', NULL, 'SELECT A.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, ''[a-z]'', ''a''), ''[A-Z]'', ''A''), ''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT 5) A UNION ALL SELECT B.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, ''[a-z]'', ''a''), ''[A-Z]'', ''A''), ''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT 5) B UNION ALL SELECT C.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count 
FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, ''[a-z]'', ''a''), ''[A-Z]'', ''A''), ''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT 5) C UNION ALL SELECT D.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, ''[a-z]'', ''a''), ''[A-Z]'', ''A''), ''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT 5) D ORDER BY top_pattern DESC, count DESC;' ), ('1280', '1008', 'Profile Anomaly' , 'Table_Pattern_Mismatch', 'databricks', NULL, 'SELECT DISTINCT column_name, columns.table_name FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND UPPER(tables.table_type) = ''BASE TABLE'' ORDER BY table_name; ' ), ('1281', '1009', 'Profile Anomaly' , 'Leading_Spaces', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN `{COLUMN_NAME}` BETWEEN '' !'' AND ''!'' THEN 1 ELSE 0 END) = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}`;' ), - ('1282', '1010', 'Profile Anomaly' , 'Quoted_Values', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN `{COLUMN_NAME}` ILIKE ''"%"'' OR `{COLUMN_NAME}` ILIKE ''''''%'''''' THEN 1 ELSE 0 END) = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}`;' ), + ('1282', '1010', 'Profile Anomaly' , 'Quoted_Values', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS 
count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN `{COLUMN_NAME}` ILIKE ''"%"'' OR `{COLUMN_NAME}` ILIKE ''''''%'''''' THEN 1 ELSE 0 END) = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT 500;' ), ('1283', '1011', 'Profile Anomaly' , 'Char_Column_Number_Values', 'databricks', NULL, 'SELECT A.* FROM (SELECT DISTINCT ''Numeric'' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;`{COLUMN_NAME}`%> = 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 10) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT ''Non-Numeric'' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;`{COLUMN_NAME}`%> != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC) AS B ORDER BY data_type, count DESC LIMIT 10;' ), ('1284', '1012', 'Profile Anomaly' , 'Char_Column_Date_Values', 'databricks', NULL, 'SELECT A.* FROM (SELECT DISTINCT ''Date'' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;`{COLUMN_NAME}`%> = 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 10) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT ''Non-Date'' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;`{COLUMN_NAME}`%> != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC) AS B ORDER BY data_type, count DESC LIMIT 10;' ), ('1285', '1013', 'Profile Anomaly' , 'Small Missing Value Ct', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN `{COLUMN_NAME}` IN (''.'', ''?'', '' '') THEN 1 WHEN LOWER(`{COLUMN_NAME}`::STRING) REGEXP ''-{2,}'' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP ''0{2,}'' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP ''9{2,}'' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP ''x{2,}'' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP ''z{2,}'' THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN (''blank'',''error'',''missing'',''tbd'', 
''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN `{COLUMN_NAME}` = '''' THEN 1 WHEN `{COLUMN_NAME}` IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}`;' ), @@ -1371,6 +1540,44 @@ SELECT COALESCE(l.category, o.category) AS category, FULL JOIN older_ver o ON (l.category = o.category) ORDER BY COALESCE(l.category, o.category)'), + ('1248', '1504', 'Test Results', 'Aggregate_Balance_Percent', 'databricks', NULL, 'SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM {TARGET_SCHEMA}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE (total IS NOT NULL AND match_total IS NULL) + OR (total IS NULL AND match_total IS NOT NULL) + OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0)) + ORDER BY {GROUPBY_NAMES};'), + ('1245', '1505', 'Test Results', 'Aggregate_Balance_Range', 'databricks', NULL, 'SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM {TARGET_SCHEMA}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, 
{MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE (total IS NOT NULL AND match_total IS NULL) + OR (total IS NULL AND match_total IS NOT NULL) + OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE}) + ORDER BY {GROUPBY_NAMES};'), ('1337', '1509', 'Test Results', 'Timeframe_Combo_Match', 'databricks', NULL, ' ( SELECT ''Prior Timeframe'' as missing_from, {COLUMN_NAME} FROM {TARGET_SCHEMA}.{TABLE_NAME} @@ -1396,9 +1603,118 @@ FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} )'), - ('1338', '1100', 'Profile Anomaly', 'Potential_PII', 'databricks', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500;') - + ('1338', '1100', 'Profile Anomaly', 'Potential_PII', 'databricks', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500;'), + ('1253', '1510', 'Test Results', 'Dupe_Rows', 'redshift', NULL, 'SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct + FROM {TARGET_SCHEMA}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + HAVING COUNT(*) > 1 +ORDER BY {GROUPBY_NAMES}'), + ('1254', '1510', 'Test Results', 'Dupe_Rows', 'snowflake', NULL, 'SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct + FROM {TARGET_SCHEMA}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + HAVING COUNT(*) > 1 +ORDER BY {GROUPBY_NAMES}'), + ('1255', '1510', 'Test Results', 'Dupe_Rows', 'mssql', NULL, 'SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct + FROM {TARGET_SCHEMA}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + HAVING COUNT(*) > 1 +ORDER BY 
{GROUPBY_NAMES}'), + ('1256', '1510', 'Test Results', 'Dupe_Rows', 'postgresql', NULL, 'SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct + FROM {TARGET_SCHEMA}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + HAVING COUNT(*) > 1 +ORDER BY {GROUPBY_NAMES}'), + ('1257', '1510', 'Test Results', 'Dupe_Rows', 'databricks', NULL, 'SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct + FROM {TARGET_SCHEMA}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + HAVING COUNT(*) > 1 +ORDER BY {GROUPBY_NAMES}'), + ('1258', '1028', 'Profile Anomaly', 'Inconsistent_Casing', 'redshift', NULL, '(SELECT ''Upper Case'' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} +WHERE UPPER("{COLUMN_NAME}") = "{COLUMN_NAME}" +GROUP BY "{COLUMN_NAME}" LIMIT 20) +UNION ALL +(SELECT ''Mixed Case'' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} +WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}") +GROUP BY "{COLUMN_NAME}" LIMIT 20)'), + ('1259', '1028', 'Profile Anomaly', 'Inconsistent_Casing', 'postgresql', NULL, '(SELECT ''Upper Case'' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} +WHERE UPPER("{COLUMN_NAME}") = "{COLUMN_NAME}" +GROUP BY "{COLUMN_NAME}" LIMIT 20) +UNION ALL +(SELECT ''Mixed Case'' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} +WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}") +GROUP BY "{COLUMN_NAME}" LIMIT 20)'), + ('1260', '1028', 'Profile Anomaly', 'Inconsistent_Casing', 'mssql', NULL, 'SELECT TOP 20 ''Upper Case'' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} +WHERE UPPER("{COLUMN_NAME}") = "{COLUMN_NAME}" +GROUP BY "{COLUMN_NAME}" +UNION +SELECT TOP 20 ''Mixed Case'' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} +WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND 
"{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}") +GROUP BY "{COLUMN_NAME}"'), + ('1261', '1028', 'Profile Anomaly', 'Inconsistent_Casing', 'snowflake', NULL, '(SELECT ''Upper Case'' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} +WHERE UPPER("{COLUMN_NAME}") = "{COLUMN_NAME}" +GROUP BY "{COLUMN_NAME}" LIMIT 20) +UNION ALL +(SELECT ''Mixed Case'' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} +WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}") +GROUP BY "{COLUMN_NAME}" LIMIT 20)'), + ('1262', '1028', 'Profile Anomaly', 'Inconsistent_Casing', 'databricks', NULL, '(SELECT ''Upper Case'' as casing, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} +WHERE UPPER(`{COLUMN_NAME}`) = `{COLUMN_NAME}` +GROUP BY `{COLUMN_NAME}` LIMIT 20) +UNION ALL +(SELECT ''Mixed Case'' as casing, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} +WHERE `{COLUMN_NAME}` <> UPPER(`{COLUMN_NAME}`) AND `{COLUMN_NAME}` <> LOWER(`{COLUMN_NAME}`) +GROUP BY `{COLUMN_NAME}` LIMIT 20)'), + ('1263', '1029', 'Profile Anomaly', 'Non_Alpha_Name_Address', 'redshift', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" > '''' +GROUP BY "{COLUMN_NAME}" LIMIT 500'), + ('1264', '1029', 'Profile Anomaly', 'Non_Alpha_Name_Address', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" > '''' +GROUP BY "{COLUMN_NAME}" LIMIT 500'), + ('1265', '1029', 'Profile Anomaly', 'Non_Alpha_Name_Address', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND 
"{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" > '''' +GROUP BY "{COLUMN_NAME}"'), + ('1266', '1029', 'Profile Anomaly', 'Non_Alpha_Name_Address', 'snowflake', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" > '''' +GROUP BY "{COLUMN_NAME}" LIMIT 500'), + ('1267', '1029', 'Profile Anomaly', 'Non_Alpha_Name_Address', 'databricks', NULL, 'SELECT any_value(`{COLUMN_NAME}`), COUNT(*) as record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE `{COLUMN_NAME}` = UPPER(`{COLUMN_NAME}`) AND `{COLUMN_NAME}` = LOWER(`{COLUMN_NAME}`) AND `{COLUMN_NAME}` > '''' +GROUP BY "{COLUMN_NAME}" LIMIT 500'), + ('1268', '1030', 'Profile Anomaly', 'Non_Alpha_Prefixed_Name', 'redshift', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE "{COLUMN_NAME}" < ''A'' AND LEFT("{COLUMN_NAME}", 1) NOT IN (''"'', '' '') AND RIGHT("{COLUMN_NAME}", 1) <> '''''''' +GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500'), + ('1269', '1030', 'Profile Anomaly', 'Non_Alpha_Prefixed_Name', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE "{COLUMN_NAME}" < ''A'' AND LEFT("{COLUMN_NAME}", 1) NOT IN (''"'', '' '') AND RIGHT("{COLUMN_NAME}", 1) <> '''''''' +GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500'), + ('1270', '1030', 'Profile Anomaly', 'Non_Alpha_Prefixed_Name', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE "{COLUMN_NAME}" < ''A'' AND LEFT("{COLUMN_NAME}", 1) NOT IN (''"'', '' '') AND RIGHT("{COLUMN_NAME}", 1) <> '''''''' +GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"'), + ('1271', '1030', 'Profile Anomaly', 'Non_Alpha_Prefixed_Name', 'snowflake', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM 
"{TARGET_SCHEMA}"."{TABLE_NAME}" +WHERE "{COLUMN_NAME}" < ''A'' AND LEFT("{COLUMN_NAME}", 1) NOT IN (''"'', '' '') AND RIGHT("{COLUMN_NAME}", 1) <> '''''''' +GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500'), + ('1272', '1030', 'Profile Anomaly', 'Non_Alpha_Prefixed_Name', 'databricks', NULL, 'SELECT any_value(`{COLUMN_NAME}`), COUNT(*) as record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` +WHERE `{COLUMN_NAME}` < ''A'' AND LEFT(`{COLUMN_NAME}`, 1) NOT IN (''"'', '' '') AND RIGHT(`{COLUMN_NAME}`, 1) <> '''''''' +GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT 500'), + ('1273', '1031', 'Profile Anomaly', 'Non_Printing_Chars', 'redshift', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), ''XXXXXXXXXX'') <> "{COLUMN_NAME}" +GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500'), + ('1274', '1031', 'Profile Anomaly', 'Non_Printing_Chars', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), ''XXXXXXXXXX'') <> "{COLUMN_NAME}" +GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500'), + ('1275', '1031', 'Profile Anomaly', 'Non_Printing_Chars', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE TRANSLATE("{COLUMN_NAME}", NCHAR(160), ''X'') <> "{COLUMN_NAME}" +GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"'), + ('1276', '1031', 'Profile Anomaly', 'Non_Printing_Chars', 'snowflake', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) 
|| CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), ''XXXXXXXXXX'') <> "{COLUMN_NAME}" +GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500'), + ('1277', '1031', 'Profile Anomaly', 'Non_Printing_Chars', 'databricks', NULL, 'SELECT any_value(`{COLUMN_NAME}`), COUNT(*) as record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` + WHERE TRANSLATE(`{COLUMN_NAME}`, ''\u00a0\u2009\u200b\u200c\u200d\u200e\u200f\u202f\u3000\ufeff'', ''XXXXXXXXXX'') <> `{COLUMN_NAME}` +GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT 500') ; diff --git a/testgen/template/dbsetup/075_grant_role_rights.sql b/testgen/template/dbsetup/075_grant_role_rights.sql index 1b4f11b5..2f7fbf31 100644 --- a/testgen/template/dbsetup/075_grant_role_rights.sql +++ b/testgen/template/dbsetup/075_grant_role_rights.sql @@ -39,7 +39,8 @@ GRANT SELECT, INSERT, DELETE, UPDATE ON {SCHEMA_NAME}.score_definition_results_breakdown, {SCHEMA_NAME}.score_definition_results_history, {SCHEMA_NAME}.score_history_latest_runs, - {SCHEMA_NAME}.job_schedules + {SCHEMA_NAME}.job_schedules, + {SCHEMA_NAME}.settings TO testgen_execute_role; diff --git a/testgen/template/dbupgrade/0141_incremental_upgrade.sql b/testgen/template/dbupgrade/0141_incremental_upgrade.sql new file mode 100644 index 00000000..55675eaf --- /dev/null +++ b/testgen/template/dbupgrade/0141_incremental_upgrade.sql @@ -0,0 +1,18 @@ +SET SEARCH_PATH TO {SCHEMA_NAME}; + +ALTER TABLE profile_results + ADD COLUMN non_printing_ct BIGINT; + +ALTER TABLE test_definitions + ALTER COLUMN groupby_names TYPE VARCHAR, + ALTER COLUMN match_groupby_names TYPE VARCHAR; + +DROP VIEW IF EXISTS v_test_results; +DROP VIEW IF EXISTS v_queued_observability_results; + +ALTER TABLE test_results + ALTER COLUMN input_parameters TYPE VARCHAR; + +UPDATE profile_anomaly_results + SET detail = REPLACE(detail, 'Filled Values:', 'Dummy Values:') + WHERE detail ILIKE 'Filled Values:%' diff --git a/testgen/template/dbupgrade/0142_incremental_upgrade.sql 
b/testgen/template/dbupgrade/0142_incremental_upgrade.sql new file mode 100644 index 00000000..b9db7219 --- /dev/null +++ b/testgen/template/dbupgrade/0142_incremental_upgrade.sql @@ -0,0 +1,5 @@ +SET SEARCH_PATH TO {SCHEMA_NAME}; + +ALTER TABLE test_definitions + ADD COLUMN lower_tolerance VARCHAR(1000), + ADD COLUMN upper_tolerance VARCHAR(1000); diff --git a/testgen/template/dbupgrade/0143_incremental_upgrade.sql b/testgen/template/dbupgrade/0143_incremental_upgrade.sql new file mode 100644 index 00000000..c70a7360 --- /dev/null +++ b/testgen/template/dbupgrade/0143_incremental_upgrade.sql @@ -0,0 +1,6 @@ +SET SEARCH_PATH TO {SCHEMA_NAME}; + +CREATE TABLE settings ( + key VARCHAR(50) NOT NULL PRIMARY KEY, + value JSONB NOT NULL +); diff --git a/testgen/template/execution/ex_get_tests_non_cat.sql b/testgen/template/execution/ex_get_tests_non_cat.sql index 7d69ef40..536fd509 100644 --- a/testgen/template/execution/ex_get_tests_non_cat.sql +++ b/testgen/template/execution/ex_get_tests_non_cat.sql @@ -14,13 +14,15 @@ SELECT tt.test_type, coalesce(baseline_sum, '') as baseline_sum, coalesce(baseline_avg, '') as baseline_avg, coalesce(baseline_sd, '') as baseline_sd, + coalesce(lower_tolerance, '') as lower_tolerance, + coalesce(upper_tolerance, '') as upper_tolerance, case when nullif(subset_condition, '') is null then '1=1' else subset_condition end as subset_condition, coalesce(groupby_names, '') as groupby_names, case when having_condition is null then '' - else concat('WHERE ', having_condition) end as having_condition, + else concat('HAVING ', having_condition) end as having_condition, coalesce(window_date_column, '') as window_date_column, cast(coalesce(window_days, '0') as varchar(50)) as window_days, coalesce(match_schema_name, '') as match_schema_name, @@ -30,7 +32,10 @@ SELECT tt.test_type, when nullif(match_subset_condition, '') is null then '1=1' else match_subset_condition end as match_subset_condition, coalesce(match_groupby_names, '') as 
match_groupby_names, - coalesce(match_having_condition, '') as match_having_condition, + case + when match_having_condition is null then '' + else concat('HAVING ', match_having_condition) + END as match_having_condition, coalesce(custom_query, '') as custom_query, coalesce(tm.template_name, '') as template_name FROM test_definitions td diff --git a/testgen/template/execution/ex_write_test_record_to_testrun_table.sql b/testgen/template/execution/ex_write_test_record_to_testrun_table.sql deleted file mode 100644 index 07be1462..00000000 --- a/testgen/template/execution/ex_write_test_record_to_testrun_table.sql +++ /dev/null @@ -1,5 +0,0 @@ -INSERT INTO test_runs (id, test_suite_id, test_starttime, process_id) -(SELECT '{TEST_RUN_ID}' :: UUID as id, - '{TEST_SUITE_ID}' as test_suite_id, - '{RUN_DATE}' as test_starttime, - '{PROCESS_ID}'as process_id); diff --git a/testgen/template/flavors/databricks/profiling/project_profiling_query_databricks.yaml b/testgen/template/flavors/databricks/profiling/project_profiling_query_databricks.yaml index 18c24243..d42c6947 100644 --- a/testgen/template/flavors/databricks/profiling/project_profiling_query_databricks.yaml +++ b/testgen/template/flavors/databricks/profiling/project_profiling_query_databricks.yaml @@ -71,6 +71,7 @@ strTemplate05_A: COUNT(DISTINCT UPPER(REPLACE(TRANSLATE(`{COL_NAME}`,' '''',.-', WHEN TRANSLATE(`{COL_NAME}`, 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', ' ') = `{COL_NAME}` THEN 1 ELSE 0 END) AS non_alpha_ct, + COUNT( CASE WHEN TRANSLATE(`{COL_NAME}`, '\u00a0\u2009\u200b\u200c\u200d\u200e\u200f\u202f\u3000\ufeff', 'XXXXXXXXXX') <> `{COL_NAME}` THEN 1 END) as non_printing_ct, SUM(<%IS_NUM;LEFT(`{COL_NAME}`, 31)%>) AS numeric_ct, SUM(<%IS_DATE;LEFT(`{COL_NAME}`, 26)%>) AS date_ct, CASE @@ -122,6 +123,7 @@ strTemplate05_else: NULL as distinct_std_value_ct, NULL as upper_case_ct, NULL as lower_case_ct, NULL as non_alpha_ct, + NULL as non_printing_ct, NULL as numeric_ct, NULL as date_ct, NULL as 
std_pattern_match, @@ -174,7 +176,7 @@ strTemplate08_else: NULL as min_value, NULL as percentile_25, NULL as percentile_50, NULL as percentile_75, -strTemplate10_N_dec: SUM(ROUND((`{COL_NAME}` % 1), 5)) as fractional_sum, +strTemplate10_N_dec: SUM(ROUND(ABS(MOD(`{COL_NAME}`, 1)), 5)) as fractional_sum, strTemplate10_else: NULL as fractional_sum, diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_num_incr_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_num_incr_generic.sql deleted file mode 100644 index c9660494..00000000 --- a/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_num_incr_generic.sql +++ /dev/null @@ -1,34 +0,0 @@ -SELECT '{TEST_TYPE}' as test_type, - '{TEST_DEFINITION_ID}' as test_definition_id, - '{TEST_SUITE_ID}' as test_suite_id, - '{RUN_DATE}' as test_time, '{START_TIME}' as starttime, CURRENT_TIMESTAMP as endtime, - '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{GROUPBY_NAMES}' as column_name, - {SKIP_ERRORS} as skip_errors, - 'match_schema_name = {MATCH_SCHEMA_NAME}, match_table_name = {MATCH_TABLE_NAME}, match_groupby_names = {MATCH_GROUPBY_NAMES} ,match_column_names = {MATCH_COLUMN_NAMES}, match_subset_condition = {MATCH_SUBSET_CONDITION}, match_having_condition = {MATCH_HAVING_CONDITION}, mode = {MODE}' - as input_parameters, - CASE WHEN COUNT(*) > COALESCE(skip_errors, 0) THEN 0 ELSE 1 END as result_code, - CONCAT( - CONCAT( 'Mismatched measures: ', CAST( COALESCE(COUNT(*), 0) AS {VARCHAR_TYPE}) ), - CONCAT( ', Threshold: ', - CONCAT( CAST(COALESCE(skip_errors, 0) AS {VARCHAR_TYPE}), '.') - ) - ) AS result_message, - COUNT(*) as result_measure, - '{TEST_ACTION}' as test_action, - '{SUBSET_CONDITION}' as subset_condition, - NULL as result_query, - '{TEST_DESCRIPTION}' as test_description -FROM ( - SELECT {GROUPBY_NAMES}, {SUM_COLUMNS} - FROM {SCHEMA_NAME}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - GROUP BY {GROUPBY_NAMES} - {HAVING_CONDITION} 
- UNION ALL - SELECT {MATCH_GROUPBY_NAMES}, {MATCH_SUM_COLUMNS} - FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} - WHERE {MATCH_SUBSET_CONDITION} - GROUP BY {MATCH_GROUPBY_NAMES} - {MATCH_HAVING_CONDITION} - ) - ) a ; diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_percent_above_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_percent_generic.sql similarity index 70% rename from testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_percent_above_generic.sql rename to testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_percent_generic.sql index 6e20b995..62a92d40 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_percent_above_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_percent_generic.sql @@ -8,10 +8,10 @@ SELECT '{TEST_TYPE}' as test_type, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME_NO_QUOTES}' as column_names, - '{THRESHOLD_VALUE}' as threshold_value, + '{SKIP_ERRORS}' as threshold_value, {SKIP_ERRORS} as skip_errors, '{INPUT_PARAMETERS}' as input_parameters, - CASE WHEN COUNT (*) > {SKIP_ERRORS} THEN 0 ELSE 1 END as result_code, + CASE WHEN COUNT (*) > {SKIP_ERRORS} THEN 0 ELSE 1 END as result_code, CASE WHEN COUNT(*) > 0 THEN CONCAT( @@ -32,17 +32,17 @@ SELECT '{TEST_TYPE}' as test_type, FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL FROM ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total - FROM {SCHEMA_NAME}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - GROUP BY {GROUPBY_NAMES} - {HAVING_CONDITION} + FROM {SCHEMA_NAME}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} UNION ALL SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total - FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} - WHERE {MATCH_SUBSET_CONDITION} - GROUP BY 
{MATCH_GROUPBY_NAMES} + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} {MATCH_HAVING_CONDITION} ) a GROUP BY {GROUPBY_NAMES} ) s - WHERE NOT total BETWEEN match_total AND match_total * (1 + {BASELINE_VALUE}::FLOAT/100.0) - OR (total IS NOT NULL AND match_total IS NULL) - OR (total IS NULL AND match_total IS NOT NULL); + WHERE (total IS NOT NULL AND match_total IS NULL) + OR (total IS NULL AND match_total IS NOT NULL) + OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0)); diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_percent_within_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_range_generic.sql similarity index 69% rename from testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_percent_within_generic.sql rename to testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_range_generic.sql index 78864287..9ab77d10 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_percent_within_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_range_generic.sql @@ -8,10 +8,10 @@ SELECT '{TEST_TYPE}' as test_type, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME_NO_QUOTES}' as column_names, - '{THRESHOLD_VALUE}' as threshold_value, + '{SKIP_ERRORS}' as threshold_value, {SKIP_ERRORS} as skip_errors, '{INPUT_PARAMETERS}' as input_parameters, - CASE WHEN COUNT (*) > {SKIP_ERRORS} THEN 0 ELSE 1 END as result_code, + CASE WHEN COUNT (*) > {SKIP_ERRORS} THEN 0 ELSE 1 END as result_code, CASE WHEN COUNT(*) > 0 THEN CONCAT( @@ -32,17 +32,17 @@ SELECT '{TEST_TYPE}' as test_type, FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL FROM ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total - FROM 
{SCHEMA_NAME}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - GROUP BY {GROUPBY_NAMES} - {HAVING_CONDITION} + FROM {SCHEMA_NAME}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} UNION ALL SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total - FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} - WHERE {MATCH_SUBSET_CONDITION} - GROUP BY {MATCH_GROUPBY_NAMES} + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} {MATCH_HAVING_CONDITION} ) a GROUP BY {GROUPBY_NAMES} ) s - WHERE NOT total BETWEEN match_total * (1 - {BASELINE_VALUE}::FLOAT/100.0) AND match_total * (1 + {BASELINE_VALUE}::FLOAT/100.0) - OR (total IS NOT NULL AND match_total IS NULL) - OR (total IS NULL AND match_total IS NOT NULL); + WHERE (total IS NOT NULL AND match_total IS NULL) + OR (total IS NULL AND match_total IS NOT NULL) + OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE}); diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_same_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_same_generic.sql index e9790a55..8a4c4cdf 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_same_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_same_generic.sql @@ -41,7 +41,7 @@ FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TO FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} WHERE {MATCH_SUBSET_CONDITION} GROUP BY {MATCH_GROUPBY_NAMES} - {MATCH_HAVING_CONDITION} ) a + {MATCH_HAVING_CONDITION} ) a GROUP BY {GROUPBY_NAMES} ) s WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_dupe_rows_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_dupe_rows_generic.sql new file mode 100644 index 00000000..2ec939c1 --- 
/dev/null +++ b/testgen/template/flavors/generic/exec_query_tests/ex_dupe_rows_generic.sql @@ -0,0 +1,37 @@ +SELECT '{TEST_TYPE}' as test_type, + '{TEST_DEFINITION_ID}' as test_definition_id, + '{TEST_SUITE_ID}' as test_suite_id, + '{TEST_RUN_ID}' as test_run_id, + '{RUN_DATE}' as test_time, + '{START_TIME}' as starttime, + CURRENT_TIMESTAMP as endtime, + '{SCHEMA_NAME}' as schema_name, + '{TABLE_NAME}' as table_name, + '{COLUMN_NAME_NO_QUOTES}' as column_names, + '{SKIP_ERRORS}' as threshold_value, + {SKIP_ERRORS} as skip_errors, + '{INPUT_PARAMETERS}' as input_parameters, + CASE WHEN COUNT (*) > {SKIP_ERRORS} THEN 0 ELSE 1 END as result_code, + CASE + WHEN COUNT(*) > 0 THEN + CONCAT( + CONCAT( CAST(COUNT(*) AS {VARCHAR_TYPE}), ' duplicate row(s) identified, ' ), + CONCAT( + CASE + WHEN COUNT(*) > {SKIP_ERRORS} THEN 'exceeding limit of ' + ELSE 'within limit of ' + END, + '{SKIP_ERRORS}.' + ) + ) + ELSE 'No errors found.' + END AS result_message, + COALESCE(SUM(record_ct), 0) as result_measure, + '{SUBSET_DISPLAY}' as subset_condition, + NULL as result_query + FROM ( SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct + FROM {SCHEMA_NAME}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + HAVING COUNT(*) > 1 + ) test; diff --git a/testgen/template/flavors/mssql/profiling/project_profiling_query_mssql.yaml b/testgen/template/flavors/mssql/profiling/project_profiling_query_mssql.yaml index 1fe2412b..40a7568e 100644 --- a/testgen/template/flavors/mssql/profiling/project_profiling_query_mssql.yaml +++ b/testgen/template/flavors/mssql/profiling/project_profiling_query_mssql.yaml @@ -71,6 +71,7 @@ strTemplate05_A: COUNT(DISTINCT UPPER(REPLACE(TRANSLATE("{COL_NAME}",' '''',.-', WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', ' ') = "{COL_NAME}" THEN 1 ELSE 0 END) AS non_alpha_ct, + COUNT( CASE WHEN TRANSLATE("{COL_NAME}", NCHAR(160), 'X') <> "{COL_NAME}" THEN 1 END) as non_printing_ct, SUM(<%IS_NUM;LEFT("{COL_NAME}", 
31)%>) AS numeric_ct, SUM(<%IS_DATE;LEFT("{COL_NAME}", 26)%>) AS date_ct, CASE @@ -124,6 +125,7 @@ strTemplate05_else: NULL as distinct_std_value_ct, NULL as upper_case_ct, NULL as lower_case_ct, NULL as non_alpha_ct, + NULL as non_printing_ct, NULL as numeric_ct, NULL as date_ct, NULL as std_pattern_match, @@ -171,7 +173,7 @@ strTemplate08_else: NULL as min_value, NULL as percentile_25, NULL as percentile_50, NULL as percentile_75, -strTemplate10_N_dec: SUM(ROUND(("{COL_NAME}" % 1), 5)) as fractional_sum, +strTemplate10_N_dec: SUM(ROUND(ABS(("{COL_NAME}" % 1)), 5)) as fractional_sum, strTemplate10_else: NULL as fractional_sum, diff --git a/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml b/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml index 384f923e..8f035cff 100644 --- a/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml +++ b/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml @@ -65,6 +65,7 @@ strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) a WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', '') = "{COL_NAME}" THEN 1 ELSE 0 END) AS non_alpha_ct, + COUNT( CASE WHEN TRANSLATE("{COL_NAME}", E'\u00a0\u2009\u200b\u200c\u200d\u200e\u200f\u202f\u3000\ufeff', 'XXXXXXXXXX') <> "{COL_NAME}" THEN 1 END) as non_printing_ct, SUM(<%IS_NUM;LEFT("{COL_NAME}", 31)%>) AS numeric_ct, SUM(<%IS_DATE;LEFT("{COL_NAME}", 26)%>) AS date_ct, CASE @@ -101,6 +102,7 @@ strTemplate05_else: NULL as distinct_std_value_ct, NULL as upper_case_ct, NULL as lower_case_ct, NULL as non_alpha_ct, + NULL as non_printing_ct, NULL as numeric_ct, NULL as date_ct, NULL as std_pattern_match, @@ -149,7 +151,7 @@ strTemplate08_else: NULL as min_value, NULL as percentile_25, NULL as percentile_50, NULL as percentile_75, -strTemplate10_N_dec: SUM(ROUND(MOD("{COL_NAME}", 1), 5)) as fractional_sum, 
+strTemplate10_N_dec: SUM(ROUND(ABS(MOD("{COL_NAME}", 1)), 5)) as fractional_sum, strTemplate10_else: NULL as fractional_sum, diff --git a/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml b/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml index 0d2db5bc..0de85a1b 100644 --- a/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml +++ b/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml @@ -45,6 +45,7 @@ strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) a COUNT( CASE WHEN "{COL_NAME}" = UPPER("{COL_NAME}") AND "{COL_NAME}" <> LOWER("{COL_NAME}") THEN 1 END) AS upper_case_ct, COUNT( CASE WHEN "{COL_NAME}" = LOWER("{COL_NAME}") AND "{COL_NAME}" <> UPPER("{COL_NAME}") THEN 1 END) AS lower_case_ct, COUNT( CASE WHEN "{COL_NAME}" = UPPER("{COL_NAME}") AND "{COL_NAME}" = LOWER("{COL_NAME}") THEN 1 END) AS non_alpha_ct, + COUNT( CASE WHEN TRANSLATE("{COL_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), 'XXXXXXXXXX') <> "{COL_NAME}" THEN 1 END) as non_printing_ct, SUM(<%IS_NUM;LEFT("{COL_NAME}", 31)%>) AS numeric_ct, SUM(<%IS_DATE;LEFT("{COL_NAME}", 26)%>) AS date_ct, CASE @@ -81,6 +82,7 @@ strTemplate05_else: NULL as distinct_std_value_ct, NULL as upper_case_ct, NULL as lower_case_ct, NULL as non_alpha_ct, + NULL as non_printing_ct, NULL as numeric_ct, NULL as date_ct, NULL as std_pattern_match, @@ -126,7 +128,7 @@ strTemplate08_else: NULL as min_value, NULL as percentile_25, NULL as percentile_50, NULL as percentile_75, -strTemplate10_N_dec: SUM(ROUND(MOD("{COL_NAME}", 1), 5)) as fractional_sum, +strTemplate10_N_dec: SUM(ROUND(ABS(MOD("{COL_NAME}", 1)), 5)) as fractional_sum, strTemplate10_else: NULL as fractional_sum, diff --git a/testgen/template/flavors/snowflake/profiling/project_profiling_query_snowflake.yaml 
b/testgen/template/flavors/snowflake/profiling/project_profiling_query_snowflake.yaml index 44e2cd5e..292dcb38 100644 --- a/testgen/template/flavors/snowflake/profiling/project_profiling_query_snowflake.yaml +++ b/testgen/template/flavors/snowflake/profiling/project_profiling_query_snowflake.yaml @@ -52,6 +52,7 @@ strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) a COUNT( CASE WHEN "{COL_NAME}" = UPPER("{COL_NAME}") AND "{COL_NAME}" <> LOWER("{COL_NAME}") THEN 1 END) AS upper_case_ct, COUNT( CASE WHEN "{COL_NAME}" = LOWER("{COL_NAME}") AND "{COL_NAME}" <> UPPER("{COL_NAME}") THEN 1 END) AS lower_case_ct, COUNT( CASE WHEN "{COL_NAME}" = UPPER("{COL_NAME}") AND "{COL_NAME}" = LOWER("{COL_NAME}") THEN 1 END) AS non_alpha_ct, + COUNT( CASE WHEN TRANSLATE("{COL_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), 'XXXXXXXXXX') <> "{COL_NAME}" THEN 1 END) as non_printing_ct, SUM(<%IS_NUM;LEFT("{COL_NAME}", 31)%>) AS numeric_ct, SUM(<%IS_DATE;LEFT("{COL_NAME}", 26)%>) AS date_ct, CASE @@ -88,6 +89,7 @@ strTemplate05_else: NULL as distinct_std_value_ct, NULL as upper_case_ct, NULL as lower_case_ct, NULL as non_alpha_ct, + NULL as non_printing_ct, NULL as numeric_ct, NULL as date_ct, NULL as std_pattern_match, @@ -134,7 +136,7 @@ strTemplate08_else: NULL as min_value, NULL as percentile_25, NULL as percentile_50, NULL as percentile_75, -strTemplate10_N_dec: SUM(ROUND(MOD("{COL_NAME}", 1), 5)) as fractional_sum, +strTemplate10_N_dec: SUM(ROUND(ABS(MOD("{COL_NAME}", 1)), 5)) as fractional_sum, strTemplate10_else: NULL as fractional_sum, diff --git a/testgen/template/flavors/trino/profiling/project_profiling_query_trino.yaml b/testgen/template/flavors/trino/profiling/project_profiling_query_trino.yaml index 0e2dec6e..c1355afc 100644 --- a/testgen/template/flavors/trino/profiling/project_profiling_query_trino.yaml +++ 
b/testgen/template/flavors/trino/profiling/project_profiling_query_trino.yaml @@ -65,6 +65,7 @@ strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) a WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', '') = "{COL_NAME}" THEN 1 ELSE 0 END) AS non_alpha_ct, + COUNT( CASE WHEN TRANSLATE("{COL_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), 'XXXXXXXXXX') <> "{COL_NAME}" THEN 1 END) as non_printing_ct, SUM(fndk_isnum(SUBSTRING("{COL_NAME}", 1, 31))) AS numeric_ct, SUM(fndk_isdate(SUBSTRING("{COL_NAME}", 1, 26))) AS date_ct, CASE @@ -101,6 +102,7 @@ strTemplate05_else: NULL as distinct_std_value_ct, NULL as upper_case_ct, NULL as lower_case_ct, NULL as non_alpha_ct, + NULL as non_printing_ct, NULL as numeric_ct, NULL as date_ct, NULL as std_pattern_match, @@ -146,7 +148,8 @@ strTemplate08_else: NULL as min_value, NULL as percentile_25, NULL as percentile_50, NULL as percentile_75, -strTemplate10_N_dec: SUM(ROUND(MOD("{COL_NAME}", 1), 5)) as fractional_sum, +strTemplate10_N_dec: SUM(ROUND(ABS(MOD("{COL_NAME}", 1)), 5)) as fractional_sum, + strTemplate10_else: NULL as fractional_sum, strTemplate11_D: CASE WHEN MIN("{COL_NAME}") IS NULL THEN NULL diff --git a/testgen/template/gen_funny_cat_tests/gen_test_constant.sql b/testgen/template/gen_funny_cat_tests/gen_test_constant.sql index 3f28dc5c..4270d713 100644 --- a/testgen/template/gen_funny_cat_tests/gen_test_constant.sql +++ b/testgen/template/gen_funny_cat_tests/gen_test_constant.sql @@ -55,6 +55,8 @@ WITH last_run AS (SELECT r.table_groups_id, MAX(run_date) AS last_run_date INNER JOIN profile_results p ON (rr.table_groups_id = p.table_groups_id AND rr.run_date = p.run_date) + -- No Dates as constants + WHERE NOT (p.general_type = 'D' AND rr.run_rank = 1) GROUP BY p.schema_name, p.table_name, p.column_name HAVING SUM(CASE WHEN distinct_value_ct = 1 THEN 0 ELSE 1 END) = 0 AND 
SUM(CASE WHEN max_length < 100 THEN 0 ELSE 1 END) = 0 @@ -67,7 +69,9 @@ WITH last_run AS (SELECT r.table_groups_id, MAX(run_date) AS last_run_date WHEN p.general_type = 'B' AND p.boolean_true_ct = 0 AND p.distinct_value_ct = 1 THEN 'FALSE' - END ) = 1 ), + END ) = 1 + -- Only constant if more than one profiling result + AND COUNT(*) > 1), newtests AS ( SELECT 'Constant'::VARCHAR AS test_type, '{TEST_SUITE_ID}'::UUID AS test_suite_id, c.profile_run_id, diff --git a/testgen/template/gen_funny_cat_tests/gen_test_distinct_value_ct.sql b/testgen/template/gen_funny_cat_tests/gen_test_distinct_value_ct.sql index 75e63cef..ab939339 100644 --- a/testgen/template/gen_funny_cat_tests/gen_test_distinct_value_ct.sql +++ b/testgen/template/gen_funny_cat_tests/gen_test_distinct_value_ct.sql @@ -56,7 +56,7 @@ WITH last_run AS (SELECT r.table_groups_id, MAX(run_date) AS last_run_date AND rr.run_date = p.run_date) GROUP BY p.schema_name, p.table_name, p.column_name HAVING SUM(CASE WHEN distinct_value_ct = 1 THEN 0 ELSE 1 END) = 0 - AND COUNT(DISTINCT CASE + AND (COUNT(DISTINCT CASE WHEN p.general_type = 'A' THEN min_text WHEN p.general_type = 'N' THEN min_value::VARCHAR WHEN p.general_type IN ('D','T') THEN min_date::VARCHAR @@ -65,7 +65,9 @@ WITH last_run AS (SELECT r.table_groups_id, MAX(run_date) AS last_run_date WHEN p.general_type = 'B' AND p.boolean_true_ct = 0 AND p.distinct_value_ct = 1 THEN 'FALSE' - END ) > 1 ), + END ) > 1 + -- include cases with only single profiling result -- can't yet assume constant + OR COUNT(*) = 1)), newtests AS ( SELECT 'Distinct_Value_Ct'::VARCHAR AS test_type, '{TEST_SUITE_ID}'::UUID AS test_suite_id, c.table_groups_id, c.profile_run_id, diff --git a/testgen/template/gen_query_tests/gen_dupe_rows_test.sql b/testgen/template/gen_query_tests/gen_dupe_rows_test.sql new file mode 100644 index 00000000..5027c111 --- /dev/null +++ b/testgen/template/gen_query_tests/gen_dupe_rows_test.sql @@ -0,0 +1,46 @@ +INSERT INTO test_definitions (table_groups_id, 
profile_run_id, test_type, test_suite_id, + schema_name, table_name, + skip_errors, test_active, last_auto_gen_date, profiling_as_of_date, + groupby_names ) +WITH last_run AS (SELECT r.table_groups_id, MAX(run_date) AS last_run_date + FROM profile_results p + INNER JOIN profiling_runs r + ON (p.profile_run_id = r.id) + INNER JOIN test_suites ts + ON p.project_code = ts.project_code + AND p.connection_id = ts.connection_id + WHERE p.project_code = '{PROJECT_CODE}' + AND r.table_groups_id = '{TABLE_GROUPS_ID}'::UUID + AND ts.id = '{TEST_SUITE_ID}' + AND p.run_date::DATE <= '{AS_OF_DATE}' + GROUP BY r.table_groups_id), + curprof AS (SELECT p.schema_name, p.table_name, p.profile_run_id, + STRING_AGG(QUOTE_IDENT(p.column_name), ', ' ORDER BY p.position) as unique_by_columns + FROM last_run lr + INNER JOIN profile_results p + ON (lr.table_groups_id = p.table_groups_id + AND lr.last_run_date = p.run_date) + GROUP BY p.schema_name, p.table_name, p.profile_run_id), + locked AS (SELECT schema_name, table_name + FROM test_definitions + WHERE table_groups_id = '{TABLE_GROUPS_ID}'::UUID + AND test_suite_id = '{TEST_SUITE_ID}' + AND test_type = '{TEST_TYPE}' + AND lock_refresh = 'Y'), + newtests AS (SELECT * + FROM curprof + WHERE schema_name = '{DATA_SCHEMA}') +SELECT '{TABLE_GROUPS_ID}'::UUID as table_groups_id, + n.profile_run_id, + 'Dupe_Rows' AS test_type, + '{TEST_SUITE_ID}' AS test_suite_id, + n.schema_name, n.table_name, + 0 as skip_errors, 'Y' as test_active, + '{RUN_DATE}'::TIMESTAMP as last_auto_gen_date, + '{AS_OF_DATE}'::TIMESTAMP as profiling_as_of_date, + unique_by_columns as groupby_columns +FROM newtests n +LEFT JOIN locked l + ON (n.schema_name = l.schema_name + AND n.table_name = l.table_name) +WHERE l.schema_name IS NULL; diff --git a/testgen/template/get_entities/get_connection.sql b/testgen/template/get_entities/get_connection.sql index 30621ea1..035f9304 100644 --- a/testgen/template/get_entities/get_connection.sql +++ 
b/testgen/template/get_entities/get_connection.sql @@ -15,6 +15,7 @@ SELECT connect_by_url, connect_by_key, private_key, - private_key_passphrase + private_key_passphrase, + http_path FROM connections WHERE connection_id = {CONNECTION_ID}; diff --git a/testgen/template/get_entities/get_profile_screen.sql b/testgen/template/get_entities/get_profile_screen.sql index 8598f5ef..cdd42a09 100644 --- a/testgen/template/get_entities/get_profile_screen.sql +++ b/testgen/template/get_entities/get_profile_screen.sql @@ -37,7 +37,7 @@ WITH p.column_name, p.column_type, 'Non-Standard Blank Values' AS qualification_test, - (((('Filled Values: ' || p.filled_value_ct::VARCHAR(10)) || ', Null: ') || + (((('Dummy Values: ' || p.filled_value_ct::VARCHAR(10)) || ', Null: ') || p.null_value_ct::VARCHAR(10)) || ', Empty String: ') || p.zero_length_ct::VARCHAR(10) AS detail FROM profiling p @@ -50,7 +50,7 @@ WITH p.column_type, 'Invalid Zip Code Format' AS qualification_test, (((('Min Length: ' || p.min_length::VARCHAR(10)) || ', Max Length: ') || - p.max_length::VARCHAR(10)) || ', Filled Values: ') || + p.max_length::VARCHAR(10)) || ', Dummy Values: ') || p.filled_value_ct::VARCHAR(10) AS detail FROM profiling p WHERE p.column_name ILIKE '%zip%' @@ -95,7 +95,7 @@ WITH p.column_name, p.column_type, 'No column values present' AS qualification_test, - (((('Null: ' || p.null_value_ct::VARCHAR(10)) || ', Filled: ') || + (((('Null: ' || p.null_value_ct::VARCHAR(10)) || ', Dummy: ') || p.filled_value_ct::VARCHAR(10)) || ', Zero Len: ') || p.zero_length_ct::VARCHAR(10) AS detail FROM profiling p diff --git a/testgen/template/get_entities/get_test_info.sql b/testgen/template/get_entities/get_test_info.sql index 2bd589a5..feb0cfb8 100644 --- a/testgen/template/get_entities/get_test_info.sql +++ b/testgen/template/get_entities/get_test_info.sql @@ -29,6 +29,8 @@ Optional: last_auto_run_date (==test-gen-run-id==), schema-name, table-name, col td.baseline_sum, td.baseline_avg, td.baseline_sd, + 
td.lower_tolerance, + td.upper_tolerance, td.subset_condition, td.check_result, td.last_auto_gen_date, diff --git a/testgen/template/profiling/datatype_suggestions.sql b/testgen/template/profiling/datatype_suggestions.sql index 9a117f27..d0af2a48 100644 --- a/testgen/template/profiling/datatype_suggestions.sql +++ b/testgen/template/profiling/datatype_suggestions.sql @@ -1,56 +1,151 @@ -UPDATE profile_results - SET datatype_suggestion = - CASE - WHEN record_ct > 500 AND column_name not ILIKE '%id' THEN - CASE - WHEN general_type = 'A' AND column_name ILIKE '%zip%' - AND max_length <= 10 THEN 'VARCHAR(10)' - WHEN general_type = 'A' - AND numeric_ct > 0 - AND value_ct = numeric_ct + zero_length_ct - AND POSITION('.' in top_freq_values) > 0 THEN 'DECIMAL(18,4)' - WHEN general_type = 'A' - AND numeric_ct > 0 - AND value_ct = numeric_ct + zero_length_ct - AND max_length <= 6 - AND POSITION('.' in top_freq_values) = 0 THEN 'INTEGER' - WHEN general_type = 'A' - AND numeric_ct > 0 - AND value_ct = numeric_ct + zero_length_ct - AND max_length > 6 - AND POSITION('.' 
in top_freq_values) = 0 THEN 'BIGINT' - WHEN general_type = 'A' - AND date_ct > 0 - AND value_ct = date_ct + zero_length_ct THEN 'DATE' - WHEN general_type = 'A' - AND max_length <= 5 THEN 'VARCHAR(10)' - WHEN general_type = 'A' - AND max_length IS NOT NULL - THEN 'VARCHAR(' - || ( (1 + TRUNC( (max_length + 10) /20.0, 0)) * 20)::VARCHAR(10) - || ')' - WHEN general_type = 'N' - AND RTRIM(SPLIT_PART(column_type, ',', 2),')') > '0' - AND fractional_sum = 0 - AND min_value >= -100 - AND max_value <= 100 - THEN 'SMALLINT' - WHEN general_type = 'N' - AND RTRIM(SPLIT_PART(column_type, ',', 2),')') > '0' - AND fractional_sum = 0 - AND min_value >= -100000000 - AND max_value <= 100000000 - THEN 'INTEGER' - WHEN general_type = 'N' - AND RTRIM(SPLIT_PART(column_type, ',', 2),')') > '0' - AND fractional_sum = 0 - AND (min_value < -100000000 - OR max_value > 100000000) - THEN 'BIGINT' - ELSE LOWER(column_type) - END - ELSE LOWER(column_type) - END - WHERE project_code = '{PROJECT_CODE}' - AND schema_name = '{DATA_SCHEMA}' - AND run_date = '{RUN_DATE}'; +UPDATE profile_results pr +SET datatype_suggestion = + CASE + WHEN pr.record_ct > 500 + AND pr.column_name NOT ILIKE '%id' + THEN + CASE base.general_type + WHEN 'A' THEN + CASE + -- ZIP codes + WHEN pr.column_name ILIKE '%zip%' + AND pr.max_length <= 10 + THEN 'VARCHAR(' + || COALESCE(LEAST(10, base.current_size), 10)::text + || ')' + + -- Small and Predictable + WHEN pr.functional_data_type IN ('State', 'Boolean') + THEN 'VARCHAR(' || max_length::VARCHAR || ')' + + WHEN pr.functional_data_type = 'Measurement Pct' + THEN 'VARCHAR(' + || COALESCE(GREATEST(6, max_length), 6)::text + || ')' + + -- DECIMALs + WHEN pr.numeric_ct > 0 + AND pr.value_ct = pr.numeric_ct + pr.zero_length_ct + AND POSITION('.' IN pr.top_freq_values) > 0 + THEN 'DECIMAL(18,4)' + + -- small/big integers + WHEN pr.numeric_ct > 0 + AND pr.value_ct = pr.numeric_ct + pr.zero_length_ct + AND pr.max_length <= 6 + AND POSITION('.' 
IN pr.top_freq_values) = 0 + THEN 'INTEGER' + WHEN pr.numeric_ct > 0 + AND pr.value_ct = pr.numeric_ct + pr.zero_length_ct + AND pr.max_length > 6 + AND POSITION('.' IN pr.top_freq_values) = 0 + THEN 'BIGINT' + + -- timestamps with zone + WHEN pr.date_ct > 0 + AND pr.value_ct = pr.date_ct + pr.zero_length_ct + AND POSITION('+' IN pr.top_freq_values) > 0 + THEN CASE + WHEN '{SQL_FLAVOR}' = 'redshift' THEN 'TIMESTAMPTZ' + WHEN '{SQL_FLAVOR}' = 'postgresql' THEN 'TIMESTAMPTZ' + WHEN '{SQL_FLAVOR}' = 'snowflake' THEN 'TIMESTAMP_TZ' + WHEN '{SQL_FLAVOR}' LIKE 'mssql%' THEN 'DATETIMEOFFSET' + WHEN '{SQL_FLAVOR}' = 'databricks' THEN 'TIMESTAMP' + WHEN '{SQL_FLAVOR}' = 'bigquery' THEN 'TIMESTAMP' + ELSE 'TIMESTAMPTZ' + END + + -- timestamps without zone + WHEN pr.date_ct > 0 + AND pr.value_ct = pr.date_ct + pr.zero_length_ct + AND POSITION(':' IN pr.top_freq_values) > 0 + THEN CASE + WHEN '{SQL_FLAVOR}' = 'redshift' THEN 'TIMESTAMP' + WHEN '{SQL_FLAVOR}' = 'postgresql' THEN 'TIMESTAMP' + WHEN '{SQL_FLAVOR}' = 'snowflake' THEN 'TIMESTAMP_NTZ' + WHEN '{SQL_FLAVOR}' LIKE 'mssql%' THEN 'DATETIME2' + WHEN '{SQL_FLAVOR}' = 'databricks' THEN 'TIMESTAMP_NTZ' + WHEN '{SQL_FLAVOR}' = 'bigquery' THEN 'DATETIME' + ELSE 'TIMESTAMP_NTZ' + END + + -- pure dates + WHEN pr.date_ct > 0 + AND pr.value_ct = pr.date_ct + pr.zero_length_ct + THEN 'DATE' + + -- very short text → suggest VARCHAR(10) + WHEN pr.max_length <= 5 + THEN 'VARCHAR(' + || COALESCE(LEAST(10, base.current_size), 10)::text + || ')' + + -- fallback text → adaptive bucket + WHEN pr.max_length IS NOT NULL + THEN + 'VARCHAR(' + || COALESCE( + LEAST( + -- computed_bucket: + (CASE + WHEN pr.max_length <= 50 + THEN CEIL((pr.max_length + 5)/10.0) * 10 + ELSE ((1 + TRUNC((pr.max_length + 10)/20.0, 0)) * 20) + END)::int, + base.current_size + ), + -- fallback if current_size IS NULL + (CASE + WHEN pr.max_length <= 50 + THEN CEIL(pr.max_length/10.0) * 10 + ELSE ((1 + TRUNC((pr.max_length + 10)/20.0, 0)) * 20) + END)::int + )::text + || 
')' + + ELSE + lower(pr.column_type) + END + + WHEN 'N' THEN + CASE + WHEN RTRIM(SPLIT_PART(pr.column_type, ',', 2),')') > '0' + AND pr.fractional_sum = 0 + AND pr.min_value >= -100 + AND pr.max_value <= 100 + THEN 'SMALLINT' + + WHEN RTRIM(SPLIT_PART(pr.column_type, ',', 2),')') > '0' + AND pr.fractional_sum = 0 + AND pr.min_value >= -100000000 + AND pr.max_value <= 100000000 + THEN 'INTEGER' + + WHEN RTRIM(SPLIT_PART(pr.column_type, ',', 2),')') > '0' + AND pr.fractional_sum = 0 + AND (pr.min_value < -100000000 + OR pr.max_value > 100000000) + THEN 'BIGINT' + + ELSE + lower(pr.column_type) + END + + ELSE + lower(pr.column_type) + END + ELSE + lower(pr.column_type) + END +FROM ( + SELECT + id, + general_type, + -- pull out declared size if present, else NULL + CAST(substring(column_type FROM '\((\d+)\)') AS int) AS current_size + FROM profile_results + WHERE project_code = '{PROJECT_CODE}' + AND schema_name = '{DATA_SCHEMA}' + AND run_date = '{RUN_DATE}' +) AS base +WHERE pr.id = base.id; diff --git a/testgen/template/profiling/functional_datatype.sql b/testgen/template/profiling/functional_datatype.sql index 97d28b82..da853219 100644 --- a/testgen/template/profiling/functional_datatype.sql +++ b/testgen/template/profiling/functional_datatype.sql @@ -126,7 +126,8 @@ SET functional_data_type = 'DateTime Stamp' WHERE profile_run_id = '{PROFILE_RUN_ID}' AND functional_data_type IS NULL AND distinct_pattern_ct = 1 - AND TRIM(SPLIT_PART(top_patterns, '|', 2)) = 'NNNN-NN-NN NN:NN:NN'; + AND (TRIM(SPLIT_PART(top_patterns, '|', 2)) = 'NNNN-NN-NN NN:NN:NN' + OR TRIM(SPLIT_PART(top_patterns, '|', 2)) = 'NNNN-NN-NNANN:NN:NN+NN:NN'); -- Process Timestamp UPDATE profile_results @@ -306,7 +307,7 @@ INNER JOIN profile_results s AND LOWER(c.column_name) SIMILAR TO '%c(|i)ty%' AND c.functional_data_type NOT IN ('State', 'Zip') AND profile_results.id = c.id; - + -- Assign Name UPDATE profile_results SET functional_data_type = 'Person Full Name' @@ -476,6 +477,17 @@ SET 
functional_data_type = WHEN ROUND(100.0 * value_ct::FLOAT/NULLIF(record_ct, 0)) > 70 THEN 'ID' ELSE 'Attribute-Numeric' END + WHEN general_type='N' + AND ( -- Sparsity condition: mostly zero + (percentile_25 = 0 AND percentile_75 = 0 AND percentile_50 = 0) + OR + -- Sparsity condition: mostly NULL + (value_ct > 0 AND record_ct > 0 + AND (value_ct::FLOAT / record_ct::FLOAT) < 0.05) ) + AND ( -- Evidence of extreme non-zero values + (percentile_75 - percentile_25) > 2 * ABS(avg_value) + OR ABS(avg_value) > 5 * ABS(percentile_50) ) THEN 'Measurement Spike' + WHEN general_type='N' AND ( column_type ILIKE '%int%' OR diff --git a/testgen/template/profiling/project_update_profile_results_to_estimates.sql b/testgen/template/profiling/project_update_profile_results_to_estimates.sql index 48d2d61c..640829cf 100644 --- a/testgen/template/profiling/project_update_profile_results_to_estimates.sql +++ b/testgen/template/profiling/project_update_profile_results_to_estimates.sql @@ -22,8 +22,7 @@ set sample_ratio = {PROFILE_SAMPLE_RATIO}, within_1yr_date_ct = ROUND(within_1yr_date_ct * {PROFILE_SAMPLE_RATIO}, 0), within_1mo_date_ct = ROUND(within_1mo_date_ct * {PROFILE_SAMPLE_RATIO}, 0), future_date_ct = ROUND(future_date_ct * {PROFILE_SAMPLE_RATIO}, 0), - boolean_true_ct = ROUND(boolean_true_ct * {PROFILE_SAMPLE_RATIO}, 0), - date_days_present = ROUND(date_days_present * {PROFILE_SAMPLE_RATIO}, 0) + boolean_true_ct = ROUND(boolean_true_ct * {PROFILE_SAMPLE_RATIO}, 0) where profile_run_id = '{PROFILE_RUN_ID}' and schema_name = split_part('{SAMPLING_TABLE}', '.', 1) and table_name = split_part('{SAMPLING_TABLE}', '.', 2) diff --git a/testgen/ui/app.py b/testgen/ui/app.py index 36bff7b5..de4f8d0e 100644 --- a/testgen/ui/app.py +++ b/testgen/ui/app.py @@ -3,6 +3,7 @@ import streamlit as st from testgen import settings +from testgen.common import version_service from testgen.common.docker_service import check_basic_configuration from testgen.common.models import with_database_session 
from testgen.ui import bootstrap @@ -51,9 +52,12 @@ def render(log_level: int = logging.INFO): testgen.sidebar( projects=project_service.get_projects(), current_project=session.sidebar_project, - menu=application.menu.update_version(application.get_version()), - username=session.username, + menu=application.menu, current_page=session.current_page, + username=session.username, + role=session.auth_role, + version=version_service.get_version(), + support_email=settings.SUPPORT_EMAIL, ) application.router.run() diff --git a/testgen/ui/assets/scripts.js b/testgen/ui/assets/scripts.js index 45da923a..46e0aafb 100644 --- a/testgen/ui/assets/scripts.js +++ b/testgen/ui/assets/scripts.js @@ -2,10 +2,6 @@ import van from './static/js/van.min.js'; window.van = van; -window.addEventListener('load', function() { - removeElements([ 'header[data-testid="stHeader"]' ]); -}); - window.addEventListener('message', async function(event) { if (event.data.type === 'TestgenCopyToClipboard') { await copyToClipboard(event.data.text || ''); diff --git a/testgen/ui/assets/style.css b/testgen/ui/assets/style.css index a57b453c..420f9605 100644 --- a/testgen/ui/assets/style.css +++ b/testgen/ui/assets/style.css @@ -33,6 +33,8 @@ body { --portal-background: white; --portal-box-shadow: rgba(0, 0, 0, 0.16) 0px 4px 16px; --select-hover-background: rgb(240, 242, 246); + + --app-background-color: #f8f9fa; } img.dk-logo-img { @@ -40,18 +42,51 @@ img.dk-logo-img { width: 100%; } -/* Streamlit header */ -header { +/* Header */ +.stAppHeader { + width: 85px !important; /* allows clicking on the breadcrumbs */ + left: calc(24px - 1rem) !important; + background: transparent !important; + min-height: unset !important; + overflow: hidden !important; /* hides the running man animation */ +} + +/* - with breadcrumbs */ +.stAppHeader:has(~ .stMain .st-key-testgen-breadcrumbs) { + height: 65px !important; + top: 5px !important; +} + +/* - without breadcrumbs */ +.stAppHeader:not(:has(~ .stMain 
.st-key-testgen-breadcrumbs)) { + top: 7px !important; + height: 39.59px !important; +} + +/* hide while sidebar expanded */ +.stApp:has(.stSidebar[aria-expanded="true"]) .stAppHeader { + display: none; +} + +.stStatusWidget { display: none !important; } -/* ... */ +/* End Header */ + +#stDecoration { + visibility: hidden; +} /* Sidebar */ -[data-testid="stSidebarHeader"] { +[data-testid="stSidebarContent"] [data-testid="stSidebarHeader"] { padding: 16px 20px; } -section[data-testid="stSidebar"] { +[data-testid="stSidebarHeader"] .stLogo { + max-width: fit-content; +} + +section.stSidebar { width: 250px; z-index: 999; background-color: var(--sidebar-background-color); @@ -64,30 +99,18 @@ section[data-testid="stSidebar"] { /* */ /* Main content */ -div[data-testid="stAppViewContainer"] > :nth-child(2 of section) { - background-color: #f8f9fa; +.stMain { + background-color: var(--app-background-color); } -div[data-testid="stMainBlockContainer"] { +.stMain > .stMainBlockContainer { padding: 12px 24px 24px; } -div[data-testid="stVerticalBlock"] { +.stVerticalBlock[data-testid="stVerticalBlock"] { gap: 0.5rem; } -div[data-testid="stAppViewContainer"]:has(section[data-testid="stSidebar"]) div[data-testid="stSidebarCollapsedControl"] { - top: 0.5rem; - border-radius: 4px; - background-color: var(--border-color); - padding: 3px 0 0 8px; -} - -div[data-testid="stAppViewContainer"]:has(section[data-testid="stSidebar"][aria-expanded="true"]) div[data-testid="stSidebarCollapsedControl"] { - display: none; -} -/* */ - /* Dialog - sets the width of all st.dialog */ /* There is no way to target "large" and "small" dialogs reliably */ div[data-testid="stDialog"] div[role="dialog"] { @@ -121,6 +144,8 @@ div.st-key-data_catalog-spinner { } /* Theming for buttons, tabs and form inputs */ +button[data-testid="stPopoverButton"]:hover, +button[data-testid="stPopoverButton"]:focus:not(:active), button[data-testid="stBaseButton-secondary"]:hover, 
button[data-testid="stBaseButton-secondary"]:focus:not(:active), button[data-testid="stBaseButton-secondaryFormSubmit"]:hover, @@ -129,6 +154,7 @@ button[data-testid="stBaseButton-secondaryFormSubmit"]:focus:not(:active) { color: var(--primary-color); } +button[data-testid="stPopoverButton"]:active, button[data-testid="stBaseButton-secondary"]:active, button[data-testid="stBaseButton-secondaryFormSubmit"]:active, label[data-baseweb="checkbox"]:has(input[aria-checked="true"]) > span { @@ -196,11 +222,12 @@ button[title="Show password text"] { } /* ... */ -[data-testid="stVerticalBlockBorderWrapper"]:has(> div > div[data-testid="stVerticalBlock"] > div.element-container > div.stHtml > i.bg-white) { +.stVerticalBlock:has(> div.stElementContainer > div.stHtml > i.bg-white), +[data-testid="stVerticalBlockBorderWrapper"]:has(> .stVerticalBlock > .stElementContainer > div.stHtml > i.bg-white) { background-color: var(--dk-card-background); } -div[data-testid="stVerticalBlockBorderWrapper"]:has(> div > div[data-testid="stVerticalBlock"] > div.element-container > div.stHtml > i.flex-row) > div > [data-testid="stVerticalBlock"] { +.stVerticalBlock:has(> div.stElementContainer > div.stHtml > i.flex-row) { width: 100%; flex-direction: row; } @@ -211,19 +238,19 @@ div[data-testid="stVerticalBlockBorderWrapper"]:has( > div > div[data-testid="st max-height: 40px; } -div[data-testid="stVerticalBlockBorderWrapper"]:has( > div > div[data-testid="stVerticalBlock"] > div.element-container > div.stHtml > i.flex-start) [data-testid="stVerticalBlock"] { +.stVerticalBlock:has(> div.stElementContainer > div.stHtml > i.flex-start) { justify-content: flex-start; } -div[data-testid="stVerticalBlockBorderWrapper"]:has( > div > div[data-testid="stVerticalBlock"] > div.element-container > div.stHtml > i.flex-end) [data-testid="stVerticalBlock"] { +.stVerticalBlock:has(> div.stElementContainer > div.stHtml > i.flex-end) { justify-content: flex-end; } 
-div[data-testid="stVerticalBlockBorderWrapper"]:has( > div > div[data-testid="stVerticalBlock"] > div.element-container > div.stHtml > i.flex-center) [data-testid="stVerticalBlock"] { +.stVerticalBlock:has(> div.stElementContainer > div.stHtml > i.flex-center) { justify-content: center; } -[data-testid="stVerticalBlock"]:has(> div.element-container > div.stHtml > i.no-flex-gap) { +.stVerticalBlock:has(> div.stElementContainer > div.stHtml > i.no-flex-gap) { gap: unset; } @@ -289,8 +316,13 @@ Use as testgen.text("text", "extra_styles") */ transition: padding 0.3s; } -[data-testid="stSidebar"][aria-expanded="false"] ~ [data-testid="stMain"] .tg-header { - padding-left: 80px; +.st-key-testgen-breadcrumbs { + transition: padding 0.3s; +} + +[data-testid="stSidebar"][aria-expanded="false"] ~ div > [data-testid="stMain"] .tg-header, +[data-testid="stSidebar"][aria-expanded="false"] ~ div > [data-testid="stMain"] .st-key-testgen-breadcrumbs { + padding-left: 85px; } .tg-header--line { @@ -301,17 +333,54 @@ Use as testgen.text("text", "extra_styles") */ background-color: var(--disabled-text-color); } -div[data-testid="stVerticalBlockBorderWrapper"]:has(> div > div[data-testid="stVerticalBlock"] > div.element-container > div.stHtml > i.tg-header--links) [data-testid="stLinkButton"] a { +/* Help menu */ +.st-key-tg-header--help [data-testid="stPageLink"] { + position: absolute; + top: -7px; + right: 0; + z-index: 5; +} + +.st-key-tg-header--help [data-testid="stPageLink"] [data-testid="stPageLink-NavLink"] { + line-height: 1; +} + +.st-key-tg-header--help [data-testid="stPopover"] { + display: flex; + justify-content: flex-end; +} + +.st-key-tg-header--help button[data-testid="stPopoverButton"] { border: none; background: none; - padding: 6px; - min-height: 24px; - color: var(--primary-text-color); + padding: 0; + margin-top: 8px; + min-height: fit-content; } -div[data-testid="stVerticalBlockBorderWrapper"]:has(> div > div[data-testid="stVerticalBlock"] > 
div.element-container > div.stHtml > i.tg-header--links) [data-testid="stLinkButton"] a p { - font-size: 20px; - line-height: 1; +.st-key-tg-header--help button[data-testid="stPopoverButton"]:focus:not(:hover) { + color: inherit; +} + +.st-key-tg-header--help-dummy [data-testid="stMarkdownContainer"] p { + display: flex; + align-items: center; + margin-top: 8px; + min-height: fit-content; +} + +.st-key-tg-header--help-dummy p span { + width: 1.25rem; + height: 1.25rem; + font-size: 1.25rem; + line-height: 1.25rem; + margin-top: 0.125rem; + margin-left: 0.125rem; + margin-right: -0.3125rem; +} + +div[data-testid="stPopoverBody"]:has(i.tg-header--help-wrapper) { + padding: 0; } /* */ @@ -367,6 +436,48 @@ div[data-testid="stVerticalBlockBorderWrapper"]:has(> div > div[data-testid="stV } /* */ +/* Export Menu */ +.st-key-tg--export-popover [data-testid="stPopoverButton"] > div:last-child { + display: none; +} + +.st-key-tg--export-popover [data-testid="stPopover"] { + width: auto; +} + +div[data-testid="stPopoverBody"]:has(i.tg--export-wrapper) { + min-width: 150px; + border-radius: 8px; + padding: 0; +} + +div[data-testid="stPopoverBody"] [data-testid="stVerticalBlock"]:has(i.tg--export-wrapper) { + gap: 0; +} + +div[data-testid="stPopoverBody"] [data-testid="stVerticalBlock"]:has(i.tg--export-wrapper) button { + width: 100%; + padding: 4px 16px; + justify-content: flex-start; + border-radius: 0; +} + +div[data-testid="stPopoverBody"] [data-testid="stVerticalBlock"]:has(i.tg--export-wrapper) [data-testid="stElementContainer"]:nth-child(2) button { + border-top-left-radius: 8px; + border-top-right-radius: 8px; +} + +div[data-testid="stPopoverBody"] [data-testid="stVerticalBlock"]:has(i.tg--export-wrapper) [data-testid="stElementContainer"]:last-child button { + border-bottom-left-radius: 8px; + border-bottom-right-radius: 8px; +} + +div[data-testid="stPopoverBody"] [data-testid="stVerticalBlock"]:has(i.tg--export-wrapper) button:hover { + color: unset; + background: 
var(--select-hover-background); +} +/* */ + /* Dark mode */ @media (prefers-color-scheme: dark) { body { @@ -387,20 +498,16 @@ div[data-testid="stVerticalBlockBorderWrapper"]:has(> div > div[data-testid="stV --portal-background: #14181f; --portal-box-shadow: rgba(0, 0, 0, 0.95) 0px 4px 16px; --select-hover-background: rgba(255, 255, 255, .32); - } - /* Main content */ - div[data-testid="stAppViewContainer"] > :nth-child(2 of section) { - background-color: rgb(14, 17, 23); + --app-background-color: rgb(14, 17, 23); } - /* */ div[data-modal-container='true']::before { background-color: rgba(100, 100, 100, 0.5) !important; } div[data-modal-container='true'] > div:first-child > div:first-child { - background-color: rgb(14, 17, 23) !important; + background-color: var(--app-background-color) !important; } } /* ... */ diff --git a/testgen/ui/bootstrap.py b/testgen/ui/bootstrap.py index 6b0fed7a..3b048414 100644 --- a/testgen/ui/bootstrap.py +++ b/testgen/ui/bootstrap.py @@ -2,12 +2,10 @@ import logging from testgen import settings -from testgen.commands.run_upgrade_db_config import get_schema_revision -from testgen.common import configure_logging, version_service -from testgen.ui.navigation.menu import Menu, Version +from testgen.common import configure_logging +from testgen.ui.navigation.menu import Menu from testgen.ui.navigation.page import Page from testgen.ui.navigation.router import Router -from testgen.ui.session import session from testgen.ui.views.connections import ConnectionsPage from testgen.ui.views.data_catalog import DataCatalogPage from testgen.ui.views.hygiene_issues import HygieneIssuesPage @@ -55,17 +53,6 @@ def __init__(self, logo: plugins.Logo, router: Router, menu: Menu, logger: loggi self.menu = menu self.logger = logger - def get_version(self) -> Version: - latest_version = self.menu.version.latest - if not session.latest_version: - latest_version = version_service.get_latest_version() - - return Version( - current=settings.VERSION, - 
latest=latest_version, - schema=get_schema_revision(), - ) - def run(log_level: int = logging.INFO) -> Application: pages = [*BUILTIN_PAGES] @@ -106,11 +93,6 @@ def run(log_level: int = logging.INFO) -> Application: for page in pages if page.menu_item }.values() ), - version=Version( - current=settings.VERSION, - latest="...", - schema=get_schema_revision(), - ), ), logger=LOG, ) diff --git a/testgen/ui/components/frontend/css/shared.css b/testgen/ui/components/frontend/css/shared.css index 643b4ffb..d9ff025d 100644 --- a/testgen/ui/components/frontend/css/shared.css +++ b/testgen/ui/components/frontend/css/shared.css @@ -118,7 +118,7 @@ body { --portal-background: #14181f; --portal-box-shadow: rgba(0, 0, 0, 0.95) 0px 4px 16px; - --select-hover-background: rgba(255, 255, 255, .32); + --select-hover-background: rgb(38, 39, 48); } } @@ -149,12 +149,17 @@ body { border: var(--button-stroked-border); border-radius: 8px; padding: 16px; + box-sizing: border-box; } .table-row { padding: 12px 0; } +.table.hoverable .table-row:hover { + background-color: var(--select-hover-background); +} + .table-row:not(:last-child) { border-bottom: var(--button-stroked-border); } @@ -586,3 +591,31 @@ body { padding-left: 40px; } /* */ + +code { + position: relative; + border-radius: 0.5rem; + display: block; + margin: 0px; + overflow: auto; + padding: 24px 16px; + color: var(--primary-text-color); + background-color: var(--empty-light); +} + +code > .tg-icon { + position: absolute; + top: 21px; + right: 16px; + color: var(--secondary-text-color); + cursor: pointer; + opacity: 0; +} + +code > .tg-icon:hover { + opacity: 1; +} + +.accent-primary { + accent-color: var(--primary-color); +} diff --git a/testgen/ui/components/frontend/js/components/alert.js b/testgen/ui/components/frontend/js/components/alert.js index a797d6aa..cda6afda 100644 --- a/testgen/ui/components/frontend/js/components/alert.js +++ b/testgen/ui/components/frontend/js/components/alert.js @@ -4,6 +4,7 @@ * @property 
{string?} icon * @property {number?} timeout * @property {boolean?} closeable + * @property {string?} class * @property {'info'|'success'|'warn'|'error'} type */ import van from '../van.min.js'; @@ -12,12 +13,6 @@ import { Icon } from './icon.js'; import { Button } from './button.js'; const { div } = van.tags; -const alertTypeColors = { - info: {backgroundColor: 'rgba(28, 131, 225, 0.1)', color: 'rgb(0, 66, 128)'}, - success: {backgroundColor: 'rgba(33, 195, 84, 0.1)', color: 'rgb(23, 114, 51)'}, - warn: {backgroundColor: 'rgba(255, 227, 18, 0.2)', color: 'rgb(255, 255, 194)'}, - error: {backgroundColor: 'rgba(255, 43, 43, 0.09)', color: 'rgb(125, 53, 59)'}, -}; const Alert = (/** @type Properties */ props, /** @type Array */ ...children) => { loadStylesheet('alert', stylesheet); @@ -40,6 +35,10 @@ const Alert = (/** @type Properties */ props, /** @type Array */ .. }, () => { const icon = getValue(props.icon); + if (!icon) { + return ''; + } + return Icon({size: 20, classes: 'mr-2'}, icon); }, div( @@ -52,11 +51,10 @@ const Alert = (/** @type Properties */ props, /** @type Array */ .. 
return ''; } - const colors = alertTypeColors[getValue(props.type)]; return Button({ type: 'icon', icon: 'close', - style: `margin-left: auto; color: ${colors.color};`, + style: `margin-left: auto;`, }); }, ); @@ -81,26 +79,45 @@ stylesheet.replace(` color: rgb(23, 114, 51); } -.tg-alert-error { - background-color: rgba(255, 43, 43, 0.09); - color: rgb(125, 53, 59); -} - .tg-alert-warn { background-color: rgba(255, 227, 18, 0.1); color: rgb(146, 108, 5); } +.tg-alert-error { + background-color: rgba(255, 43, 43, 0.09); + color: rgb(125, 53, 59); +} + @media (prefers-color-scheme: dark) { + .tg-alert-info { + background-color: rgba(61, 157, 243, 0.2); + color: rgb(199, 235, 255); + } + + .tg-alert-success { + background-color: rgba(61, 213, 109, 0.2); + color: rgb(223, 253, 233); + } + .tg-alert-warn { background-color: rgba(255, 227, 18, 0.2); color: rgb(255, 255, 194); } + + .tg-alert-error { + background-color: rgba(255, 108, 108, 0.2); + color: rgb(255, 222, 222); + } } .tg-alert > .tg-icon { color: inherit !important; } + +.tg-alert > .tg-button { + color: inherit !important; +} `); export { Alert }; diff --git a/testgen/ui/components/frontend/js/components/attribute.js b/testgen/ui/components/frontend/js/components/attribute.js index 106bd7e0..61240f7f 100644 --- a/testgen/ui/components/frontend/js/components/attribute.js +++ b/testgen/ui/components/frontend/js/components/attribute.js @@ -5,6 +5,7 @@ * @property {string?} help * @property {string | number} value * @property {number?} width + * @property {string?} class */ import { getValue, loadStylesheet } from '../utils.js'; import { Icon } from './icon.js'; @@ -17,7 +18,7 @@ const Attribute = (/** @type Properties */ props) => { loadStylesheet('attribute', stylesheet); return div( - { style: () => `width: ${props.width ? getValue(props.width) + 'px' : 'auto'}` }, + { style: () => `width: ${props.width ? 
getValue(props.width) + 'px' : 'auto'}`, class: props.class }, div( { class: 'flex-row fx-gap-1 text-caption mb-1' }, props.label, diff --git a/testgen/ui/components/frontend/js/components/box_plot.js b/testgen/ui/components/frontend/js/components/box_plot.js index 5c3ba7e9..ef1957b9 100644 --- a/testgen/ui/components/frontend/js/components/box_plot.js +++ b/testgen/ui/components/frontend/js/components/box_plot.js @@ -12,7 +12,7 @@ */ import van from '../van.min.js'; import { getValue, loadStylesheet } from '../utils.js'; -import { colorMap } from '../display_utils.js'; +import { colorMap, formatNumber } from '../display_utils.js'; import { niceBounds } from '../axis_utils.js'; const { div } = van.tags; @@ -83,7 +83,7 @@ const BoxPlot = (/** @type Properties */ props) => { class: 'tg-box-plot--axis-tick', style: `left: ${(position - min) * 100 / range}%;` }, - position, + formatNumber(position), )), ); }, diff --git a/testgen/ui/components/frontend/js/components/button.js b/testgen/ui/components/frontend/js/components/button.js index 08b32393..d90b0034 100644 --- a/testgen/ui/components/frontend/js/components/button.js +++ b/testgen/ui/components/frontend/js/components/button.js @@ -214,10 +214,6 @@ button.tg-button.tg-warn-button.tg-stroked-button { color: var(--button-warn-stroked-text-color); background: var(--button-warn-stroked-background); } - -button.tg-button.tg-warn-button[disabled] { - color: rgba(255, 255, 255, .5) !important; -} /* ... 
*/ `); diff --git a/testgen/ui/components/frontend/js/components/checkbox.js b/testgen/ui/components/frontend/js/components/checkbox.js index 75bbe743..6e5968d8 100644 --- a/testgen/ui/components/frontend/js/components/checkbox.js +++ b/testgen/ui/components/frontend/js/components/checkbox.js @@ -8,7 +8,7 @@ * @property {boolean?} indeterminate * @property {function(boolean, Event)?} onChange * @property {number?} width - * @property {testId?} testId + * @property {string?} testId */ import van from '../van.min.js'; import { getValue, loadStylesheet } from '../utils.js'; diff --git a/testgen/ui/components/frontend/js/components/code.js b/testgen/ui/components/frontend/js/components/code.js new file mode 100644 index 00000000..4f9f6ba7 --- /dev/null +++ b/testgen/ui/components/frontend/js/components/code.js @@ -0,0 +1,43 @@ +/** + * @typedef Options + * @type {object} + * @property {string?} id + * @property {string?} testId + * @property {string?} class + */ + +import van from '../van.min.js'; +import { getRandomId } from '../utils.js'; +import { Icon } from './icon.js'; + +const { code } = van.tags; + +/** + * + * @param {Options} options + * @param {...HTMLElement} children + */ +const Code = (options, ...children) => { + const domId = options.id ?? `code-snippet-${getRandomId()}`; + const icon = 'content_copy'; + + return code( + { ...options, id: domId, class: options.class ?? '', 'data-testid': options.testId ?? 
'' }, + ...children, + Icon( + { + classes: '', + onclick: () => { + const parentElement = document.getElementById(domId); + const content = (parentElement.textContent || parentElement.innerText).replace(icon, ''); + if (content) { + navigator.clipboard.writeText(content); + } + }, + }, + 'content_copy', + ), + ); +}; + +export { Code }; diff --git a/testgen/ui/components/frontend/js/components/connection_form.js b/testgen/ui/components/frontend/js/components/connection_form.js index 5486fd91..7ed5918f 100644 --- a/testgen/ui/components/frontend/js/components/connection_form.js +++ b/testgen/ui/components/frontend/js/components/connection_form.js @@ -119,14 +119,14 @@ const ConnectionForm = (props, saveButton) => { privateKeyPhrase.val = ''; } - const flavor = getValue(props.flavors).find(f => f.value === connectionFlavor.val); + const flavor = getValue(props.flavors).find(f => f.value === connectionFlavor.rawVal); const originalURLTemplate = van.state(flavor.connection_string); - const [prefixPart, sufixPart] = originalURLTemplate.val.split('@'); + const [prefixPart, sufixPart] = originalURLTemplate.rawVal.split('@'); const connectionStringPrefix = van.state(prefixPart); const connectionStringSuffix = van.state(connection?.url ?? ''); - if (!connectionStringSuffix.val) { - connectionStringSuffix.val = formatURL(sufixPart ?? '', connectionHost.val, connectionPort.val, connectionDatabase.val); + if (!connectionStringSuffix.rawVal) { + connectionStringSuffix.val = formatURL(sufixPart ?? 
'', connectionHost.rawVal, connectionPort.rawVal, connectionDatabase.rawVal); } const updatedConnection = van.derive(() => { diff --git a/testgen/ui/components/frontend/js/components/frequency_bars.js b/testgen/ui/components/frontend/js/components/frequency_bars.js index c3ad64a3..d26073ce 100644 --- a/testgen/ui/components/frontend/js/components/frequency_bars.js +++ b/testgen/ui/components/frontend/js/components/frequency_bars.js @@ -14,7 +14,7 @@ */ import van from '../van.min.js'; import { getValue, loadStylesheet } from '../utils.js'; -import { colorMap } from '../display_utils.js'; +import { colorMap, formatNumber } from '../display_utils.js'; const { div, span } = van.tags; const defaultColor = 'teal'; @@ -67,7 +67,7 @@ const FrequencyBars = (/** @type Properties */ props) => { class: 'text-caption tg-frequency-bars--count', style: () => `width: ${width.val}px;`, }, - count, + formatNumber(count), ), div(value), ); diff --git a/testgen/ui/components/frontend/js/components/help_menu.js b/testgen/ui/components/frontend/js/components/help_menu.js new file mode 100644 index 00000000..1a364a23 --- /dev/null +++ b/testgen/ui/components/frontend/js/components/help_menu.js @@ -0,0 +1,161 @@ +/** + * @typedef Version + * @type {object} + * @property {string} edition + * @property {string} current + * @property {string} latest + * + * @typedef Permissions + * @type {object} + * @property {boolean} can_edit + * + * @typedef Properties + * @type {object} + * @property {string} page_help + * @property {string} support_email + * @property {Version} version + * @property {Permissions} permissions +*/ +import van from '../van.min.js'; +import { emitEvent, getRandomId, getValue, loadStylesheet, resizeFrameHeightOnDOMChange, resizeFrameHeightToElement } from '../utils.js'; +import { Streamlit } from '../streamlit.js'; +import { Icon } from './icon.js'; + +const { a, div, span } = van.tags; + +const baseHelpUrl = 'https://docs.datakitchen.io/articles/#!dataops-testgen-help/'; 
+const releaseNotesTopic = 'testgen-release-notes'; +const upgradeTopic = 'upgrade-testgen'; + +const slackUrl = 'https://data-observability-slack.datakitchen.io/join'; +const trainingUrl = 'https://info.datakitchen.io/data-quality-training-and-certifications'; + +const HelpMenu = (/** @type Properties */ props) => { + loadStylesheet('help-menu', stylesheet); + Streamlit.setFrameHeight(1); + window.testgen.isPage = true; + + const domId = `help-menu-${getRandomId()}`; + const version = getValue(props.version) ?? {}; + + resizeFrameHeightToElement(domId); + resizeFrameHeightOnDOMChange(domId); + + return div( + { id: domId }, + div( + { class: 'flex-column pt-3' }, + getValue(props.help_topic) + ? HelpLink(`${baseHelpUrl}${getValue(props.help_topic)}`, 'Help for this Page', 'description') + : null, + HelpLink(baseHelpUrl, 'TestGen Help', 'help'), + HelpLink(trainingUrl, 'Training Portal', 'school'), + getValue(props.permissions)?.can_edit + ? div( + { class: 'help-item', onclick: () => emitEvent('AppLogsClicked') }, + Icon({ classes: 'help-item-icon' }, 'browse_activity'), + 'Application Logs', + ) + : null, + span({ class: 'help-divider' }), + HelpLink(slackUrl, 'Slack Community', 'group'), + getValue(props.support_email) + ? HelpLink( + `mailto:${getValue(props.support_email)} + ?subject=${version.edition}: Contact Support + &body=%0D%0D%0DVersion: ${version.edition} ${version.current}`, + 'Contact Support', + 'email', + ) + : null, + span({ class: 'help-divider' }), + version.current || version.latest + ? div( + { class: 'help-version' }, + version.current + ? HelpLink(`${baseHelpUrl}${releaseNotesTopic}`, `${version.edition} ${version.current}`, null, null) + : null, + version.latest !== version.current + ? HelpLink( + `${baseHelpUrl}${upgradeTopic}`, + `New version available! ${version.latest}`, + null, + 'latest', + ) + : null, + ) + : null, + ), + ); +} + +const HelpLink = ( + /** @type string */ url, + /** @type string */ label, + /** @type string? 
*/ icon, + /** @type string */ classes = 'help-item', +) => { + return a( + { + class: classes, + href: url, + target: '_blank', + onclick: () => emitEvent('ExternalLinkClicked'), + }, + icon ? Icon({ classes: 'help-item-icon' }, icon) : null, + label, + ); +}; + +const stylesheet = new CSSStyleSheet(); +stylesheet.replace(` +.help-item { + padding: 12px 24px; + color: var(--primary-text-color); + text-decoration: none; + display: flex; + align-items: center; + gap: 8px; + cursor: pointer; + transition: 0.3s; +} + +.help-item:hover { + background-color: var(--select-hover-background); + color: var(--primary-color); +} + +.help-item-icon { + color: var(--primary-text-color); + transition: 0.3s; +} + +.help-item:hover .help-item-icon { + color: var(--primary-color); +} + +.help-divider { + height: 1px; + background-color: var(--border-color); + margin: 0 16px; +} + +.help-version { + padding: 16px 16px 8px; + display: flex; + flex-direction: column; + align-items: flex-end; + gap: 8px; +} + +.help-version > a { + color: var(--secondary-text-color); + text-decoration: none; +} + +.help-version > a.latest { + color: var(--red); +} +`); + +export { HelpMenu }; diff --git a/testgen/ui/components/frontend/js/components/icon.js b/testgen/ui/components/frontend/js/components/icon.js index 59ad154c..b4e879d7 100644 --- a/testgen/ui/components/frontend/js/components/icon.js +++ b/testgen/ui/components/frontend/js/components/icon.js @@ -4,14 +4,26 @@ * @property {number?} size * @property {string} classes */ -import { getValue, loadStylesheet } from '../utils.js'; +import { getValue, isDataURL, loadStylesheet } from '../utils.js'; import van from '../van.min.js'; -const { i } = van.tags; +const { i, img } = van.tags; const DEFAULT_SIZE = 20; const Icon = (/** @type Properties */ props, /** @type string */ icon) => { loadStylesheet('icon', stylesheet); + + if (isDataURL(getValue(icon))) { + return img( + { + width: () => getValue(props.size) || DEFAULT_SIZE, + height: () => 
getValue(props.size) || DEFAULT_SIZE, src: icon, class: () => `tg-icon tg-icon-image ${getValue(props.classes)}`, } ); } + return i( { class: () => `material-symbols-rounded tg-icon text-secondary ${getValue(props.classes)}`, diff --git a/testgen/ui/components/frontend/js/components/percent_bar.js b/testgen/ui/components/frontend/js/components/percent_bar.js index e6a53210..a0260344 100644 --- a/testgen/ui/components/frontend/js/components/percent_bar.js +++ b/testgen/ui/components/frontend/js/components/percent_bar.js @@ -10,7 +10,7 @@ */ import van from '../van.min.js'; import { getValue, loadStylesheet } from '../utils.js'; -import { colorMap } from '../display_utils.js'; +import { colorMap, formatNumber } from '../display_utils.js'; const { div, span } = van.tags; const defaultHeight = 10; @@ -25,7 +25,7 @@ const PercentBar = (/** @type Properties */ props) => { { style: () => `max-width: ${props.width ? getValue(props.width) + 'px' : '100%'};` }, div( { class: () => `tg-percent-bar--label ${value.val ? 
'' : 'text-secondary'}` }, - () => `${getValue(props.label)}: ${value.val}`, + () => `${getValue(props.label)}: ${formatNumber(value.val)}`, ), div( { diff --git a/testgen/ui/components/frontend/js/components/portal.js b/testgen/ui/components/frontend/js/components/portal.js index 072395d8..ad2287e5 100644 --- a/testgen/ui/components/frontend/js/components/portal.js +++ b/testgen/ui/components/frontend/js/components/portal.js @@ -9,6 +9,7 @@ * @property {string} target * @property {boolean?} targetRelative * @property {boolean} opened + * @property {'left' | 'right'} align * @property {(string|undefined)} style * @property {(string|undefined)} class */ @@ -18,7 +19,7 @@ import { getValue } from '../utils.js'; const { div } = van.tags; const Portal = (/** @type Options */ options, ...args) => { - const { target, targetRelative } = getValue(options); + const { target, targetRelative, align = 'left' } = getValue(options); const id = `${target}-portal`; window.testgen.portals[id] = { domId: id, targetId: target, opened: options.opened }; @@ -32,13 +33,19 @@ const Portal = (/** @type Options */ options, ...args) => { const anchorRect = anchor.getBoundingClientRect(); const top = (targetRelative ? 0 : anchorRect.top) + anchorRect.height; const left = targetRelative ? 0 : anchorRect.left; + const right = targetRelative ? 0 : (window.innerWidth - anchorRect.right); const minWidth = anchorRect.width; return div( { id, class: getValue(options.class) ?? '', - style: `position: absolute; z-index: 99; min-width: ${minWidth}px; top: ${top}px; left: ${left}px; ${getValue(options.style)}`, + style: `position: absolute; + z-index: 99; + min-width: ${minWidth}px; + top: ${top}px; + ${align === 'left' ? 
`left: ${left}px;` : `right: ${right}px;`} + ${getValue(options.style)}`, }, ...args, ); diff --git a/testgen/ui/components/frontend/js/components/score_metric.js b/testgen/ui/components/frontend/js/components/score_metric.js index 27ea3122..321caed0 100644 --- a/testgen/ui/components/frontend/js/components/score_metric.js +++ b/testgen/ui/components/frontend/js/components/score_metric.js @@ -16,7 +16,7 @@ const ScoreMetric = function( { class: 'flex-column fx-align-flex-center score-metric' }, Caption({ content: 'Score' }), span( - { style: 'font-size: 36px;' }, + { style: 'font-size: 28px;' }, score ?? '--', ), (profilingScore || testingScore) ? div( diff --git a/testgen/ui/components/frontend/js/components/select.js b/testgen/ui/components/frontend/js/components/select.js index 78903659..eb885fa2 100644 --- a/testgen/ui/components/frontend/js/components/select.js +++ b/testgen/ui/components/frontend/js/components/select.js @@ -4,6 +4,7 @@ * @property {string} label * @property {string} value * @property {boolean} selected + * @property {string?} icon * * @typedef Properties * @type {object} @@ -22,6 +23,7 @@ import van from '../van.min.js'; import { getRandomId, getValue, loadStylesheet, isState, isEqual } from '../utils.js'; import { Portal } from './portal.js'; +import { Icon } from './icon.js'; const { div, i, label, span } = van.tags; @@ -44,11 +46,9 @@ const Select = (/** @type {Properties} */ props) => { return options; }); const value = isState(props.value) ? props.value : van.state(props.value ?? null); - const valueLabel = van.derive(() => { - const currentOptions = getValue(options); - const currentValue = getValue(value); - return currentOptions?.find((op) => op.value === currentValue)?.label ?? ''; - }); + const initialSelection = options.val?.find((op) => op.value === value.val); + const valueLabel = van.state(initialSelection?.label ?? ''); + const valueIcon = van.state(initialSelection?.icon ?? 
undefined); const changeSelection = (/** @type Option */ option) => { opened.val = false; @@ -57,14 +57,19 @@ const Select = (/** @type {Properties} */ props) => { van.derive(() => { const currentOptions = getValue(options); - + const previousValue = value.oldVal; let currentValue = getValue(value); - let previousValue = value.oldVal; - if (currentOptions.find((op) => op.value === currentValue) === undefined) { - currentValue = value.val = null; + const selectedOption = currentOptions.find((op) => op.value === currentValue); + + if (selectedOption === undefined) { + currentValue = null; + setTimeout(() => value.val = null, 0.1); } if (!isEqual(currentValue, previousValue)) { + valueLabel.val = selectedOption?.label ?? ''; + valueIcon.val = selectedOption?.icon ?? undefined; + props.onChange?.(currentValue); } }); @@ -84,9 +89,12 @@ const Select = (/** @type {Properties} */ props) => { style: () => getValue(props.height) ? `height: ${getValue(props.height)}px;` : '', 'data-testid': 'select-input', }, - div( + () => div( { class: 'tg-select--field--content', 'data-testid': 'select-input-display' }, - valueLabel, + valueIcon.val + ? Icon({ classes: 'mr-2' }, valueIcon.val) + : undefined, + valueLabel.val, ), div( { class: 'tg-select--field--icon', 'data-testid': 'select-input-trigger' }, @@ -110,6 +118,9 @@ const Select = (/** @type {Properties} */ props) => { }, 'data-testid': 'select-options-item', }, + option.icon + ? 
Icon({ classes: 'mr-2' }, option.icon) + : undefined, span(option.label), ) ), @@ -118,7 +129,6 @@ const Select = (/** @type {Properties} */ props) => { ); }; - const stylesheet = new CSSStyleSheet(); stylesheet.replace(` .tg-select--label { diff --git a/testgen/ui/components/frontend/js/components/sidebar.js b/testgen/ui/components/frontend/js/components/sidebar.js index b2da6405..70b93883 100644 --- a/testgen/ui/components/frontend/js/components/sidebar.js +++ b/testgen/ui/components/frontend/js/components/sidebar.js @@ -9,38 +9,35 @@ * * @typedef Version * @type {object} + * @property {string} edition * @property {string} current * @property {string} latest - * @property {string} schema * * @typedef Menu * @type {object} * @property {Array.} items - * @property {Version} version * * @typedef Project * @type {object} * @property {string} code * @property {string} name * - * @typedef Permissions - * @type {object} - * @property {boolean} can_edit - * * @typedef Properties * @type {object} * @property {Menu} menu * @property {Project[]} projects - * @property {string} username - * @property {string} current_page * @property {string} current_project + * @property {string} current_page + * @property {string} username + * @property {string} role * @property {string} logout_path - * @property {Permissions} permissions + * @property {Version} version + * @property {string} support_email */ const van = window.top.van; const { a, button, div, i, img, label, option, select, span } = van.tags; -const PROJECT_CODE_QUERY_PARAM = "project_code" +const PROJECT_CODE_QUERY_PARAM = 'project_code'; const Sidebar = (/** @type {Properties} */ props) => { if (Sidebar.StreamlitInstance) { @@ -76,7 +73,11 @@ const Sidebar = (/** @type {Properties} */ props) => { }, ), div( - span({class: 'menu--username'}, props.username), + div( + { class: 'menu--user' }, + span({class: 'menu--username', title: props.username}, props.username), + span({class: 'menu--role'}, props.role.val?.replace('_', 
' ')), + ), div( { class: 'menu--buttons' }, button( @@ -87,15 +88,16 @@ const Sidebar = (/** @type {Properties} */ props) => { i({class: 'material-symbols-rounded'}, 'logout'), span('Logout'), ), - props.permissions.val?.can_edit ? button( + props.support_email?.val ? a( { - class: 'tg-button', - onclick: () => emitEvent({ view_logs: true }), + href: `mailto:${props.support_email?.val} + ?subject=${props.version.val?.edition}: Contact Us + &body=%0D%0D%0DVersion: ${props.version.val?.edition} ${props.version.val?.current}`, + target: '_blank', }, - 'App Logs', + 'Contact Us', ) : null, ), - () => Version(props.menu?.val.version), ), ); }; @@ -181,36 +183,6 @@ const MenuItem = ( ); }; -const Version = (/** @type {Version} */ version) => { - const expanded = van.state(false); - - const icon = van.derive(() => expanded.val ? 'expand_less' : 'expand_more'); - const classes = van.derive(() => expanded.val ? ' version expanded' : 'version'); - - return div( - {class: classes, onclick: () => { expanded.val = !expanded.val; }}, - VersionRow( - 'Version', - version.current, - i({class: 'material-symbols-rounded version--dropdown-icon'}, icon), - ), - div( - {class: 'version--details'}, - VersionRow('latest version', version.latest), - VersionRow('schema revision', version.schema), - ), - ); -}; - -const VersionRow = (/** @type string */ label, /** @type string */ version, iconEl = undefined) => { - return div( - {class: 'version--row'}, - span({class: 'version--row--label'}, `${label}:`), - span({class: 'version--row--value'}, version), - iconEl, - ); -}; - function emitEvent(/** @type Object */ data) { if (Sidebar.StreamlitInstance) { Sidebar.StreamlitInstance.sendData({ ...data, _id: Math.random() }); // Identify the event so its handler is called once @@ -298,20 +270,22 @@ stylesheet.replace(` color: var(--primary-color); } -.menu .menu--username { - padding-left: 16px; - padding-bottom: 8px; +.menu .menu--user { + display: flex; + flex-direction: column; + padding: 
16px; +} - max-width: 35%; +.menu .menu--username { overflow-x: hidden; text-overflow: ellipsis; text-wrap: nowrap; - - color: var(--secondary-text-color); } -.menu .menu--username:before { - content: 'User: '; +.menu .menu--role { + text-transform: uppercase; + font-size: 12px; + color: var(--secondary-text-color); } .menu .content > .menu--section > .menu--section--label { @@ -354,64 +328,12 @@ stylesheet.replace(` .menu .menu--buttons { display: flex; justify-content: space-between; + margin-bottom: 16px; } -.menu .version { - color: var(--secondary-text-color); - display: flex; - flex-direction: column; +.menu--buttons a { padding: 8px 16px; - cursor: pointer; -} - -.menu .version .version--dropdown-icon { - font-size: 19px; -} - -.menu .version .version--row { - display: flex; - align-items: center; - justify-content: flex-end; -} - -.menu .version .version--row .version--row--label { - font-weight: 500; - margin-right: 4px; -} - -.menu .version .version--details { - display: none; - flex-direction: column; -} - -.menu .version .version--details { - display: none; - margin-top: 4px; -} - -.menu .version.expanded .version--details { - display: block; -} - -.version--row + .version--row { - margin-top: 4px; -} - -.menu > :nth-child(1 of button) { - margin-top: auto !important; -} - -.menu > button { - margin: 16px; - color: var(--secondary-text-color) !important; -} - -.menu > button.logout { - margin-top: 8px; -} - -.menu > button.users { - margin-bottom: 0px; + font-size: 14px; } /* Intentionally duplicate from button.js */ diff --git a/testgen/ui/components/frontend/js/components/summary_bar.js b/testgen/ui/components/frontend/js/components/summary_bar.js index 2c791913..c16dcc61 100644 --- a/testgen/ui/components/frontend/js/components/summary_bar.js +++ b/testgen/ui/components/frontend/js/components/summary_bar.js @@ -15,7 +15,7 @@ */ import van from '../van.min.js'; import { friendlyPercent, getValue, loadStylesheet } from '../utils.js'; -import { colorMap 
} from '../display_utils.js'; +import { colorMap, formatNumber } from '../display_utils.js'; const { div, span } = van.tags; const defaultHeight = 24; @@ -50,7 +50,7 @@ const SummaryBar = (/** @type Properties */ props) => { class: 'dot', style: `color: ${colorMap[item.color] || item.color};`, }), - `${item.label}: ${item.value || 0}` + (item.showPercent ? ` (${friendlyPercent(item.value * 100 / total.val)}%)` : '') + `${item.label}: ${formatNumber(item.value || 0)}` + (item.showPercent ? ` (${friendlyPercent(item.value * 100 / total.val)}%)` : '') ) : null, ), diff --git a/testgen/ui/components/frontend/js/components/table_group_form.js b/testgen/ui/components/frontend/js/components/table_group_form.js index 05e4c490..6d043319 100644 --- a/testgen/ui/components/frontend/js/components/table_group_form.js +++ b/testgen/ui/components/frontend/js/components/table_group_form.js @@ -1,7 +1,10 @@ /** + * @import { Connection } from './connection_form.js'; + * * @typedef TableGroup * @type {object} - * @property {string?} table_group_id + * @property {string?} id + * @property {string?} connection_id * @property {string?} table_groups_name * @property {string?} profiling_include_mask * @property {string?} profiling_exclude_mask @@ -33,6 +36,10 @@ * @typedef Properties * @type {object} * @property {TableGroup} tableGroup + * @property {Connection[]} connections + * @property {boolean?} showConnectionSelector + * @property {boolean?} enableConnectionSelector + * @property {boolean?} disableSchemaField * @property {(tg: TableGroup, state: FormState) => void} onChange */ import van from '../van.min.js'; @@ -41,6 +48,7 @@ import { Input } from './input.js'; import { Checkbox } from './checkbox.js'; import { ExpansionPanel } from './expansion_panel.js'; import { required } from '../form_validators.js'; +import { Select } from './select.js'; const { div, span } = van.tags; @@ -53,6 +61,7 @@ const TableGroupForm = (props) => { loadStylesheet('table-group-form', stylesheet); const 
tableGroup = getValue(props.tableGroup); + const tableGroupConnectionId = van.state(tableGroup.connection_id); const tableGroupsName = van.state(tableGroup.table_groups_name); const profilingIncludeMask = van.state(tableGroup.profiling_include_mask ?? '%'); const profilingExcludeMask = van.state(tableGroup.profiling_exclude_mask ?? 'tmp%'); @@ -76,9 +85,22 @@ const TableGroupForm = (props) => { const transformLevel = van.state(tableGroup.transform_level); const dataProduct = van.state(tableGroup.data_product); + const connectionOptions = van.derive(() => { + const connections = getValue(props.connections) ?? []; + return connections.map(c => ({ + label: c.connection_name, + value: c.connection_id, + icon: c.flavor.icon, + })); + }); + const showConnectionSelector = getValue(props.showConnectionSelector) ?? false; + const disableConnectionSelector = van.derive(() => !getValue(props.enableConnectionSelector) || (getValue(props.connections) ?? []).length <= 0); + const disableSchemaField = van.derive(() => getValue(props.disableSchemaField) ?? 
false) + const updatedTableGroup = van.derive(() => { return { - table_group_id: tableGroup.table_group_id, + id: tableGroup.id, + connection_id: tableGroupConnectionId.val, table_groups_name: tableGroupsName.val, profiling_include_mask: profilingIncludeMask.val, profiling_exclude_mask: profilingExcludeMask.val, @@ -105,6 +127,9 @@ const TableGroupForm = (props) => { }); const dirty = van.derive(() => !isEqual(updatedTableGroup.val, tableGroup)); const validityPerField = van.state({}); + if (showConnectionSelector) { + validityPerField.val.connection_id = !!tableGroupConnectionId.val; + } van.derive(() => { const fieldsValidity = validityPerField.val; @@ -114,13 +139,27 @@ const TableGroupForm = (props) => { }); const setFieldValidity = (field, validity) => { - validityPerField.val = {...validityPerField.val, [field]: validity}; + validityPerField.val = {...validityPerField.rawVal, [field]: validity}; } return div( { class: 'flex-column fx-gap-3' }, + showConnectionSelector + ? Select({ + name: 'connection_id', + label: 'Connection', + value: tableGroupConnectionId.rawVal, + options: connectionOptions, + height: 38, + disabled: disableConnectionSelector, + onChange: (value) => { + tableGroupConnectionId.val = value; + setFieldValidity('connection_id', !!value); + }, + }) + : undefined, MainForm( - { setValidity: setFieldValidity }, + { disableSchemaField, setValidity: setFieldValidity }, tableGroupsName, profilingIncludeMask, profilingExcludeMask, @@ -227,6 +266,7 @@ const MainForm = ( height: 38, help: 'Database schema containing the tables for the Table Group', helpPlacement: 'bottom-left', + disabled: options.disableSchemaField, onChange: (value, state) => { tableGroupSchema.val = value; options.setValidity?.('table_group_schema', state.valid); diff --git a/testgen/ui/components/frontend/js/components/tabs.js b/testgen/ui/components/frontend/js/components/tabs.js new file mode 100644 index 00000000..b23b9ca5 --- /dev/null +++ 
b/testgen/ui/components/frontend/js/components/tabs.js @@ -0,0 +1,128 @@ +/** + * @typedef {Object} TabProps + * @property {string} label + */ +import { getValue, loadStylesheet } from '../utils.js'; +import van from '../van.min.js'; + +const { div, button, span } = van.tags; + +/** + * @param {TabProps} props + * @param {...any} children + * @returns {{label: string, children: van.ChildDom[]}} + */ +const Tab = ({ label }, ...children) => ({ + label, + children, +}); + +/** + * @param {object} props + * @param {...Tab} tabs + */ +const Tabs = (props, ...tabs) => { + loadStylesheet('tabs', stylesheet); + + const activeTab = van.state(0); + + let labelsContainerEl; + const highlightEl = span({ class: "tg-tabs--highlight" }); + + const updateHighlight = () => { + if (!labelsContainerEl?.isConnected || !labelsContainerEl.children.length) return; + + const activeLabel = labelsContainerEl.children[activeTab.val]; + if (!activeLabel) return; + + highlightEl.style.width = `${activeLabel.offsetWidth}px`; + highlightEl.style.left = `${activeLabel.offsetLeft}px`; + highlightEl.style.opacity = '1'; + }; + + labelsContainerEl = div( + { class: "tg-tabs--labels" }, + ...tabs.map((tab, i) => + button({ + class: () => `tg-tabs--tab--label ${i === activeTab.val ? 'active' : ''}`, + onclick: () => (activeTab.val = i), + }, + tab.label + )), + highlightEl, + ); + + const tabsContainerEl = div({ ...props, class: () => `${getValue(props.class) ?? 
''} tg-tabs--container` }, + labelsContainerEl, + div({ class: "tg-tabs--content" }, () => div({class: "tg-tabs--content-inner"}, tabs[activeTab.val].children)), + ); + + van.derive(() => { + activeTab.val; + requestAnimationFrame(updateHighlight); + }); + + const resizeObserver = new ResizeObserver(() => { + requestAnimationFrame(updateHighlight); + }); + + tabsContainerEl.onadd = () => { + resizeObserver.observe(labelsContainerEl); + updateHighlight(); + }; + + tabsContainerEl.onremove = () => { + resizeObserver.disconnect(); + }; + + return tabsContainerEl; +}; + +const stylesheet = new CSSStyleSheet(); +stylesheet.replace(` +.tg-tabs--container { + width: 100%; +} + +.tg-tabs--labels { + position: relative; + display: flex; + border-bottom: 1px solid #dddfe2; +} + +.tg-tabs--tab--label { + padding: 12px 20px; + cursor: pointer; + background-color: transparent; + border: none; + font-size: 0.875rem; + color: var(--secondary-text-color); + font-weight: 500; + transition: color 0.2s ease-in-out; + white-space: nowrap; +} + +.tg-tabs--tab--label:hover { + color: var(--primary-color); + border-radius: 6px 6px 0 0; +} + +.tg-tabs--tab--label.active { + color: var(--primary-color); +} + +.tg-tabs--highlight { + position: absolute; + bottom: -1px; + height: 2px; + background-color: var(--primary-color); + transition: left 0.3s cubic-bezier(0.25, 0.8, 0.25, 1), width 0.3s cubic-bezier(0.25, 0.8, 0.25, 1); + opacity: 0; +} + +.tg-tabs--content { + padding-top: 20px; +} +`); + +export { Tabs, Tab }; \ No newline at end of file diff --git a/testgen/ui/components/frontend/js/data_profiling/column_distribution.js b/testgen/ui/components/frontend/js/data_profiling/column_distribution.js index a0810553..4d51f65f 100644 --- a/testgen/ui/components/frontend/js/data_profiling/column_distribution.js +++ b/testgen/ui/components/frontend/js/data_profiling/column_distribution.js @@ -16,7 +16,7 @@ import { PercentBar } from '../components/percent_bar.js'; import { FrequencyBars } from 
'../components/frequency_bars.js'; import { BoxPlot } from '../components/box_plot.js'; import { loadStylesheet, emitEvent, friendlyPercent, getValue } from '../utils.js'; -import { formatTimestamp, roundDigits } from '../display_utils.js'; +import { formatNumber, formatTimestamp } from '../display_utils.js'; const { div, span } = van.tags; const columnTypeFunctionMap = { @@ -106,7 +106,7 @@ function AlphaColumn(/** @type ColumnProfile */ item) { SummaryBar({ height: summaryHeight, width: summaryWidth, - label: `Missing Values: ${missing} (${friendlyPercent(missing * 100 / total)}%)`, + label: `Missing Values: ${formatNumber(missing)} (${friendlyPercent(missing * 100 / total)}%)`, items: [ { label: 'Actual Values', value: item.value_ct - item.zero_length_ct - item.filled_value_ct, color: 'green' }, { label: 'Null', value: item.null_value_ct, color: 'brownLight', showPercent: true }, @@ -117,7 +117,7 @@ function AlphaColumn(/** @type ColumnProfile */ item) { SummaryBar({ height: summaryHeight, width: summaryWidth, - label: `Duplicate Values: ${duplicates} (${friendlyPercent(duplicates * 100 / item.value_ct)}%)`, + label: `Duplicate Values: ${formatNumber(duplicates)} (${friendlyPercent(duplicates * 100 / item.value_ct)}%)`, items: [ { label: 'Distinct', value: item.distinct_value_ct, color: 'indigo' }, { label: 'Duplicates', value: duplicates, color: 'orange' }, @@ -128,7 +128,7 @@ function AlphaColumn(/** @type ColumnProfile */ item) { ? 
SummaryBar({ height: summaryHeight, width: summaryWidth, - label: `Duplicate Values, Standardized: ${duplicatesStandardized} (${friendlyPercent(duplicatesStandardized * 100 / item.value_ct)}%)`, + label: `Duplicate Values, Standardized: ${formatNumber(duplicatesStandardized)} (${friendlyPercent(duplicatesStandardized * 100 / item.value_ct)}%)`, items: [ { label: 'Distinct', value: item.distinct_std_value_ct, color: 'indigo' }, { label: 'Duplicates', value: duplicatesStandardized, color: 'orange' }, @@ -188,14 +188,14 @@ function AlphaColumn(/** @type ColumnProfile */ item) { PercentBar({ label: 'Quoted Values', value: item.quoted_value_ct, total, width: percentWidth }), PercentBar({ label: 'Leading Spaces', value: item.lead_space_ct, total, width: percentWidth }), PercentBar({ label: 'Embedded Spaces', value: item.embedded_space_ct ?? 0, total, width: percentWidth }), - Attribute({ label: 'Average Embedded Spaces', value: roundDigits(item.avg_embedded_spaces), width: attributeWidth }), + Attribute({ label: 'Average Embedded Spaces', value: formatNumber(item.avg_embedded_spaces), width: attributeWidth }), ), ), div( { class: 'flex-row fx-flex-wrap fx-align-flex-start fx-gap-4' }, - Attribute({ label: 'Minimum Length', value: item.min_length, width: attributeWidth }), - Attribute({ label: 'Maximum Length', value: item.max_length, width: attributeWidth }), - Attribute({ label: 'Average Length', value: roundDigits(item.avg_length), width: attributeWidth }), + Attribute({ label: 'Minimum Length', value: formatNumber(item.min_length), width: attributeWidth }), + Attribute({ label: 'Maximum Length', value: formatNumber(item.max_length), width: attributeWidth }), + Attribute({ label: 'Average Length', value: formatNumber(item.avg_length), width: attributeWidth }), ), div( { class: 'flex-row fx-flex-wrap fx-align-flex-start fx-gap-4' }, @@ -205,7 +205,7 @@ function AlphaColumn(/** @type ColumnProfile */ item) { div( { class: 'flex-row fx-flex-wrap fx-align-flex-start 
fx-gap-4' }, Attribute({ label: 'Standard Pattern Match', value: standardPattern, width: attributeWidth }), - Attribute({ label: 'Distinct Patterns', value: item.distinct_pattern_ct, width: attributeWidth }), + Attribute({ label: 'Distinct Patterns', value: formatNumber(item.distinct_pattern_ct), width: attributeWidth }), ), ); } @@ -260,7 +260,7 @@ function DatetimeColumn(/** @type ColumnProfile */ item) { { class: 'flex-row fx-flex-wrap fx-align-flex-start fx-gap-4' }, Attribute({ label: 'Minimum Date', value: formatTimestamp(item.min_date, true), width: attributeWidth }), Attribute({ label: 'Maximum Date', value: formatTimestamp(item.max_date, true), width: attributeWidth }), - Attribute({ label: 'Distinct Values', value: item.distinct_value_ct, width: attributeWidth }), + Attribute({ label: 'Distinct Values', value: formatNumber(item.distinct_value_ct), width: attributeWidth }), ), ); } @@ -283,15 +283,15 @@ function NumericColumn(/** @type ColumnProfile */ item) { ), div( { class: 'flex-row fx-flex-wrap fx-align-flex-start fx-gap-4 tg-profile--attribute-block' }, - Attribute({ label: 'Distinct Values', value: item.distinct_value_ct, width: attributeWidth }), - Attribute({ label: 'Average Value', value: roundDigits(item.avg_value), width: attributeWidth }), - Attribute({ label: 'Standard Deviation', value: roundDigits(item.stdev_value), width: attributeWidth }), - Attribute({ label: 'Minimum Value', value: item.min_value, width: attributeWidth }), - Attribute({ label: 'Minimum Value > 0', value: item.min_value_over_0, width: attributeWidth }), - Attribute({ label: 'Maximum Value', value: item.max_value, width: attributeWidth }), - Attribute({ label: '25th Percentile', value: roundDigits(item.percentile_25), width: attributeWidth }), - Attribute({ label: 'Median Value', value: roundDigits(item.percentile_50), width: attributeWidth }), - Attribute({ label: '75th Percentile', value: roundDigits(item.percentile_75), width: attributeWidth }), + Attribute({ label: 
'Distinct Values', value: formatNumber(item.distinct_value_ct), width: attributeWidth }), + Attribute({ label: 'Average Value', value: formatNumber(item.avg_value), width: attributeWidth }), + Attribute({ label: 'Standard Deviation', value: formatNumber(item.stdev_value), width: attributeWidth }), + Attribute({ label: 'Minimum Value', value: formatNumber(item.min_value), width: attributeWidth }), + Attribute({ label: 'Minimum Value > 0', value: formatNumber(item.min_value_over_0), width: attributeWidth }), + Attribute({ label: 'Maximum Value', value: formatNumber(item.max_value), width: attributeWidth }), + Attribute({ label: '25th Percentile', value: formatNumber(item.percentile_25), width: attributeWidth }), + Attribute({ label: 'Median Value', value: formatNumber(item.percentile_50), width: attributeWidth }), + Attribute({ label: '75th Percentile', value: formatNumber(item.percentile_75), width: attributeWidth }), ), div( { class: 'flex-row fx-justify-center tg-profile--plot-block' }, @@ -318,7 +318,7 @@ const BaseCounts = (/** @type ColumnProfile */ item) => { { class: 'flex-row fx-gap-4' }, attributes.map(({ key, label }) => Attribute({ label: item[key] === 0 ? 
span({ class: 'text-error' }, label) : label, - value: item[key], + value: formatNumber(item[key]), width: attributeWidth, })), ); diff --git a/testgen/ui/components/frontend/js/data_profiling/data_profiling_utils.js b/testgen/ui/components/frontend/js/data_profiling/data_profiling_utils.js index df3cbf17..0de721a0 100644 --- a/testgen/ui/components/frontend/js/data_profiling/data_profiling_utils.js +++ b/testgen/ui/components/frontend/js/data_profiling/data_profiling_utils.js @@ -236,8 +236,8 @@ const LatestProfilingTime = (/** @type Properties */ props, /** @type Table | Co } else { text = 'No profiling results yet for table group.'; link = Link({ - href: 'connections:table-groups', - params: { connection_id: item.connection_id }, + href: 'table-groups', + params: { project_code: item.project_code, connection_id: item.connection_id }, open_new: true, label: 'Go to Table Groups', right_icon: 'chevron_right', diff --git a/testgen/ui/components/frontend/js/data_profiling/table_size.js b/testgen/ui/components/frontend/js/data_profiling/table_size.js index 9c5055b1..2573d9c3 100644 --- a/testgen/ui/components/frontend/js/data_profiling/table_size.js +++ b/testgen/ui/components/frontend/js/data_profiling/table_size.js @@ -9,7 +9,7 @@ import { Card } from '../components/card.js'; import { Attribute } from '../components/attribute.js'; import { Button } from '../components/button.js'; import { emitEvent } from '../utils.js'; -import { formatTimestamp } from '../display_utils.js'; +import { formatNumber, formatTimestamp } from '../display_utils.js'; const { div, span } = van.tags; @@ -27,7 +27,7 @@ const TableSizeCard = (/** @type Properties */ _props, /** @type Table */ item) { class: 'flex-row fx-flex-wrap fx-gap-4' }, attributes.map(({ key, label }) => Attribute({ label: item[key] === 0 ? 
span({ class: 'text-error' }, label) : label, - value: item[key], + value: formatNumber(item[key]), width: 250, })), ), diff --git a/testgen/ui/components/frontend/js/display_utils.js b/testgen/ui/components/frontend/js/display_utils.js index bc7c1a9d..f0315368 100644 --- a/testgen/ui/components/frontend/js/display_utils.js +++ b/testgen/ui/components/frontend/js/display_utils.js @@ -30,11 +30,11 @@ function formatDuration(/** @type string */ duration) { return formatted.trim() || '< 1s'; } -function roundDigits(/** @type number | string */ number, /** @type number */ precision = 3) { +function formatNumber(/** @type number | string */ number, /** @type number */ precision = 3) { if (!['number', 'string'].includes(typeof number) || isNaN(number)) { return '--'; } - return parseFloat(Number(number).toPrecision(precision)); + return parseFloat(Number(number).toPrecision(precision)).toLocaleString(); } function capitalize(/** @type string */ text) { @@ -89,4 +89,4 @@ const colorMap = { const DISABLED_ACTION_TEXT = 'You do not have permissions to perform this action. 
Contact your administrator.'; -export { formatTimestamp, formatDuration, roundDigits, capitalize, humanReadableSize, colorMap, DISABLED_ACTION_TEXT }; +export { formatTimestamp, formatDuration, formatNumber, capitalize, humanReadableSize, colorMap, DISABLED_ACTION_TEXT }; diff --git a/testgen/ui/components/frontend/js/main.js b/testgen/ui/components/frontend/js/main.js index d4854cc5..2265c595 100644 --- a/testgen/ui/components/frontend/js/main.js +++ b/testgen/ui/components/frontend/js/main.js @@ -29,6 +29,13 @@ import { ColumnProfilingHistory } from './data_profiling/column_profiling_histor import { ScheduleList } from './pages/schedule_list.js'; import { Connections } from './pages/connections.js'; import { TableGroupWizard } from './pages/table_group_wizard.js'; +import { HelpMenu } from './components/help_menu.js' +import { TableGroup } from './pages/table_group.js'; +import { TableGroupList } from './pages/table_group_list.js'; +import { TableGroupDeleteConfirmation } from './pages/table_group_delete_confirmation.js'; +import { RunProfilingDialog } from './pages/run_profiling_dialog.js'; +import { ConfirmationDialog } from './pages/confirmation_dialog.js'; +import { TestDefinitionSummary } from './pages/test_definition_summary.js'; let currentWindowVan = van; let topWindowVan = window.top.van; @@ -56,6 +63,13 @@ const TestGenComponent = (/** @type {string} */ id, /** @type {object} */ props) column_selector: ColumnSelector, connections: Connections, table_group_wizard: TableGroupWizard, + help_menu: HelpMenu, + table_group: TableGroup, + table_group_list: TableGroupList, + table_group_delete: TableGroupDeleteConfirmation, + run_profiling_dialog: RunProfilingDialog, + confirm_dialog: ConfirmationDialog, + test_definition_summary: TestDefinitionSummary, }; if (Object.keys(window.testgen.plugins).includes(id)) { diff --git a/testgen/ui/components/frontend/js/pages/confirmation_dialog.js b/testgen/ui/components/frontend/js/pages/confirmation_dialog.js new file 
mode 100644 index 00000000..c1fa1aad --- /dev/null +++ b/testgen/ui/components/frontend/js/pages/confirmation_dialog.js @@ -0,0 +1,105 @@ +/** + * @typedef Result + * @type {object} + * @property {boolean} success + * @property {string} message + * + * @typedef Constraint + * @type {object} + * @property {string} warning + * @property {string} confirmation + * + * @typedef Properties + * @type {object} + * @property {string} project_code + * @property {string} message + * @property {Constraint?} constraint + * @property {Result?} result + * @property {string?} button_label + * @property {string?} button_type + * @property {string?} button_color + */ + +import van from '../van.min.js'; +import { Streamlit } from '../streamlit.js'; +import { emitEvent, getValue, loadStylesheet, resizeFrameHeightOnDOMChange, resizeFrameHeightToElement } from '../utils.js'; +import { Button } from '../components/button.js'; +import { Toggle } from '../components/toggle.js'; +import { Alert } from '../components/alert.js'; + +const { div, span } = van.tags; + +/** + * @param {Properties} props + * @returns + */ +const ConfirmationDialog = (props) => { + loadStylesheet('confirmation-dialog', stylesheet); + Streamlit.setFrameHeight(1); + window.testgen.isPage = true; + + const wrapperId = 'confirmation-dialog'; + const confirmed = van.state(false); + const actionDisabled = van.derive(() => !!getValue(props.constraint) && !confirmed.val); + const buttonLabel = van.derive(() => getValue(props.button_label) ?? 'Confirm'); + const buttonColor = van.derive(() => (actionDisabled.val ? 'basic' : getValue(props.button_color)) ?? 'basic'); + const buttonType = van.derive(() => (actionDisabled.val ? 'stroked' : getValue(props.button_type)) ?? 
'flat'); + + const message = getValue(props.message); + const constraint = getValue(props.constraint); + + resizeFrameHeightToElement(wrapperId); + resizeFrameHeightOnDOMChange(wrapperId); + + return div( + { id: wrapperId, class: 'flex-column' }, + div({ class: 'flex-column fx-gap-4' }, message), + constraint + ? div( + { class: 'flex-column fx-gap-4 mt-4' }, + Alert({ type: 'warn' }, span(constraint.warning)), + Toggle({ + name: 'confirm-action', + label: span(constraint.confirmation), + checked: confirmed, + onChange: (value) => confirmed.val = value, + }), + ) + : '', + div( + { class: 'flex-row fx-justify-content-flex-end' }, + Button({ + type: buttonType, + color: buttonColor, + label: buttonLabel, + style: 'width: auto;', + disabled: actionDisabled, + onclick: () => emitEvent('ActionConfirmed', {}), + }), + ), + () => { + const result = getValue(props.result); + + if (!result) { + return ''; + } + + return div( + {class: 'mt-4'}, + Alert( + { + type: result.success ? 'success' : 'error', + closeable: true, + }, + span(result.message), + ), + ); + }, + ); +}; + +const stylesheet = new CSSStyleSheet(); +stylesheet.replace(` +`); + +export { ConfirmationDialog }; diff --git a/testgen/ui/components/frontend/js/pages/connections.js b/testgen/ui/components/frontend/js/pages/connections.js index cd28e96b..1bfec4b8 100644 --- a/testgen/ui/components/frontend/js/pages/connections.js +++ b/testgen/ui/components/frontend/js/pages/connections.js @@ -12,6 +12,7 @@ * * @typedef Properties * @type {object} + * @property {string} project_code * @property {Connection} connection * @property {boolean} has_table_groups * @property {Array} flavors @@ -39,6 +40,7 @@ const Connections = (props) => { window.testgen.isPage = true; const wrapperId = 'connections-list-wrapper'; + const projectCode = getValue(props.project_code); const connection = getValue(props.connection); const connectionId = connection.connection_id; const updatedConnection = van.state(connection); @@ -53,8 +55,8 
@@ const Connections = (props) => { { class: 'flex-row fx-justify-content-flex-end' }, () => getValue(props.has_table_groups) ? Link({ - href: 'connections:table-groups', - params: {"connection_id": connectionId}, + href: 'table-groups', + params: {'project_code': projectCode, "connection_id": connectionId}, label: 'Manage Table Groups', right_icon: 'chevron_right', class: 'tg-connections--link', diff --git a/testgen/ui/components/frontend/js/pages/data_catalog.js b/testgen/ui/components/frontend/js/pages/data_catalog.js index 1641ca5e..0a9bedca 100644 --- a/testgen/ui/components/frontend/js/pages/data_catalog.js +++ b/testgen/ui/components/frontend/js/pages/data_catalog.js @@ -1,6 +1,6 @@ /** * @import { Column, Table } from '../data_profiling/data_profiling_utils.js'; - * @import { TreeNode } from '../components/tree.js'; + * @import { TreeNode, SelectedNode } from '../components/tree.js'; * * @typedef ProjectSummary * @type {object} @@ -46,7 +46,7 @@ import { Input } from '../components/input.js'; import { Icon } from '../components/icon.js'; import { withTooltip } from '../components/tooltip.js'; import { Streamlit } from '../streamlit.js'; -import { emitEvent, getValue, loadStylesheet } from '../utils.js'; +import { emitEvent, getRandomId, getValue, loadStylesheet } from '../utils.js'; import { ColumnDistributionCard } from '../data_profiling/column_distribution.js'; import { DataCharacteristicsCard } from '../data_profiling/data_characteristics.js'; import { PotentialPIICard, HygieneIssuesCard, TestIssuesCard } from '../data_profiling/data_issues.js'; @@ -60,6 +60,7 @@ import { Card } from '../components/card.js'; import { Button } from '../components/button.js'; import { Link } from '../components/link.js'; import { EMPTY_STATE_MESSAGE, EmptyState } from '../components/empty_state.js'; +import { Portal } from '../components/portal.js'; const { div, h2, span, i } = van.tags; @@ -183,9 +184,9 @@ const DataCatalog = (/** @type Properties */ props) => { return 
projectSummary.table_groups_ct > 0 ? div( { class: 'flex-column tg-dh' }, - () => div( + div( { class: 'flex-row fx-align-flex-end fx-justify-space-between mb-2' }, - Select({ + () => Select({ label: 'Table Group', value: getValue(props.table_group_filter_options)?.find((op) => op.selected)?.value ?? null, options: getValue(props.table_group_filter_options) ?? [], @@ -194,28 +195,7 @@ const DataCatalog = (/** @type Properties */ props) => { testId: 'table-group-filter', onChange: (value) => emitEvent('TableGroupSelected', {payload: value}), }), - Button({ - icon: 'download', - type: 'stroked', - label: 'Export', - tooltip: 'Download filtered columns to Excel', - tooltipPosition: 'left', - width: 'fit-content', - style: 'background: var(--dk-card-background);', - onclick: () => { - const columnIds = treeNodes.val.reduce((ids, table) => { - if (!table.hidden.val) { - table.children.forEach(column => { - if (!column.hidden.val) { - ids.push(column.id); - } - }); - } - return ids; - }, []); - emitEvent('ExportClicked', { payload: columnIds }); - }, - }), + ExportOptions(treeNodes, multiSelectedItems), ), () => treeNodes.val.length ? 
div( @@ -329,6 +309,88 @@ const DataCatalog = (/** @type Properties */ props) => { : ConditionalEmptyState(projectSummary, userCanEdit, userCanNavigate); }; +const ExportOptions = (/** @type TreeNode[] */ treeNodes, /** @type SelectedNode[] */ selectedNodes) => { + const exportOptionsDomId = `data-catalog-export-${getRandomId()}`; + const exportOptionsOpened = van.state(false); + + return [ + Button({ + id: exportOptionsDomId, + icon: 'download', + type: 'stroked', + label: 'Export', + tooltip: 'Download columns to Excel', + tooltipPosition: 'left', + width: 'fit-content', + style: 'background: var(--dk-card-background);', + onclick: () => exportOptionsOpened.val = !exportOptionsOpened.val, + }), + Portal( + { target: exportOptionsDomId, opened: exportOptionsOpened, align: 'right' }, + () => div( + { class: 'tg-dh--export-portal' }, + div( + { + class: 'tg-dh--export-option', + onclick: () => { + emitEvent('ExportClicked', { payload: null }); + exportOptionsOpened.val = false; + }, + }, + 'All columns', + ), + div( + { + class: 'tg-dh--export-option', + onclick: () => { + const payload = treeNodes.val.reduce((array, table) => { + if (!table.hidden.val) { + const [ type, id ] = table.id.split('_'); + array.push({ type, id, selected: table.selected.val }); + + table.children.forEach(column => { + if (!column.hidden.val) { + const [ type, id ] = column.id.split('_'); + array.push({ type, id, selected: column.selected.val }); + } + }); + } + return array; + }, []); + emitEvent('ExportClicked', { payload }); + exportOptionsOpened.val = false; + }, + }, + 'Filtered columns', + ), + selectedNodes.val?.length + ? 
div( + { + class: 'tg-dh--export-option', + onclick: () => { + const payload = selectedNodes.val.reduce((array, table) => { + const [ type, id ] = table.id.split('_'); + array.push({ type, id }); + + table.children.forEach(column => { + const [ type, id ] = column.id.split('_'); + array.push({ type, id }); + }); + + return array; + }, []); + emitEvent('ExportClicked', { payload }); + exportOptionsOpened.val = false; + }, + }, + 'Selected columns', + ) + : null, + ), + ), + ]; +}; + const SelectedDetails = (/** @type Properties */ props, /** @type Table | Column */ item) => { const userCanEdit = getValue(props.permissions)?.can_edit ?? false; const userCanNavigate = getValue(props.permissions)?.can_navigate ?? false; @@ -599,16 +661,14 @@ const MultiEdit = (/** @type Properties */ props, /** @type Object */ selectedIt disabled: () => attributes.every(({ checkedState }) => !checkedState.val), onclick: () => { const items = selectedItems.val.reduce((array, table) => { - if (table.all) { - const [ type, id ] = table.id.split('_'); + const [ type, id ] = table.id.split('_'); + array.push({ type, id }); + + table.children.forEach(column => { + const [ type, id ] = column.id.split('_'); array.push({ type, id }); - } else { - const columns = table.children.map(column => { - const [ type, id ] = column.id.split('_'); - return { type, id }; - }); - array.push(...columns); - } + }); + return array; }, []); @@ -681,8 +741,11 @@ const ConditionalEmptyState = ( message: EMPTY_STATE_MESSAGE.tableGroup, link: { label: 'Go to Table Groups', - href: 'connections:table-groups', - params: { connection_id: projectSummary.default_connection_id }, + href: 'table-groups', + params: { + project_code: projectSummary.project_code, + connection_id: projectSummary.default_connection_id, + }, disabled: !userCanNavigate, }, }; @@ -745,6 +808,34 @@ stylesheet.replace(` font-size: 18px; text-align: center; } + +.tg-dh--export-portal { + border-radius: 8px; + background: var(--dk-card-background); 
+ box-shadow: var(--portal-box-shadow); + overflow: visible; + z-index: 99; +} + +.tg-dh--export-option { + padding: 12px 16px; + cursor: pointer; + color: var(--primary-text-color); +} + +.tg-dh--export-option:first-child { + border-top-left-radius: 8px; + border-top-right-radius: 8px; +} + +.tg-dh--export-option:last-child { + border-bottom-left-radius: 8px; + border-bottom-right-radius: 8px; +} + +.tg-dh--export-option:hover { + background: var(--select-hover-background); +} `); export { DataCatalog }; diff --git a/testgen/ui/components/frontend/js/pages/profiling_runs.js b/testgen/ui/components/frontend/js/pages/profiling_runs.js index dc955cb9..1290e270 100644 --- a/testgen/ui/components/frontend/js/pages/profiling_runs.js +++ b/testgen/ui/components/frontend/js/pages/profiling_runs.js @@ -35,8 +35,9 @@ import { Button } from '../components/button.js'; import { Streamlit } from '../streamlit.js'; import { emitEvent, getValue, resizeFrameHeightToElement } from '../utils.js'; import { formatTimestamp, formatDuration } from '../display_utils.js'; +import { Checkbox } from '../components/checkbox.js'; -const { div, span, i } = van.tags; +const { div, i, span, strong } = van.tags; const ProfilingRuns = (/** @type Properties */ props) => { window.testgen.isPage = true; @@ -49,40 +50,104 @@ const ProfilingRuns = (/** @type Properties */ props) => { Streamlit.setFrameHeight(100 * items.length); return items; }); - const columns = ['20%', '20%', '20%', '30%', '10%']; + const columns = ['5%', '15%', '20%', '20%', '30%', '10%']; const userCanRun = getValue(props.permissions)?.can_run ?? false; + const userCanEdit = getValue(props.permissions)?.can_edit ?? 
false; + const selectedRuns = {}; const tableId = 'profiling-runs-table'; resizeFrameHeightToElement(tableId); + const initializeSelectedStates = (items) => { + for (const profilingRun of items) { + if (selectedRuns[profilingRun.profiling_run_id] == undefined) { + selectedRuns[profilingRun.profiling_run_id] = van.state(false); + } + } + }; + + initializeSelectedStates(profilingRunItems.val); + + van.derive(() => { + initializeSelectedStates(profilingRunItems.val); + }); + return div( { class: 'table', id: tableId }, + () => { + const items = profilingRunItems.val; + const selectedItems = items.filter(i => selectedRuns[i.profiling_run_id]?.val ?? false); + const someRunSelected = selectedItems.length > 0; + const tooltipText = !someRunSelected ? 'No runs selected' : undefined; + + if (!userCanEdit) { + return ''; + } + + return div( + { class: 'flex-row fx-justify-content-flex-end pb-2' }, + someRunSelected ? strong({class: 'mr-1'}, selectedItems.length) : '', + someRunSelected ? span({class: 'mr-4'}, 'runs selected') : '', + Button({ + type: 'stroked', + icon: 'delete', + label: 'Delete Runs', + tooltip: tooltipText, + tooltipPosition: 'bottom-left', + disabled: !someRunSelected, + width: 'auto', + onclick: () => emitEvent('RunsDeleted', { payload: selectedItems.map(i => i.profiling_run_id) }), + }), + ); + }, div( { class: 'table-header flex-row' }, + () => { + const items = profilingRunItems.val; + const selectedItems = items.filter(i => selectedRuns[i.profiling_run_id]?.val ?? false); + const allSelected = selectedItems.length === items.length; + const partiallySelected = selectedItems.length > 0 && selectedItems.length < items.length; + + if (!userCanEdit) { + return ''; + } + + return span( + { style: `flex: ${columns[0]}` }, + userCanEdit + ? 
Checkbox({ + checked: allSelected, + indeterminate: partiallySelected, + onChange: (checked) => items.forEach(item => selectedRuns[item.profiling_run_id].val = checked), + testId: 'select-all-profiling-run', + }) + : '', + ); + }, span( - { style: `flex: ${columns[0]}` }, + { style: `flex: ${columns[1]}` }, 'Start Time | Table Group', ), span( - { style: `flex: ${columns[1]}` }, + { style: `flex: ${columns[2]}` }, 'Status | Duration', ), span( - { style: `flex: ${columns[2]}` }, + { style: `flex: ${columns[3]}` }, 'Schema', ), span( - { style: `flex: ${columns[3]}` }, + { style: `flex: ${columns[4]}` }, 'Hygiene Issues', ), span( - { style: `flex: ${columns[4]}` }, + { style: `flex: ${columns[5]}` }, 'Profiling Score', ), ), () => div( - profilingRunItems.val.map(item => ProfilingRunItem(item, columns, userCanRun)), + profilingRunItems.val.map(item => ProfilingRunItem(item, columns, selectedRuns[item.profiling_run_id], userCanRun, userCanEdit)), ), ); } @@ -90,12 +155,24 @@ const ProfilingRuns = (/** @type Properties */ props) => { const ProfilingRunItem = ( /** @type ProfilingRun */ item, /** @type string[] */ columns, + /** @type boolean */ selected, /** @type boolean */ userCanRun, + /** @type boolean */ userCanEdit, ) => { return div( { class: 'table-row flex-row', 'data-testid': 'profiling-run-item' }, + userCanEdit + ? 
div( + { style: `flex: ${columns[0]}; font-size: 16px;` }, + Checkbox({ + checked: selected, + onChange: (checked) => selected.val = checked, + testId: 'select-profiling-run', + }), + ) + : '', div( - { style: `flex: ${columns[0]}` }, + { style: `flex: ${columns[1]}` }, div({'data-testid': 'profiling-run-item-starttime'}, formatTimestamp(item.start_time)), div( { class: 'text-caption mt-1', 'data-testid': 'profiling-run-item-tablegroup' }, @@ -103,7 +180,7 @@ const ProfilingRunItem = ( ), ), div( - { class: 'flex-row', style: `flex: ${columns[1]}` }, + { class: 'flex-row', style: `flex: ${columns[2]}` }, div( ProfilingRunStatus(item), div( @@ -119,7 +196,7 @@ const ProfilingRunItem = ( }) : null, ), div( - { style: `flex: ${columns[2]}` }, + { style: `flex: ${columns[3]}` }, div({'data-testid': 'profiling-run-item-schema'}, item.schema_name), div( { @@ -138,7 +215,7 @@ const ProfilingRunItem = ( }) : null, ), div( - { class: 'pr-3', style: `flex: ${columns[3]}` }, + { class: 'pr-3', style: `flex: ${columns[4]}` }, item.anomaly_ct ? SummaryBar({ items: [ { label: 'Definite', value: item.anomalies_definite_ct, color: 'red' }, @@ -146,7 +223,7 @@ const ProfilingRunItem = ( { label: 'Possible', value: item.anomalies_possible_ct, color: 'yellow' }, { label: 'Dismissed', value: item.anomalies_dismissed_ct, color: 'grey' }, ], - height: 10, + height: 3, width: 350, }) : '--', item.anomaly_ct ? Link({ @@ -160,7 +237,7 @@ const ProfilingRunItem = ( }) : null, ), div( - { style: `flex: ${columns[4]}; font-size: 16px;` }, + { style: `flex: ${columns[5]}; font-size: 16px;` }, item.dq_score_profiling ?? 
'--', ), ); diff --git a/testgen/ui/components/frontend/js/pages/project_dashboard.js b/testgen/ui/components/frontend/js/pages/project_dashboard.js index 940d7708..92c22c0f 100644 --- a/testgen/ui/components/frontend/js/pages/project_dashboard.js +++ b/testgen/ui/components/frontend/js/pages/project_dashboard.js @@ -2,9 +2,6 @@ * @typedef ProjectSummary * @type {object} * @property {string} project_code - * @property {number} table_groups_count - * @property {number} test_suites_count - * @property {number} test_definitions_count * @property {number} test_runs_count * @property {number} profiling_runs_count * @property {number} connections_count @@ -15,9 +12,8 @@ * @property {string} id * @property {string} test_suite * @property {number} test_ct - * @property {string} latest_auto_gen_date - * @property {string} latest_run_start - * @property {string} latest_run_id + * @property {number?} latest_run_start + * @property {string?} latest_run_id * @property {number} last_run_test_ct * @property {number} last_run_passed_ct * @property {number} last_run_warning_ct @@ -29,12 +25,11 @@ * @type {object} * @property {string} id * @property {string} table_groups_name - * @property {string} table_groups_name - * @property {number?} dq_score - * @property {number?} dq_score_profiling - * @property {number?} dq_score_testing - * @property {string} latest_profile_id - * @property {string} latest_profile_start + * @property {string?} dq_score + * @property {string?} dq_score_profiling + * @property {string?} dq_score_testing + * @property {string?} latest_profile_id + * @property {number?} latest_profile_start * @property {number} latest_profile_table_ct * @property {number} latest_profile_column_ct * @property {number} latest_anomalies_ct @@ -42,16 +37,8 @@ * @property {number} latest_anomalies_likely_ct * @property {number} latest_anomalies_possible_ct * @property {number} latest_anomalies_dismissed_ct - * @property {string} latest_tests_start - * @property {number} 
latest_tests_suite_ct - * @property {number} latest_tests_ct - * @property {number} latest_tests_passed_ct - * @property {number} latest_tests_warning_ct - * @property {number} latest_tests_failed_ct - * @property {number} latest_tests_error_ct - * @property {number} latest_tests_dismissed_ct + * @property {number?} latest_tests_start * @property {TestSuiteSummary[]} test_suites - * @property {boolean} expanded * * @typedef SortOption * @type {object} @@ -67,11 +54,9 @@ */ import van from '../van.min.js'; import { Streamlit } from '../streamlit.js'; -import { emitEvent, getValue, loadStylesheet, friendlyPercent, resizeFrameHeightOnDOMChange, resizeFrameHeightToElement } from '../utils.js'; +import { getValue, loadStylesheet, resizeFrameHeightOnDOMChange, resizeFrameHeightToElement } from '../utils.js'; import { formatTimestamp } from '../display_utils.js'; import { Card } from '../components/card.js'; -import { Caption } from '../components/caption.js'; -import { ExpanderToggle } from '../components/expander_toggle.js'; import { Select } from '../components/select.js'; import { Input } from '../components/input.js'; import { Link } from '../components/link.js'; @@ -79,7 +64,9 @@ import { SummaryBar } from '../components/summary_bar.js'; import { EmptyState, EMPTY_STATE_MESSAGE } from '../components/empty_state.js'; import { ScoreMetric } from '../components/score_metric.js'; -const { div, h3, hr, span, strong } = van.tags; +const { div, h3, hr, span } = van.tags; + +const staleProfileDays = 60; const ProjectDashboard = (/** @type Properties */ props) => { loadStylesheet('project-dashboard', stylesheet); @@ -124,43 +111,7 @@ const ProjectDashboard = (/** @type Properties */ props) => { { id: wrapperId, class: 'flex-column tg-overview' }, () => !getValue(isEmpty) ? 
div( - { class: 'flex-row fx-align-stretch fx-gap-4' }, - Card({ - id: 'overview-project-summary', - class: 'tg-overview--project', - testId: 'project-summary', - border: true, - content: [ - () => div( - { class: 'flex-row fx-align-flex-start' }, - () => { - return div( - { class: 'flex-column fx-gap-2 tg-overview--project--summary' }, - Caption({content: 'Project Summary', style: 'margin-bottom: 8px;' }), - div( - strong({ style: 'margin-right: 4px;' }, props.project.val.table_groups_count), - span('table groups'), - ), - div( - strong({ style: 'margin-right: 4px;' }, props.project.val.test_suites_count), - span('test suites'), - ), - div( - strong({ style: 'margin-right: 4px;' }, props.project.val.test_definitions_count), - span('test definitions'), - ), - ); - } - ), - ], - }), - ) - : ConditionalEmptyState(getValue(props.project)), - () => !getValue(isEmpty) - ? div( - { class: 'flex-row fx-align-flex-end' }, - h3(() => `Table Groups (${tableGroups?.val?.length ?? 0})`), - span({ style: 'margin-right: auto;' }), + { class: 'flex-row fx-align-flex-end fx-gap-4' }, Input({ width: 230, height: 38, @@ -171,7 +122,6 @@ const ProjectDashboard = (/** @type Properties */ props) => { testId: 'table-groups-filter', onChange: (value) => tableGroupsSearchTerm.val = value, }), - span({ style: 'margin-right: 1rem;' }), Select({ label: 'Sort by', value: tableGroupsSortOption, @@ -184,10 +134,10 @@ const ProjectDashboard = (/** @type Properties */ props) => { : '', () => !getValue(isEmpty) ? 
div( - { class: 'flex-column mt-2' }, + { class: 'flex-column mt-4' }, getValue(filteredTableGroups).map(tableGroup => TableGroupCard(tableGroup)), ) - : '', + : ConditionalEmptyState(getValue(props.project)), ); } @@ -195,153 +145,112 @@ const TableGroupCard = (/** @type TableGroupSummary */ tableGroup) => { return Card({ testId: 'table-group-summary-card', border: true, - title: tableGroup.table_groups_name, - actionContent: () => ExpanderToggle({ - default: tableGroup.expanded, - style: 'font-size: 14px !important; font-weight: 400;', - onExpand: () => { - emitEvent('TableGroupExpanded', {payload: tableGroup.id}); - }, - onCollapse: () => { - emitEvent('TableGroupCollapsed', {payload: tableGroup.id}); - }, - }), content: () => div( { class: 'flex-column' }, div( - { class: 'flex-row fx-align-flex-start' }, + { class: 'flex-row fx-align-flex-start fx-justify-space-between' }, div( - { class: 'flex-column fx-flex' }, - TableGroupLatestProfile(tableGroup), - ), - div( - { class: 'flex-column fx-flex' }, - TableGroupLatestTestResults(tableGroup), + { class: 'flex-column', style: 'flex: auto;' }, + h3( + { class: 'tg-overview--title' }, + tableGroup.table_groups_name, + ), + span( + { class: 'text-caption mt-1 mb-3 tg-overview--subtitle' }, + `${tableGroup.latest_profile_table_ct ?? 0} tables | ${tableGroup.latest_profile_column_ct ?? 0} columns`, + ), + TableGroupTestSuiteSummary(tableGroup.test_suites), ), ScoreMetric(tableGroup.dq_score, tableGroup.dq_score_profiling, tableGroup.dq_score_testing), ), - tableGroup.expanded - ? hr({ class: 'tg-overview--table-group-divider' }) - : undefined, - tableGroup.expanded - ? TableGroupTestSuiteSummary(tableGroup.test_suites) - : undefined, + hr({ class: 'tg-overview--table-group-divider' }), + TableGroupLatestProfile(tableGroup), ) }); }; const TableGroupLatestProfile = (/** @type TableGroupSummary */ tableGroup) => { - return [ - Caption({ content: 'Latest profile' }), - () => tableGroup.latest_profile_start ? 
div( - div( - { class: 'flex-row mb-3' }, - Link({ - label: formatTimestamp(tableGroup.latest_profile_start), - href: 'profiling-runs:results', - params: { run_id: tableGroup.latest_profile_id }, - }), - ), - div( - { class: 'flex-row mb-3' }, - strong({ class: 'mr-1' }, tableGroup.latest_profile_table_ct), - span('tables'), - span({ class: 'mr-1 ml-1' }, '|'), - strong({ class: 'mr-1' }, tableGroup.latest_profile_column_ct), - span('columns'), - span({ class: 'mr-1 ml-1' }, '|'), - Link({ - label: `${tableGroup.latest_anomalies_ct} hygiene issues`, - href: 'profiling-runs:hygiene', - params: { - run_id: tableGroup.latest_profile_id, - }, - width: 150, - }) - ), - () => tableGroup.latest_anomalies_ct - ? SummaryBar({ - items: [ - { label: 'Definite', value: parseInt(tableGroup.latest_anomalies_definite_ct), color: 'red' }, - { label: 'Likely', value: parseInt(tableGroup.latest_anomalies_likely_ct), color: 'orange' }, - { label: 'Possible', value: parseInt(tableGroup.latest_anomalies_possible_ct), color: 'yellow' }, - { label: 'Dismissed', value: parseInt(tableGroup.latest_anomalies_dismissed_ct), color: 'grey' }, - ], - height: 12, - width: 280, - }) - : '', - ) - : span('--'), - ]; -}; + if (!tableGroup.latest_profile_start) { + return div( + { class: 'mt-1 mb-1 text-secondary' }, + 'No profiling data yet', + ); + } -const TableGroupLatestTestResults = (/** @type TableGroupSummary */ tableGroup) => { - return [ - Caption({ content: 'Latest test results' }), - () => tableGroup.latest_tests_ct - ? 
div( - { class: 'flex-column' }, - span( - { class: 'mb-3' }, - `${friendlyPercent(tableGroup.latest_tests_passed_ct * 100 / tableGroup.latest_tests_ct)}% passed`, - ), - div( - { class: 'flex-row mb-3' }, - strong({ class: 'mr-1' }, tableGroup.latest_tests_ct), - span({ class: 'mr-1' }, 'tests in'), - strong({ class: 'mr-1' }, tableGroup.latest_tests_suite_ct), - span('test suites'), - ), - SummaryBar({ - items: [ - { label: 'Passed', value: parseInt(tableGroup.latest_tests_passed_ct), color: 'green' }, - { label: 'Warning', value: parseInt(tableGroup.latest_tests_warning_ct), color: 'yellow' }, - { label: 'Failed', value: parseInt(tableGroup.latest_tests_failed_ct), color: 'red' }, - { label: 'Error', value: parseInt(tableGroup.latest_tests_error_ct), color: 'brown' }, - { label: 'Dismissed', value: parseInt(tableGroup.latest_tests_dismissed_ct), color: 'grey' }, - ], - height: 12, - width: 350, - }) - ) - : span('--'), - ]; + const daysAgo = Math.round((new Date() - new Date(tableGroup.latest_profile_start)) / (1000 * 60 * 60 * 24)); + + return div( + div( + { class: 'flex-row fx-gap-1 mb-2' }, + span('Latest profile:'), + Link({ + label: formatTimestamp(tableGroup.latest_profile_start), + href: 'profiling-runs:results', + params: { run_id: tableGroup.latest_profile_id }, + }), + daysAgo > staleProfileDays + ? span({ class: 'text-error' }, `(${daysAgo} days ago)`) + : null, + span('|'), + Link({ + label: `${tableGroup.latest_anomalies_ct} hygiene issues`, + href: 'profiling-runs:hygiene', + params: { + run_id: tableGroup.latest_profile_id, + }, + width: 150, + }), + ), + tableGroup.latest_anomalies_ct + ? 
SummaryBar({ + items: [ + { label: 'Definite', value: parseInt(tableGroup.latest_anomalies_definite_ct), color: 'red' }, + { label: 'Likely', value: parseInt(tableGroup.latest_anomalies_likely_ct), color: 'orange' }, + { label: 'Possible', value: parseInt(tableGroup.latest_anomalies_possible_ct), color: 'yellow' }, + { label: 'Dismissed', value: parseInt(tableGroup.latest_anomalies_dismissed_ct), color: 'grey' }, + ], + height: 3, + width: 350, + }) + : '', + ); }; const TableGroupTestSuiteSummary = (/** @type TestSuiteSummary[] */testSuites) => { + if (!testSuites?.length) { + return div( + { class: 'mt-1 mb-1 text-secondary' }, + 'No test suites yet', + ); + } + return div( { class: 'flex-column' }, div( - { class: 'flex-row mb-4' }, - Caption({ content: 'Test Suite', style: 'flex: 1 1 20%;' }), - Caption({ content: 'Latest Generation', style: 'flex: 1 1 15%;' }), - Caption({ content: 'Latest Run', style: 'flex: 1 1 15%;' }), - Caption({ content: 'Latest Results', style: 'flex: 1 1 50%;' }), + { class: 'flex-row mb-1 tg-overview--row' }, + span({ style: 'flex: 1 1 25%;' }, 'Test Suite'), + span({ style: 'flex: 1 1 25%;' }, 'Latest Run'), + span({ style: 'flex: 1 1 50%;' }, 'Latest Results'), ), testSuites.map(suite => div( - { class: 'flex-row mb-2' }, + { class: 'flex-row fx-align-flex-start mt-2 tg-overview--row' }, div( - { class: 'flex-column', style: 'flex: 1 1 20%;' }, + { class: 'flex-column', style: 'flex: 1 1 25%; word-break: break-word;' }, Link({ label: suite.test_suite, href: 'test-suites:definitions', params: { test_suite_id: suite.id }, }), - Caption({ content: `${suite.test_ct ?? 0} tests`}), - ), - span( - { style: 'flex: 1 1 15%;' }, - suite.latest_auto_gen_date ? formatTimestamp(suite.latest_auto_gen_date) : '--', + span({ class: 'text-caption' }, `${suite.test_ct ?? 0} tests`), ), suite.latest_run_id ? 
Link({ label: formatTimestamp(suite.latest_run_start), href: 'test-runs:results', params: { run_id: suite.latest_run_id }, - style: 'flex: 1 1 15%;', + style: 'flex: 1 1 25%;', }) - : span({ style: 'flex: 1 1 15%;' }, '--'), + : span({ style: 'flex: 1 1 25%;' }, '--'), div( { style: 'flex: 1 1 50%;' }, suite.last_run_test_ct ? SummaryBar({ @@ -352,7 +261,7 @@ const TableGroupTestSuiteSummary = (/** @type TestSuiteSummary[] */testSuites) = { label: 'Error', 'value': parseInt(suite.last_run_error_ct), color: 'brown' }, { label: 'Dismissed', 'value': parseInt(suite.last_run_dismissed_ct), color: 'grey' }, ], - width: 200, + width: 350, height: 8, }) : '--', ), @@ -373,8 +282,8 @@ const ConditionalEmptyState = (/** @type ProjectSummary */ project) => { message: EMPTY_STATE_MESSAGE.tableGroup, link: { label: 'Go to Table Groups', - href: 'connections:table-groups', - params: { connection_id: project.default_connection_id }, + href: 'table-groups', + params: { project_code: project.project_code, connection_id: project.default_connection_id }, }, }; @@ -393,22 +302,20 @@ stylesheet.replace(` width: 100%; } -.tg-overview--project { - margin: 8px 0; - width: 50%; -} - -.tg-overview--project--score { - margin-right: auto; +.tg-overview--title { + margin: 0; + font-size: 18px; + font-weight: 500; } -.tg-overview--project--summary { - margin-right: auto; +.tg-overview--subtitle { + text-transform: none; + font-weight: 400; } hr.tg-overview--table-group-divider { height: 1px; - margin: 8px 0 12px; + margin: 12px 0; padding: 0px; color: inherit; background-color: transparent; @@ -416,7 +323,11 @@ hr.tg-overview--table-group-divider { border-right: none; border-left: none; border-image: initial; - border-bottom: 1px solid rgba(49, 51, 63, 0.2); + border-bottom: 1px solid var(--border-color); +} + +.tg-overview--row > * { + padding: 0 4px; } `); diff --git a/testgen/ui/components/frontend/js/pages/quality_dashboard.js b/testgen/ui/components/frontend/js/pages/quality_dashboard.js 
index 093fe249..e502b011 100644 --- a/testgen/ui/components/frontend/js/pages/quality_dashboard.js +++ b/testgen/ui/components/frontend/js/pages/quality_dashboard.js @@ -167,8 +167,8 @@ const ConditionalEmptyState = (/** @type ProjectSummary */ projectSummary) => { message: projectSummary.table_groups_count ? EMPTY_STATE_MESSAGE.profiling : EMPTY_STATE_MESSAGE.tableGroup, link: { label: 'Go to Table Groups', - href: 'connections:table-groups', - params: { connection_id: projectSummary.default_connection_id }, + href: 'table-groups', + params: { project_code: projectSummary.project_code, connection_id: projectSummary.default_connection_id }, }, }; } diff --git a/testgen/ui/components/frontend/js/pages/run_profiling_dialog.js b/testgen/ui/components/frontend/js/pages/run_profiling_dialog.js new file mode 100644 index 00000000..f5fd0f1e --- /dev/null +++ b/testgen/ui/components/frontend/js/pages/run_profiling_dialog.js @@ -0,0 +1,95 @@ +/** + * @import { TableGroup } from '../components/table_group_form.js'; + * + * @typedef Result + * @type {object} + * @property {boolean} success + * @property {string?} message + * + * @typedef Properties + * @type {object} + * @property {TableGroup} table_group + * @property {Result?} result + */ +import van from '../van.min.js'; +import { Streamlit } from '../streamlit.js'; +import { Alert } from '../components/alert.js'; +import { ExpanderToggle } from '../components/expander_toggle.js'; +import { Icon } from '../components/icon.js'; +import { emitEvent, getValue, resizeFrameHeightOnDOMChange, resizeFrameHeightToElement } from '../utils.js'; +import { Code } from '../components/code.js'; +import { Button } from '../components/button.js'; + +const { div, em, span, strong } = van.tags; + +/** + * @param {Properties} props + */ +const RunProfilingDialog = (props) => { + Streamlit.setFrameHeight(1); + window.testgen.isPage = true; + + const wrapperId = 'runprogiling-wrapper'; + + resizeFrameHeightToElement(wrapperId); + 
resizeFrameHeightOnDOMChange(wrapperId); + + const tableGroup = getValue(props.table_group); + const showCLICommand = van.state(false); + + return div( + { id: wrapperId, class: 'flex-column fx-gap-3' }, + div( + { class: 'flex-row fx-gap-1' }, + span({}, 'Execute profiling for the table group'), + strong({}, tableGroup.table_groups_name), + span({}, '?'), + ), + div( + { class: 'flex-row fx-gap-1' }, + Icon({}, 'info'), + em({}, ' Profiling will be performed in a background process.'), + ), + ExpanderToggle({ + collapseLabel: 'Collapse', + expandLabel: 'Show CLI command', + onCollapse: () => showCLICommand.val = false, + onExpand: () => showCLICommand.val = true, + }), + Code({ class: () => showCLICommand.val ? '' : 'hidden' }, `testgen run-profile --table-group-id ${tableGroup.id}`), + () => { + const result = getValue(props.result) ?? {}; + return result.message + ? Alert({ type: result.success ? 'success' : 'error' }, span(result.message)) + : ''; + }, + div( + { class: 'flex-row fx-justify-content-flex-end' }, + () => { + const result = getValue(props.result); + + if (result && result.success) { + return Button({ + type: 'stroked', + color: 'primary', + label: 'Go to Profiling Runs', + width: 'auto', + icon: 'chevron_right', + onclick: () => emitEvent('GoToProfilingRunsClicked', { payload: tableGroup.id }), + }); + } + + return Button({ + label: 'Run Profiling', + type: 'stroked', + color: 'primary', + width: 'auto', + style: 'width: auto;', + onclick: () => emitEvent('RunProfilingConfirmed', { payload: tableGroup.id }), + }); + } + ) + ); +}; + +export { RunProfilingDialog }; \ No newline at end of file diff --git a/testgen/ui/components/frontend/js/pages/schedule_list.js b/testgen/ui/components/frontend/js/pages/schedule_list.js index a4621c5d..f8c54f96 100644 --- a/testgen/ui/components/frontend/js/pages/schedule_list.js +++ b/testgen/ui/components/frontend/js/pages/schedule_list.js @@ -19,7 +19,7 @@ import van from '../van.min.js'; import { Button } from 
'../components/button.js'; import { Streamlit } from '../streamlit.js'; -import { emitEvent, getValue, resizeFrameHeightToElement } from '../utils.js'; +import { emitEvent, getValue, resizeFrameHeightToElement, resizeFrameHeightOnDOMChange } from '../utils.js'; import { withTooltip } from '../components/tooltip.js'; @@ -35,13 +35,14 @@ const ScheduleList = (/** @type Properties */ props) => { } catch (e) { console.log(e) } - Streamlit.setFrameHeight(100 * items.length); + Streamlit.setFrameHeight(100 * items.length || 150); return items; }); const columns = ['40%', '50%', '10%']; const tableId = 'profiling-schedules-table'; resizeFrameHeightToElement(tableId); + resizeFrameHeightOnDOMChange(tableId); return div( { class: 'table', id: tableId }, @@ -60,9 +61,11 @@ const ScheduleList = (/** @type Properties */ props) => { 'Actions', ), ), - () => div( - scheduleItems.val.map(item => ScheduleListItem(item, columns, getValue(props.permissions))), - ), + () => scheduleItems.val?.length + ? div( + scheduleItems.val.map(item => ScheduleListItem(item, columns, getValue(props.permissions))), + ) + : div({ class: 'mt-5 mb-3 ml-3 text-secondary' }, 'No schedules defined yet.'), ); } diff --git a/testgen/ui/components/frontend/js/pages/table_group.js b/testgen/ui/components/frontend/js/pages/table_group.js new file mode 100644 index 00000000..ce2cba33 --- /dev/null +++ b/testgen/ui/components/frontend/js/pages/table_group.js @@ -0,0 +1,174 @@ +/** + * @import { TableGroup } from '../components/table_group_form.js'; + * @import { Connection } from '../components/connection_form.js'; + * + * @typedef TableGroupPreview + * @type {object} + * @property {string} schema + * @property {string[]?} tables + * @property {number?} column_count + * @property {boolean?} success + * @property {string?} message + * + * @typedef Result + * @type {object} + * @property {boolean} success + * @property {string} message + * + * @typedef Properties + * @type {object} + * @property {string} 
project_code + * @property {TableGroup} table_group + * @property {Connection[]} connections + * @property {boolean?} in_used + * @property {TableGroupPreview?} table_group_preview + * @property {Result?} result + */ +import van from '../van.min.js'; +import { Streamlit } from '../streamlit.js'; +import { Button } from '../components/button.js'; +import { getValue, emitEvent, loadStylesheet, resizeFrameHeightToElement, resizeFrameHeightOnDOMChange } from '../utils.js'; +import { TableGroupForm } from '../components/table_group_form.js'; +import { Tab, Tabs } from '../components/tabs.js'; +import { Alert } from '../components/alert.js'; + +const { div, span, strong } = van.tags; + +/** + * @param {Properties} props + * @returns {HTMLElement} + */ +const TableGroup = (props) => { + loadStylesheet('tablegroupchange', stylesheet); + Streamlit.setFrameHeight(1); + window.testgen.isPage = true; + + const connections = getValue(props.connections) ?? []; + const enableConnectionSelector = getValue(props.table_group)?.connection_id === undefined; + const updatedTableGroup = van.state(getValue(props.table_group) ?? {}); + const disableSchemaField = getValue(props.in_used ?? false); + const disableSave = van.state(true); + const wrapperId = 'tablegroup-change-wrapper'; + + resizeFrameHeightToElement(wrapperId); + resizeFrameHeightOnDOMChange(wrapperId); + + return Tabs( + { id: wrapperId }, + Tab( + { label: 'Table Group Settings'}, + () => { + const tableGroup = updatedTableGroup.rawVal; + const result = getValue(props.result); + + return div( + { class: 'flex-column fx-gap-3' }, + TableGroupForm({ + tableGroup, + connections, + enableConnectionSelector, + disableSchemaField, + showConnectionSelector: connections.length > 1, + onChange: (newTableGroup, state) => { + updatedTableGroup.val = newTableGroup; + disableSave.val = !state.valid; + }, + }), + result + ? Alert( + { type: result.success ? 
'success' : 'error', closeable: true }, + span({}, result.message), + ) + : undefined, + ); + }, + div( + { class: 'flex-row fx-gap-2 fx-justify-content-flex-end mt-3' }, + Button({ + label: 'Save', + type: 'stroked', + color: 'primary', + style: 'width: auto;', + disabled: disableSave, + onclick: () => emitEvent('TableGroupSaveClicked', { payload: updatedTableGroup.val }), + }), + ), + ), + Tab( + { label: 'Test' }, + () => { + const currentSchema = updatedTableGroup.val.table_group_schema ?? tableGroupPreview?.schema ?? '--'; + const tableGroupPreview = getValue(props.table_group_preview); + const wasPreviewExecuted = tableGroupPreview && typeof tableGroupPreview.success === 'boolean'; + const alertMessage = tableGroupPreview.success ? 'Operation has finished successfully.' : 'Operation was unsuccessful.'; + + return div( + { class: 'flex-column fx-gap-2' }, + div( + { class: 'flex-row fx-justify-space-between' }, + div( + { class: 'flex-column fx-gap-2' }, + div( + { class: 'flex-row fx-gap-1' }, + strong({}, 'Schema:'), + span({}, currentSchema), + ), + div( + { class: 'flex-row fx-gap-1' }, + strong({}, 'Table Count:'), + span({}, tableGroupPreview?.tables?.length ?? '--'), + ), + div( + { class: 'flex-row fx-gap-1' }, + strong({}, 'Column Count:'), + span({}, tableGroupPreview?.column_count ?? '--'), + ), + ), + wasPreviewExecuted + ? Alert( + { type: tableGroupPreview.success ? 'success' : 'error' }, + span({}, alertMessage), + ) + : undefined, + ), + wasPreviewExecuted ? + div( + { class: 'table hoverable p-3' }, + div( + { class: 'table-header' }, + span('Tables'), + ), + div( + { class: 'flex-column', style: 'max-height: 200px; overflow-y: auto;' }, + tableGroupPreview?.tables?.length + ? tableGroupPreview.tables.map((table) => + div({ class: 'table-row' }, table), + ) + : div( + { class: 'flex-row fx-justify-center', style: 'height: 50px; font-size: 16px;'}, + tableGroupPreview.message ?? 'No tables found.' 
+ ), + ), + ) + : undefined, + ); + }, + div( + {class: 'flex-row fx-gap-2 fx-justify-content-flex-end mt-3'}, + Button({ + label: 'Test Table Group', + type: 'stroked', + color: 'primary', + style: 'width: auto;', + onclick: () => emitEvent('PreviewTableGroupClicked', { payload: updatedTableGroup.val }), + }), + ), + ), + ); +} + +const stylesheet = new CSSStyleSheet(); +stylesheet.replace(` +`); + +export { TableGroup }; diff --git a/testgen/ui/components/frontend/js/pages/table_group_delete_confirmation.js b/testgen/ui/components/frontend/js/pages/table_group_delete_confirmation.js new file mode 100644 index 00000000..2037abd0 --- /dev/null +++ b/testgen/ui/components/frontend/js/pages/table_group_delete_confirmation.js @@ -0,0 +1,105 @@ +/** + * @import { TableGroup } from '../components/table_group_form.js'; + * + * @typedef Result + * @type {object} + * @property {boolean} success + * @property {string} message + * + * @typedef Properties + * @type {object} + * @property {string} project_code + * @property {TableGroup} table_group + * @property {boolean} can_be_deleted + * @property {Result?} result + */ + +import van from '../van.min.js'; +import { Streamlit } from '../streamlit.js'; +import { emitEvent, getValue, loadStylesheet, resizeFrameHeightOnDOMChange, resizeFrameHeightToElement } from '../utils.js'; +import { Button } from '../components/button.js'; +import { Toggle } from '../components/toggle.js'; +import { Attribute } from '../components/attribute.js'; +import { Alert } from '../components/alert.js'; + +const { div, h3, hr, span, b } = van.tags; + +/** + * @param {Properties} props + * @returns + */ +const TableGroupDeleteConfirmation = (props) => { + loadStylesheet('tablegroup-delete-confirmation', stylesheet); + Streamlit.setFrameHeight(1); + window.testgen.isPage = true; + + const wrapperId = 'tablegroup-delete-wrapper'; + const tableGroup = getValue(props.table_group); + const confirmDeleteRelated = van.state(false); + const deleteDisabled = 
van.derive(() => !getValue(props.can_be_deleted) && !confirmDeleteRelated.val); + + resizeFrameHeightToElement(wrapperId); + resizeFrameHeightOnDOMChange(wrapperId); + + return div( + { id: wrapperId, class: 'flex-column' }, + div( + { class: 'flex-column fx-gap-4' }, + span( + 'Are you sure you want to delete the table group ', + b(tableGroup.table_groups_name), + '?', + ), + Attribute({ + label: 'ID', + value: tableGroup.id, + }), + Attribute({ + label: 'Name', + value: tableGroup.table_groups_name, + }), + Attribute({ + label: 'Schema', + value: tableGroup.table_group_schema, + }), + ), + () => !getValue(props.can_be_deleted) + ? div( + { class: 'flex-column fx-gap-4 mt-4' }, + Alert( + { type: 'warn' }, + div('This Table Group has related data, which may include profiling, test definitions and test results.'), + div({ class: 'mt-2' }, 'If you proceed, all related data will be permanently deleted.'), + ), + Toggle({ + name: 'confirm-delete-tablegroup', + label: span( + 'Yes, delete the table group ', + b(tableGroup.table_groups_name), + ' and related TestGen data.', + ), + checked: confirmDeleteRelated, + onChange: (value) => confirmDeleteRelated.val = value, + }), + ) + : '', + + div( + { class: 'flex-row fx-justify-content-flex-end' }, + Button({ + type: () => deleteDisabled.val ? 'stroked' : 'flat', + color: () => deleteDisabled.val ? 
'basic' : 'warn', + label: 'Delete', + style: 'width: auto;', + disabled: deleteDisabled, + onclick: () => emitEvent('DeleteTableGroupConfirmed'), + }), + ), + ); +}; + +const stylesheet = new CSSStyleSheet(); +stylesheet.replace(` +`); + +export { TableGroupDeleteConfirmation }; diff --git a/testgen/ui/components/frontend/js/pages/table_group_list.js b/testgen/ui/components/frontend/js/pages/table_group_list.js new file mode 100644 index 00000000..333b133a --- /dev/null +++ b/testgen/ui/components/frontend/js/pages/table_group_list.js @@ -0,0 +1,294 @@ +/** + * @import { TableGroup } from '../components/table_group_form.js'; + * @import { Connection } from '../components/connection_form.js'; + * + * @typedef Permissions + * @type {object} + * @property {boolean} can_edit + * + * @typedef Properties + * @type {object} + * @property {string} project_code + * @property {string?} connection_id + * @property {Connection[]} connections + * @property {TableGroup[]} table_groups + * @property {Permissions} permissions + */ +import van from '../van.min.js'; +import { Streamlit } from '../streamlit.js'; +import { Button } from '../components/button.js'; +import { Card } from '../components/card.js'; +import { Caption } from '../components/caption.js'; +import { Link } from '../components/link.js'; +import { getValue, emitEvent, loadStylesheet, resizeFrameHeightToElement, resizeFrameHeightOnDOMChange } from '../utils.js'; +import { EMPTY_STATE_MESSAGE, EmptyState } from '../components/empty_state.js'; +import { Select } from '../components/select.js'; +import { Icon } from '../components/icon.js'; +import { withTooltip } from '../components/tooltip.js'; + +const { div, h4, i, span } = van.tags; + +/** + * @param {Properties} props + * @returns {HTMLElement} + */ +const TableGroupList = (props) => { + loadStylesheet('tablegrouplist', stylesheet); + Streamlit.setFrameHeight(1); + window.testgen.isPage = true; + + const wrapperId = 'tablegroup-list-wrapper'; + + 
resizeFrameHeightToElement(wrapperId); + resizeFrameHeightOnDOMChange(wrapperId); + + return div( + { id: wrapperId, style: 'overflow-y: auto;' }, + () => { + const permissions = getValue(props.permissions) ?? {can_edit: false}; + const connections = getValue(props.connections) ?? []; + const connectionId = getValue(props.connection_id); + const tableGroups = getValue(props.table_groups) ?? []; + + if (connections.length <= 0) { + return EmptyState({ + icon: 'table_view', + label: 'Your project is empty', + message: EMPTY_STATE_MESSAGE.connection, + link: { + label: 'Go to Connections', + href: 'connections', + params: { project_code: getValue(props.project_code) }, + disabled: !permissions.can_edit, + }, + }); + } + + return div( + Toolbar(permissions, connections, connectionId), + tableGroups.length > 0 + ? tableGroups.map((tableGroup) => Card({ + testId: 'table-group-card', + class: '', + title: div( + { class: 'flex-column fx-gap-2 tg-tablegroup--card-title', 'data-testid': 'tablegroup-card-title' }, + h4({'data-testid': 'tablegroup-card-title-name'}, tableGroup.table_groups_name), + div( + {class: 'flex-row fx-gap-1 fx-align-center'}, + Icon({ size: 14 }, tableGroup.connection.flavor.icon), + Caption({ content: tableGroup.connection.name }), + ), + ), + border: true, + content: div( + { class: 'flex-column fx-gap-3' }, + div( + { class: 'flex-row fx-gap-3' }, + div( + { class: 'flex-column fx-flex fx-gap-3' }, + Link({ + label: 'View test suites', + href: 'test-suites', + params: { 'project_code': getValue(props.project_code), 'table_group_id': tableGroup.id }, + right_icon: 'chevron_right', + right_icon_size: 20, + }), + div( + { class: 'flex-row fx-flex fx-gap-3' }, + div( + { class: 'flex-column fx-flex fx-gap-4' }, + div( + { class: 'flex-column fx-flex' }, + Caption({content: 'DB Schema', style: 'margin-bottom: 4px;'}), + span(tableGroup.table_group_schema || '--'), + ), + div( + { class: 'flex-column fx-flex' }, + Caption({content: 'Explicit Table List', 
style: 'margin-bottom: 4px;'}), + tableGroup.profiling_table_set + ? TruncatedText( + {max: 3}, + ...tableGroup.profiling_table_set.split(',').map(t => t.trim()) + ) + : '--', + ), + ), + div( + { class: 'flex-column fx-flex fx-gap-4' }, + div( + { class: 'flex-column fx-flex' }, + Caption({content: 'Tables to Include Mask', style: 'margin-bottom: 4px;'}), + span(tableGroup.profiling_include_mask || '--'), + ), + div( + { class: 'flex-column fx-flex' }, + Caption({content: 'Uses Record Sampling', style: 'margin-bottom: 4px;'}), + span(tableGroup.profile_use_sampling ? 'Yes' : 'No'), + ), + ), + div( + { class: 'flex-column fx-flex fx-gap-4' }, + div( + { class: 'flex-column fx-flex' }, + Caption({content: 'Tables to Exclude Mask', style: 'margin-bottom: 4px;'}), + span(tableGroup.profiling_exclude_mask || '--'), + ), + div( + { class: 'flex-column fx-flex' }, + Caption({content: 'Min Profiling Age (Days)', style: 'margin-bottom: 4px;'}), + span(tableGroup.profiling_delay_days || '--'), + ), + ), + span({ class: 'fx-flex' }), + ), + ), + permissions.can_edit + ? div( + { class: 'flex-column' }, + Button({ + type: 'stroked', + color: 'primary', + label: 'Run Profiling', + onclick: () => emitEvent('RunProfilingClicked', { payload: tableGroup.id }), + }), + ) + : '', + ) + ), + actionContent: permissions.can_edit + ? 
div( + { class: 'flex-row fx-align-center' }, + Button({ + type: 'icon', + icon: 'edit', + iconSize: 18, + tooltip: 'Edit table group', + tooltipPosition: 'left', + color: 'basic', + onclick: () => emitEvent('EditTableGroupClicked', { payload: tableGroup.id }), + }), + Button({ + type: 'icon', + icon: 'delete', + iconSize: 18, + tooltip: 'Delete table group', + tooltipPosition: 'left', + color: 'basic', + onclick: () => emitEvent('DeleteTableGroupClicked', { payload: tableGroup.id }), + }), + ) + : undefined, + })) + : EmptyState({ + icon: 'table_view', + label: 'No table groups yet', + class: 'mt-4', + message: EMPTY_STATE_MESSAGE.tableGroup, + button: Button({ + type: 'stroked', + icon: 'add', + label: 'Add Table Group', + color: 'primary', + style: 'width: unset;', + disabled: !permissions.can_edit, + onclick: () => emitEvent('AddTableGroupClicked', {}), + }), + }), + ); + }, + ); +} + +/** + * + * @param {Permissions} permissions + * @param {Connection[]} connections + * @param {string?} selectedConnection + * @returns + */ +const Toolbar = (permissions, connections, selectedConnection) => { + return div( + { class: 'flex-row fx-align-flex-end fx-justify-space-between mb-4' }, + (getValue(connections) ?? [])?.length > 1 + ? Select({ + testId: 'connection-select', + label: 'Connection', + allowNull: true, + height: 38, + value: selectedConnection, + options: getValue(connections)?.map((connection) => ({ + label: connection.connection_name, + value: String(connection.connection_id), + })) ?? [], + onChange: (value) => emitEvent('ConnectionSelected', { payload: value }), + }) + : span(''), + div( + { class: 'flex-row fx-gap-4' }, + Button({ + icon: 'today', + type: 'stroked', + label: 'Profiling Schedules', + tooltip: 'Manage when profiling should run for table groups', + tooltipPosition: 'bottom', + width: 'fit-content', + style: 'background: var(--dk-card-background);', + onclick: () => emitEvent('RunSchedulesClicked', {}), + }), + permissions.can_edit + ? 
Button({ + type: 'stroked', + icon: 'add', + label: 'Add Table Group', + color: 'basic', + style: 'background: var(--button-generic-background-color); width: unset;', + onclick: () => emitEvent('AddTableGroupClicked', {}), + }) + : '', + ) + ); +} + +/** + * @typedef TruncatedTextOptions + * @type {object} + * @property {number} max + * @property {string?} class + * + * @param {TruncatedTextOptions} options + * @param {string[]} children + */ +const TruncatedText = ({ max, ...options }, ...children) => { + const sortedChildren = [...children.sort((a, b) => a.length - b.length)]; + const tooltipText = children.sort((a, b) => a.localeCompare(b)).join(', '); + + return div( + { class: () => `${options.class ?? ''}`, style: 'position: relative;' }, + span(sortedChildren.slice(0, max).join(', ')), + sortedChildren.length > max + ? withTooltip( + i({class: 'text-caption'}, ` + ${sortedChildren.length - max} more`), + { + text: tooltipText, + position: 'top-right', + } + ) + : '', + ); +}; + +const stylesheet = new CSSStyleSheet(); +stylesheet.replace(` +.tg-tablegroup--card-title h4 { + margin: 0; + color: var(--primary-text-color); + font-size: 1.5rem; + text-transform: initial; +} + +.tg-empty-state.mt-4 { + margin-top: 16px; +} +`); + +export { TableGroupList }; diff --git a/testgen/ui/components/frontend/js/pages/test_definition_summary.js b/testgen/ui/components/frontend/js/pages/test_definition_summary.js new file mode 100644 index 00000000..42984d50 --- /dev/null +++ b/testgen/ui/components/frontend/js/pages/test_definition_summary.js @@ -0,0 +1,143 @@ +/** + * @typedef TestDefinitionAttribute + * @type {object} + * @property {string} label + * @property {string} value + * @property {string?} help + * + * @typedef TestDefinition + * @type {object} + * @property {string} schema + * @property {string} test_suite_name + * @property {string} table_name + * @property {string} test_focus + * @property {string?} status + * @property {string} severity + * @property 
{string} active + * @property {string} locked + * @property {string} export_to_observability + * @property {string?} last_manual_update + * @property {string?} usage_notes + * @property {Array} attributes + * + * @typedef Properties + * @type {object} + * @property {TestDefinition} test_definition + */ +import van from '../van.min.js'; +import { Streamlit } from '../streamlit.js'; +import { getValue, loadStylesheet, resizeFrameHeightOnDOMChange, resizeFrameHeightToElement } from '../utils.js'; +import { Alert } from '../components/alert.js'; +import { Attribute } from '../components/attribute.js'; + +const { div, strong } = van.tags; + +/** + * @param {Properties} props + * @returns + */ +const TestDefinitionSummary = (props) => { + loadStylesheet('test-definition-summary', stylesheet) + Streamlit.setFrameHeight(1); + window.testgen.isPage = true; + + const wrapperId = 'test-definition-summary'; + + resizeFrameHeightToElement(wrapperId); + resizeFrameHeightOnDOMChange(wrapperId); + + return div( + {id: wrapperId}, + () => { + const testDefinition = getValue(props.test_definition); + console.log(testDefinition); + + return div( + { class: 'flex-column' }, + div( + { class: 'flex-row fx-gap-1 fx-align-flex-start' }, + div( + { class: 'flex-column fx-flex fx-gap-4 test-definition-attributes'}, + Attribute({ + label: 'Schema Name', + value: testDefinition.schema, + class: 'fx-flex' + }), + Attribute({ + label: 'Test Suite Name', + value: testDefinition.test_suite_name, + class: 'fx-flex' + }), + Attribute({ + label: 'Table Name', + value: testDefinition.table_name, + class: 'fx-flex' + }), + Attribute({ + label: 'Test Focus', + value: testDefinition.test_focus, + class: 'fx-flex' + }), + Attribute({ + label: 'Test Active', + value: testDefinition.active, + class: 'fx-flex' + }), + Attribute({ + label: 'Lock Refresh', + value: testDefinition.locked, + class: 'fx-flex' + }), + Attribute({ + label: 'Last Manual Update', + value: testDefinition.last_manual_update + ? 
Intl.DateTimeFormat("en-US", {dateStyle: 'long', timeStyle: 'long'}).format(Date.parse(testDefinition.last_manual_update)) + : undefined, + class: 'fx-flex' + }), + ), + div( + { class: 'flex-column fx-flex fx-gap-4 test-definition-attributes'}, + Attribute({ + label: 'Test Result Urgency', + value: testDefinition.severity, + class: 'fx-flex' + }), + Attribute({ + label: 'Send to Observability', + value: testDefinition.export_to_observability, + class: 'fx-flex' + }), + testDefinition.attributes.map(attribute => + Attribute({ + label: attribute.label, + value: attribute.value, + help: attribute.help, + class: 'fx-flex' + }) + ), + ), + ), + testDefinition.usage_notes + ? Alert( + { type: 'info', class: 'mt-4' }, + strong({class: 'mb-4'}, 'Usage Notes'), + testDefinition.usage_notes, + ) + : '', + ); + }, + ); +}; + +const stylesheet = new CSSStyleSheet(); +stylesheet.replace(` +.test-definition-attributes > div .text-caption { + font-size: 14px; +} +.test-definition-attributes > div .attribute-value { + font-size: 16px; +} +`); + +export { TestDefinitionSummary }; diff --git a/testgen/ui/components/frontend/js/pages/test_runs.js b/testgen/ui/components/frontend/js/pages/test_runs.js index 9c5713d1..0159b0cd 100644 --- a/testgen/ui/components/frontend/js/pages/test_runs.js +++ b/testgen/ui/components/frontend/js/pages/test_runs.js @@ -20,6 +20,7 @@ * @typedef Permissions * @type {object} * @property {boolean} can_run + * @property {boolean} can_edit * * @typedef Properties * @type {object} @@ -34,8 +35,9 @@ import { Button } from '../components/button.js'; import { Streamlit } from '../streamlit.js'; import { emitEvent, getValue, resizeFrameHeightToElement } from '../utils.js'; import { formatTimestamp, formatDuration } from '../display_utils.js'; +import { Checkbox } from '../components/checkbox.js'; -const { div, span, i } = van.tags; +const { div, i, span, strong } = van.tags; const TestRuns = (/** @type Properties */ props) => { window.testgen.isPage = true; @@ 
-48,36 +50,100 @@ const TestRuns = (/** @type Properties */ props) => { Streamlit.setFrameHeight(100 * items.length); return items; }); - const columns = ['30%', '20%', '40%', '10%']; + const columns = ['5%', '28%', '17%', '40%', '10%']; const userCanRun = getValue(props.permissions)?.can_run ?? false; + const userCanEdit = getValue(props.permissions)?.can_edit ?? false; + const selectedRuns = {}; const tableId = 'test-runs-table'; resizeFrameHeightToElement(tableId); + const initializeSelectedStates = (items) => { + for (const testRun of items) { + if (selectedRuns[testRun.test_run_id] == undefined) { + selectedRuns[testRun.test_run_id] = van.state(false); + } + } + }; + + initializeSelectedStates(testRunItems.val); + + van.derive(() => { + initializeSelectedStates(testRunItems.val); + }); + return div( { class: 'table', id: tableId }, + () => { + const items = testRunItems.val; + const selectedItems = items.filter(i => selectedRuns[i.test_run_id]?.val ?? false); + const someRunSelected = selectedItems.length > 0; + const tooltipText = !someRunSelected ? 'No runs selected' : undefined; + + if (!userCanEdit) { + return ''; + } + + return div( + { class: 'flex-row fx-justify-content-flex-end pb-2' }, + someRunSelected ? strong({class: 'mr-1'}, selectedItems.length) : '', + someRunSelected ? span({class: 'mr-4'}, 'runs selected') : '', + Button({ + type: 'stroked', + icon: 'delete', + label: 'Delete Runs', + tooltip: tooltipText, + tooltipPosition: 'bottom-left', + disabled: !someRunSelected, + width: 'auto', + onclick: () => emitEvent('RunsDeleted', { payload: selectedItems.map(i => i.test_run_id) }), + }), + ); + }, div( { class: 'table-header flex-row' }, + () => { + const items = testRunItems.val; + const selectedItems = items.filter(i => selectedRuns[i.test_run_id]?.val ?? 
false); + const allSelected = selectedItems.length === items.length; + const partiallySelected = selectedItems.length > 0 && selectedItems.length < items.length; + + if (!userCanEdit) { + return ''; + } + + return span( + { style: `flex: ${columns[0]}` }, + userCanEdit + ? Checkbox({ + checked: allSelected, + indeterminate: partiallySelected, + onChange: (checked) => items.forEach(item => selectedRuns[item.test_run_id].val = checked), + testId: 'select-all-test-run', + }) + : '', + ); + }, span( - { style: `flex: ${columns[0]}` }, + { style: `flex: ${columns[1]}` }, 'Start Time | Table Group | Test Suite', ), span( - { style: `flex: ${columns[1]}` }, + { style: `flex: ${columns[2]}` }, 'Status | Duration', ), span( - { style: `flex: ${columns[2]}` }, + { style: `flex: ${columns[3]}` }, 'Results Summary', ), span( - { style: `flex: ${columns[3]}` }, + { style: `flex: ${columns[4]}` }, 'Testing Score', ), ), () => div( - testRunItems.val.map(item => TestRunItem(item, columns, userCanRun)), + testRunItems.val.map(item => TestRunItem(item, columns, selectedRuns[item.test_run_id], userCanRun, userCanEdit)), ), ); } @@ -85,12 +151,24 @@ const TestRuns = (/** @type Properties */ props) => { const TestRunItem = ( /** @type TestRun */ item, /** @type string[] */ columns, + /** @type boolean */ selected, /** @type boolean */ userCanRun, + /** @type boolean */ userCanEdit, ) => { return div( { class: 'table-row flex-row' }, + userCanEdit + ? 
div( + { style: `flex: ${columns[0]}; font-size: 16px;` }, + Checkbox({ + checked: selected, + onChange: (checked) => selected.val = checked, + testId: 'select-test-run', + }), + ) + : '', div( - { style: `flex: ${columns[0]}` }, + { style: `flex: ${columns[1]}` }, Link({ label: formatTimestamp(item.test_starttime), href: 'test-runs:results', @@ -103,7 +181,7 @@ const TestRunItem = ( ), ), div( - { class: 'flex-row', style: `flex: ${columns[1]}` }, + { class: 'flex-row', style: `flex: ${columns[2]}` }, div( TestRunStatus(item), div( @@ -119,7 +197,7 @@ const TestRunItem = ( }) : null, ), div( - { class: 'pr-3', style: `flex: ${columns[2]}` }, + { class: 'pr-3', style: `flex: ${columns[3]}` }, item.test_ct ? SummaryBar({ items: [ { label: 'Passed', value: item.passed_ct, color: 'green' }, @@ -128,14 +206,16 @@ const TestRunItem = ( { label: 'Error', value: item.error_ct, color: 'brown' }, { label: 'Dismissed', value: item.dismissed_ct, color: 'grey' }, ], - height: 10, - width: 400, + height: 8, + width: 350, }) : '--', ), div( - { style: `flex: ${columns[3]}; font-size: 16px;` }, - item.dq_score_testing ?? '--', - ) + { style: `flex: ${columns[4]}; font-size: 16px;` }, + item.test_ct && item.dq_score_testing + ? item.dq_score_testing + : '--', + ), ); } diff --git a/testgen/ui/components/frontend/js/pages/test_suites.js b/testgen/ui/components/frontend/js/pages/test_suites.js index 923e9de3..4aba36ce 100644 --- a/testgen/ui/components/frontend/js/pages/test_suites.js +++ b/testgen/ui/components/frontend/js/pages/test_suites.js @@ -77,7 +77,7 @@ const TestSuites = (/** @type Properties */ props) => { ? div( { class: 'tg-test-suites'}, () => div( - { class: 'tg-test-suites--toolbar flex-row fx-align-flex-end mb-4' }, + { class: 'flex-row fx-align-flex-end fx-justify-space-between mb-4' }, Select({ label: 'Table Group', value: getValue(props.table_group_filter_options)?.find((op) => op.selected)?.value ?? 
null, @@ -88,16 +88,29 @@ const TestSuites = (/** @type Properties */ props) => { testId: 'table-group-filter', onChange: (value) => emitEvent('FilterApplied', {payload: value}), }), - userCanEdit - ? Button({ - icon: 'add', + div( + { class: 'flex-row fx-gap-4' }, + Button({ + icon: 'today', type: 'stroked', - label: 'Add Test Suite', + label: 'Test Run Schedules', + tooltip: 'Manage when test suites should run', + tooltipPosition: 'bottom', width: 'fit-content', - style: 'margin-left: auto; background: var(--dk-card-background);', - onclick: () => emitEvent('AddTestSuiteClicked', {}), - }) - : '', + style: 'background: var(--dk-card-background);', + onclick: () => emitEvent('RunSchedulesClicked', {}), + }), + userCanEdit + ? Button({ + icon: 'add', + type: 'stroked', + label: 'Add Test Suite', + width: 'fit-content', + style: 'background: var(--dk-card-background);', + onclick: () => emitEvent('AddTestSuiteClicked', {}), + }) + : '', + ), ), () => div( { class: 'flex-column' }, @@ -245,8 +258,8 @@ const ConditionalEmptyState = ( message: EMPTY_STATE_MESSAGE.tableGroup, link: { label: 'Go to Table Groups', - href: 'connections:table-groups', - params: { connection_id: projectSummary.default_connection_id }, + href: 'table-groups', + params: { project_code: projectSummary.project_code, connection_id: projectSummary.default_connection_id }, }, }; } diff --git a/testgen/ui/components/frontend/js/utils.js b/testgen/ui/components/frontend/js/utils.js index 9edab31e..f912916d 100644 --- a/testgen/ui/components/frontend/js/utils.js +++ b/testgen/ui/components/frontend/js/utils.js @@ -193,4 +193,8 @@ function slugify(/** @type string */ str) { .replace(/^-|-$/g, ''); } -export { afterMount, debounce, emitEvent, enforceElementWidth, getRandomId, getValue, getParents, isEqual, isState, loadStylesheet, resizeFrameHeightToElement, resizeFrameHeightOnDOMChange, friendlyPercent, slugify }; +function isDataURL(/** @type string */ url) { + return url.startsWith('data:'); +} + 
+export { afterMount, debounce, emitEvent, enforceElementWidth, getRandomId, getValue, getParents, isEqual, isState, loadStylesheet, resizeFrameHeightToElement, resizeFrameHeightOnDOMChange, friendlyPercent, slugify, isDataURL }; diff --git a/testgen/ui/components/widgets/__init__.py b/testgen/ui/components/widgets/__init__.py index fcda10a0..5716f6c9 100644 --- a/testgen/ui/components/widgets/__init__.py +++ b/testgen/ui/components/widgets/__init__.py @@ -14,9 +14,9 @@ flex_row_center, flex_row_end, flex_row_start, + help_menu, no_flex_gap, page_header, - page_links, text, whitespace, ) diff --git a/testgen/ui/components/widgets/page.py b/testgen/ui/components/widgets/page.py index 02a8c34e..721c1063 100644 --- a/testgen/ui/components/widgets/page.py +++ b/testgen/ui/components/widgets/page.py @@ -1,13 +1,16 @@ import streamlit as st from streamlit.delta_generator import DeltaGenerator +from testgen import settings +from testgen.common import version_service from testgen.ui.components.widgets.breadcrumbs import Breadcrumb from testgen.ui.components.widgets.breadcrumbs import breadcrumbs as tg_breadcrumbs +from testgen.ui.components.widgets.testgen_component import testgen_component +from testgen.ui.services import user_session_service +from testgen.ui.views.dialogs.application_logs_dialog import application_logs_dialog + +UPGRADE_URL = "https://docs.datakitchen.io/articles/#!dataops-testgen-help/upgrade-testgen" -BASE_HELP_URL = "https://docs.datakitchen.io/articles/#!dataops-testgen-help/" -DEFAULT_HELP_TOPIC = "dataops-testgen-help" -SLACK_URL = "https://data-observability-slack.datakitchen.io/join" -TRAINING_URL = "https://info.datakitchen.io/data-quality-training-and-certifications" def page_header( title: str, @@ -16,7 +19,7 @@ def page_header( ): with st.container(): no_flex_gap() - title_column, links_column = st.columns([0.95, 0.05], vertical_alignment="bottom") + title_column, links_column = st.columns([0.75, 0.25], vertical_alignment="bottom") with 
title_column: no_flex_gap() @@ -25,17 +28,52 @@ def page_header( tg_breadcrumbs(breadcrumbs=breadcrumbs) with links_column: - page_links(help_topic) + help_menu(help_topic) st.html('
') -def page_links(help_topic: str | None = None): - css_class("tg-header--links") - flex_row_end() - st.link_button(":material/question_mark:", f"{BASE_HELP_URL}{help_topic or DEFAULT_HELP_TOPIC}", help="Help Center") - st.link_button(":material/group:", SLACK_URL, help="Slack Community") - st.link_button(":material/school:", TRAINING_URL, help="Training Portal") +def help_menu(help_topic: str | None = None) -> None: + with st.container(key="tg-header--help"): + version = version_service.get_version() + if version.latest != version.current: + st.page_link(UPGRADE_URL, label=f":small[:red[New version available! {version.latest}]]") + + help_container = st.empty() + + # Hack to programmatically close popover: https://github.com/streamlit/streamlit/issues/8265#issuecomment-3001655849 + def close_help(rerun: bool = False) -> None: + with help_container.container(key="tg-header--help-dummy"): + flex_row_end() + st.markdown("Help :material/keyboard_arrow_down:") + if rerun: + st.rerun() + + def open_app_logs(): + close_help() + application_logs_dialog() + + with help_container.container(): + flex_row_end() + with st.popover("Help"): + css_class("tg-header--help-wrapper") + testgen_component( + "help_menu", + props={ + "help_topic": help_topic, + "support_email": settings.SUPPORT_EMAIL, + "version": version.__dict__, + "permissions": { + "can_edit": user_session_service.user_can_edit(), + }, + }, + on_change_handlers={ + "AppLogsClicked": lambda _: open_app_logs(), + }, + event_handlers={ + "ExternalLinkClicked": lambda _: close_help(rerun=True), + }, + ) def whitespace(size: float, unit: str = "rem", container: DeltaGenerator | None = None): diff --git a/testgen/ui/components/widgets/select.py b/testgen/ui/components/widgets/select.py index 31fa748c..23d65d96 100644 --- a/testgen/ui/components/widgets/select.py +++ b/testgen/ui/components/widgets/select.py @@ -1,3 +1,5 @@ +import re + import pandas as pd import streamlit as st from streamlit_extras.no_default_selectbox 
import selectbox @@ -5,6 +7,9 @@ from testgen.ui.navigation.router import Router EMPTY_VALUE = "---" +CUSTOM_VALUE_TEMPLATE = "Custom: {value}" +CUSTOM_VALUE_PATTERN = r"Custom: (.+)" def select( label: str, @@ -15,42 +20,61 @@ def select( required: bool = False, bind_to_query: str | None = None, bind_empty_value: bool = False, + accept_new_options: bool = False, + custom_values_wrap: str | None = "%{}%", **kwargs, ): - kwargs = {**kwargs} + kwargs = {**kwargs, "accept_new_options": accept_new_options} kwargs["label"] = label + kwargs["index"] = None + + option_values = options + option_display_labels = options if isinstance(options, pd.DataFrame): value_column = value_column or options.columns[0] display_column = display_column or value_column - kwargs["options"] = options[display_column] - if default_value in options[value_column].values: - kwargs["index"] = int(options[options[value_column] == default_value].index[0]) + (0 if required else 1) - else: - kwargs["options"] = options - if default_value in options: - kwargs["index"] = options.index(default_value) + (0 if required else 1) - elif default_value == EMPTY_VALUE and not required: - kwargs["index"] = 0 + + option_values = options[value_column].values.tolist() + option_display_labels = options[display_column].values.tolist() + + kwargs["options"] = [*option_display_labels] + if default_value in option_values: + kwargs["index"] = option_values.index(default_value) + (0 if required else 1) + elif default_value == EMPTY_VALUE and not required: + kwargs["index"] = 0 + elif default_value and default_value != EMPTY_VALUE and accept_new_options: + kwargs["options"].append(CUSTOM_VALUE_TEMPLATE.format(value=default_value)) + kwargs["index"] = len(kwargs["options"]) - (1 if required else 0) if bind_to_query: kwargs["key"] = kwargs.get("key", f"testgen_select_{bind_to_query}") - if default_value is not None and kwargs.get("index") is None: - Router().set_query_params({ bind_to_query: None }) # Unset the query params if the current value is
not valid + + # Unset the query params if the current value is not valid and new options are not allowed + if default_value is not None and kwargs.get("index") is None and not accept_new_options: + Router().set_query_params({ bind_to_query: None }) def update_query_params(): query_value = st.session_state[kwargs["key"]] if not required and query_value == EMPTY_VALUE and not bind_empty_value: query_value = None - elif isinstance(options, pd.DataFrame): - query_value = options.loc[options[display_column] == query_value, value_column].iloc[0] + elif query_value in option_display_labels: + query_value = option_values[option_display_labels.index(query_value)] + # elif isinstance(options, pd.DataFrame) and default_value in options[value_column].values: + # query_value = options.loc[options[display_column] == query_value, value_column].iloc[0] Router().set_query_params({ bind_to_query: query_value }) kwargs["on_change"] = update_query_params selected = st.selectbox(**kwargs) if required else selectbox(**kwargs) - if selected and isinstance(options, pd.DataFrame): - return options.loc[options[display_column] == selected, value_column].iloc[0] + if selected: + if selected in option_display_labels: + selected = option_values[option_display_labels.index(selected)] + + if accept_new_options and (match := re.match(CUSTOM_VALUE_PATTERN, selected)): + selected = match.group(1) + if custom_values_wrap: + selected = custom_values_wrap.format(selected) return selected diff --git a/testgen/ui/components/widgets/sidebar.py b/testgen/ui/components/widgets/sidebar.py index 9ff16734..e1f8002e 100644 --- a/testgen/ui/components/widgets/sidebar.py +++ b/testgen/ui/components/widgets/sidebar.py @@ -2,12 +2,12 @@ import time from typing import Literal +from testgen.common.version_service import Version from testgen.ui.components.utils.component import component from testgen.ui.navigation.menu import Menu from testgen.ui.navigation.router import Router from testgen.ui.services import 
javascript_service, user_session_service from testgen.ui.session import session -from testgen.ui.views.dialogs.application_logs_dialog import application_logs_dialog LOG = logging.getLogger("testgen") @@ -19,9 +19,12 @@ def sidebar( key: str = SIDEBAR_KEY, projects: list[dict[Literal["name", "codde"], str]] | None = None, current_project: str | None = None, - username: str | None = None, menu: Menu = None, current_page: str | None = None, + username: str | None = None, + role: str | None = None, + version: Version | None = None, + support_email: str | None = None, ) -> None: """ Testgen custom component to display a styled menu over streamlit's @@ -38,13 +41,13 @@ def sidebar( props={ "projects": projects, "current_project": current_project, - "username": username, "menu": menu.filter_for_current_user().sort_items().unflatten().asdict(), "current_page": current_page, + "username": username, + "role": role, "logout_path": LOGOUT_PATH, - "permissions": { - "can_edit": user_session_service.user_can_edit(), - }, + "version": version.__dict__, + "support_email": support_email, }, key=key, on_change=on_change, @@ -65,9 +68,7 @@ def on_change(): return session.sidebar_event_id = event_id - if event_data.get("view_logs"): - application_logs_dialog() - elif event_data.get("path") == LOGOUT_PATH: + if event_data.get("path") == LOGOUT_PATH: javascript_service.clear_component_states() user_session_service.end_user_session() Router().queue_navigation(to="") diff --git a/testgen/ui/components/widgets/sorting_selector.py b/testgen/ui/components/widgets/sorting_selector.py index 8b168f1c..5dd1cc95 100644 --- a/testgen/ui/components/widgets/sorting_selector.py +++ b/testgen/ui/components/widgets/sorting_selector.py @@ -73,14 +73,25 @@ def sorting_selector( if state is None: state = default - with st.popover(popover_label): - new_state = component( - id_="sorting_selector", - key=key, - default=state, - on_change=on_change, - props={"columns": columns, "state": state}, - ) + 
popover_container = st.empty() + + def handle_change() -> None: + if on_change: + on_change() + + # Hack to programmatically close popover: https://github.com/streamlit/streamlit/issues/8265#issuecomment-3001655849 + with popover_container.container(): + st.button(label=f"{popover_label} :material/keyboard_arrow_up:", disabled=True) + + with popover_container.container(): + with st.popover(popover_label): + new_state = component( + id_="sorting_selector", + key=key, + default=state, + on_change=handle_change, + props={"columns": columns, "state": state}, + ) # For some unknown reason, sometimes, streamlit returns None as the component state new_state = [] if new_state is None else new_state diff --git a/testgen/ui/components/widgets/testgen_component.py b/testgen/ui/components/widgets/testgen_component.py index ee80d18d..d52b2bdc 100644 --- a/testgen/ui/components/widgets/testgen_component.py +++ b/testgen/ui/components/widgets/testgen_component.py @@ -20,6 +20,7 @@ "column_selector", "connections", "table_group_wizard", + "help_menu", ] diff --git a/testgen/ui/navigation/menu.py b/testgen/ui/navigation/menu.py index 7c519baf..d44002a1 100644 --- a/testgen/ui/navigation/menu.py +++ b/testgen/ui/navigation/menu.py @@ -17,17 +17,9 @@ class MenuItem: items: list["MenuItem"] | None = dataclasses.field(default=None) -@dataclasses.dataclass -class Version: - current: str - latest: str - schema: str - - @dataclasses.dataclass class Menu: items: list[MenuItem] - version: Version def filter_for_current_user(self) -> "Menu": filtered_items = [] @@ -53,8 +45,5 @@ def unflatten(self) -> "Menu": unflattened_items.append(MenuItem(label=label, items=items)) return dataclasses.replace(self, items=unflattened_items) - def update_version(self, version: Version) -> "Menu": - return dataclasses.replace(self, version=version) - def asdict(self): return dataclasses.asdict(self) diff --git a/testgen/ui/pdf/hygiene_issue_report.py b/testgen/ui/pdf/hygiene_issue_report.py index 
1e3ddda3..31844a78 100644 --- a/testgen/ui/pdf/hygiene_issue_report.py +++ b/testgen/ui/pdf/hygiene_issue_report.py @@ -4,6 +4,7 @@ from reportlab.lib.styles import ParagraphStyle from reportlab.platypus import CondPageBreak, KeepTogether, Paragraph, Table, TableStyle +from testgen.settings import ISSUE_REPORT_SOURCE_DATA_LOOKUP_LIMIT from testgen.ui.pdf.dataframe_table import DataFrameTableBuilder from testgen.ui.pdf.style import ( COLOR_GRAY_BG, @@ -26,10 +27,10 @@ SECTION_MIN_AVAILABLE_HEIGHT = 120 CLASS_COLORS = { - "Definite": HexColor(0xE94D4A), - "Likely": HexColor(0xFC8F2A), - "Possible": HexColor(0xFCD349), - "Potential PII": HexColor(0xFC8F2A), + "Definite": HexColor(0xEF5350), + "Likely": HexColor(0xFF9800), + "Possible": HexColor(0xFBC02D), + "Potential PII": HexColor(0x8D6E63), } def build_summary_table(document, hi_data): @@ -185,7 +186,7 @@ def get_report_content(document, hi_data): yield Paragraph("Suggested Action", style=PARA_STYLE_H1) yield Paragraph(hi_data["suggested_action"], style=PARA_STYLE_TEXT) - sample_data_tuple = get_source_data(hi_data) + sample_data_tuple = get_source_data(hi_data, limit=ISSUE_REPORT_SOURCE_DATA_LOOKUP_LIMIT) yield CondPageBreak(SECTION_MIN_AVAILABLE_HEIGHT) yield Paragraph("Sample Data", PARA_STYLE_H1) diff --git a/testgen/ui/pdf/test_result_report.py b/testgen/ui/pdf/test_result_report.py index 883b0346..f583c71e 100644 --- a/testgen/ui/pdf/test_result_report.py +++ b/testgen/ui/pdf/test_result_report.py @@ -10,6 +10,7 @@ TableStyle, ) +from testgen.settings import ISSUE_REPORT_SOURCE_DATA_LOOKUP_LIMIT from testgen.ui.pdf.dataframe_table import TABLE_STYLE_DATA, DataFrameTableBuilder from testgen.ui.pdf.style import ( COLOR_GRAY_BG, @@ -37,9 +38,10 @@ SECTION_MIN_AVAILABLE_HEIGHT = 120 RESULT_STATUS_COLORS = { - "Passed": HexColor(0x94C465), - "Warning": HexColor(0xFCD349), - "Failed": HexColor(0xE94D4A), + "Passed": HexColor(0x8BC34A), + "Warning": HexColor(0xFBC02D), + "Failed": HexColor(0xEF5350), + "Error": 
HexColor(0x8D6E63), } @@ -164,7 +166,7 @@ def build_summary_table(document, tr_data): def build_history_table(document, tr_data): - history_data = get_test_result_history(get_schema(), tr_data) + history_data = get_test_result_history(get_schema(), tr_data, limit=15) history_table_style = TableStyle( ( @@ -241,9 +243,17 @@ def get_report_content(document, tr_data): yield build_history_table(document, tr_data) if tr_data["test_type"] == "CUSTOM": - sample_data_tuple = do_source_data_lookup_custom(get_schema(), tr_data) + sample_data_tuple = do_source_data_lookup_custom( + get_schema(), + tr_data, + limit=ISSUE_REPORT_SOURCE_DATA_LOOKUP_LIMIT, + ) else: - sample_data_tuple = do_source_data_lookup(get_schema(), tr_data) + sample_data_tuple = do_source_data_lookup( + get_schema(), + tr_data, + limit=ISSUE_REPORT_SOURCE_DATA_LOOKUP_LIMIT, + ) yield CondPageBreak(SECTION_MIN_AVAILABLE_HEIGHT) yield Paragraph("Sample Data", PARA_STYLE_H1) diff --git a/testgen/ui/queries/connection_queries.py b/testgen/ui/queries/connection_queries.py index c3aad89c..fc86517f 100644 --- a/testgen/ui/queries/connection_queries.py +++ b/testgen/ui/queries/connection_queries.py @@ -10,7 +10,8 @@ def get_by_id(connection_id): str_schema = st.session_state["dbschema"] str_sql = f""" SELECT id::VARCHAR(50), project_code, connection_id, connection_name, - sql_flavor, sql_flavor_code, project_host, project_port, project_user, + sql_flavor, COALESCE(sql_flavor_code, sql_flavor) AS sql_flavor_code, + project_host, project_port, project_user, project_db, project_pw_encrypted, NULL as password, max_threads, max_query_chars, url, connect_by_url, connect_by_key, private_key, private_key_passphrase, http_path @@ -24,7 +25,8 @@ def get_connections(project_code): str_schema = st.session_state["dbschema"] str_sql = f""" SELECT id::VARCHAR(50), project_code, connection_id, connection_name, - sql_flavor, sql_flavor_code, project_host, project_port, project_user, + sql_flavor, COALESCE(sql_flavor_code, 
sql_flavor) AS sql_flavor_code, + project_host, project_port, project_user, project_db, project_pw_encrypted, NULL as password, max_threads, max_query_chars, connect_by_url, url, connect_by_key, private_key, private_key_passphrase, http_path diff --git a/testgen/ui/queries/profiling_queries.py b/testgen/ui/queries/profiling_queries.py index 4893e0ec..db755ab9 100644 --- a/testgen/ui/queries/profiling_queries.py +++ b/testgen/ui/queries/profiling_queries.py @@ -94,14 +94,20 @@ def get_run_by_id(profile_run_id: str) -> pd.Series: @st.cache_data(show_spinner=False) -def get_profiling_results(profiling_run_id: str, table_name: str, column_name: str, sorting_columns = None): +def get_profiling_results(profiling_run_id: str, table_name: str | None = None, column_name: str | None = None, sorting_columns = None): + db_session = get_current_session() + params = { + "profiling_run_id": profiling_run_id, + "table_name": table_name if table_name else "%%", + "column_name": column_name if column_name else "%%", + } + order_by = "" if sorting_columns is None: order_by = "ORDER BY schema_name, table_name, position" elif len(sorting_columns): order_by = "ORDER BY " + ", ".join(" ".join(col) for col in sorting_columns) - schema: str = st.session_state["dbschema"] query = f""" SELECT id::VARCHAR, @@ -125,27 +131,94 @@ def get_profiling_results(profiling_run_id: str, table_name: str, column_name: s functional_table_type AS semantic_table_type, CASE WHEN EXISTS( SELECT 1 - FROM {schema}.profile_anomaly_results + FROM profile_anomaly_results WHERE profile_run_id = profile_results.profile_run_id AND table_name = profile_results.table_name AND column_name = profile_results.column_name ) THEN 'Yes' END AS hygiene_issues - FROM {schema}.profile_results - WHERE profile_run_id = '{profiling_run_id}' - AND table_name ILIKE '{table_name}' - AND column_name ILIKE '{column_name}' + FROM profile_results + WHERE profile_run_id = :profiling_run_id + AND table_name ILIKE :table_name + AND 
column_name ILIKE :column_name {order_by}; """ - return db.retrieve_data(query) + + results = db_session.execute(query, params=params) + columns = [column.name for column in results.cursor.description] + + return pd.DataFrame(list(results), columns=columns) @st.cache_data(show_spinner=False) -def get_table_by_id(table_id: str) -> dict | None: +def get_table_by_id( + table_id: str, + include_tags: bool = False, + include_has_test_runs: bool = False, + include_active_tests: bool = False, + include_scores: bool = False, +) -> dict | None: if not is_uuid4(table_id): return None + + condition = f"WHERE table_id = '{table_id}'" + return get_tables_by_condition(condition, include_tags, include_has_test_runs, include_active_tests, include_scores)[0] + + +def get_tables_by_id( + table_ids: list[str], + include_tags: bool = False, + include_has_test_runs: bool = False, + include_active_tests: bool = False, + include_scores: bool = False, +) -> list[dict] | None: + condition = f""" + INNER JOIN ( + SELECT UNNEST(ARRAY [{", ".join([ f"'{col}'" for col in table_ids if is_uuid4(col) ])}]) AS id + ) selected ON (table_chars.table_id = selected.id::UUID)""" + return get_tables_by_condition(condition, include_tags, include_has_test_runs, include_active_tests, include_scores) + +def get_tables_by_table_group( + table_group_id: str, + include_tags: bool = False, + include_has_test_runs: bool = False, + include_active_tests: bool = False, + include_scores: bool = False, +) -> list[dict] | None: + if not is_uuid4(table_group_id): + return None + + condition = f"WHERE table_chars.table_groups_id = '{table_group_id}'" + return get_tables_by_condition(condition, include_tags, include_has_test_runs, include_active_tests, include_scores) + + +def get_tables_by_condition( + filter_condition: str, + include_tags: bool = False, + include_has_test_runs: bool = False, + include_active_tests: bool = False, + include_scores: bool = False, +) -> list[dict] | None: schema: str = 
st.session_state["dbschema"] query = f""" + {f""" + WITH active_test_definitions AS ( + SELECT + test_defs.table_groups_id, + test_defs.table_name, + COUNT(*) AS count + FROM {schema}.test_definitions test_defs + LEFT JOIN {schema}.data_column_chars ON ( + test_defs.table_groups_id = data_column_chars.table_groups_id + AND test_defs.table_name = data_column_chars.table_name + AND test_defs.column_name = data_column_chars.column_name + ) + WHERE test_active = 'Y' + AND column_id IS NULL + GROUP BY test_defs.table_groups_id, + test_defs.table_name + ) + """ if include_active_tests else ""} SELECT table_chars.table_id::VARCHAR AS id, 'table' AS type, @@ -160,39 +233,59 @@ def get_table_by_id(table_id: str) -> dict | None: add_date, last_refresh_date, drop_date, + {f""" -- Table Tags table_chars.description, table_chars.critical_data_element, {", ".join([ f"table_chars.{tag}" for tag in TAG_FIELDS ])}, -- Table Groups Tags {", ".join([ f"table_groups.{tag} AS table_group_{tag}" for tag in TAG_FIELDS if tag != "aggregation_level" ])}, - -- Profile & Test Runs - table_chars.last_complete_profile_run_id::VARCHAR AS profile_run_id, - profiling_starttime AS profile_run_date, - TRUE AS is_latest_profile, + """ if include_tags else ""} + {f""" + -- Has Test Runs EXISTS( SELECT 1 FROM {schema}.test_results WHERE table_groups_id = table_chars.table_groups_id AND table_name = table_chars.table_name ) AS has_test_runs, + """ if include_has_test_runs else ""} + {""" + -- Test Definition Count + active_tests.count AS active_test_count, + """ if include_active_tests else ""} + {""" -- Scores table_chars.dq_score_profiling, - table_chars.dq_score_testing + table_chars.dq_score_testing, + """ if include_scores else ""} + -- Profile Run + table_chars.last_complete_profile_run_id::VARCHAR AS profile_run_id, + profiling_starttime AS profile_run_date, + TRUE AS is_latest_profile FROM {schema}.data_table_chars table_chars LEFT JOIN {schema}.profiling_runs ON ( 
table_chars.last_complete_profile_run_id = profiling_runs.id ) + {f""" LEFT JOIN {schema}.table_groups ON ( table_chars.table_groups_id = table_groups.id ) - WHERE table_id = '{table_id}'; + """ if include_tags else ""} + {""" + LEFT JOIN active_test_definitions active_tests ON ( + table_chars.table_groups_id = active_tests.table_groups_id + AND table_chars.table_name = active_tests.table_name + ) + """ if include_active_tests else ""} + {filter_condition} + ORDER BY table_name; """ results = db.retrieve_data(query) if not results.empty: # to_json converts datetimes, NaN, etc, to JSON-safe values (Note: to_dict does not) - return json.loads(results.to_json(orient="records"))[0] + return json.loads(results.to_json(orient="records")) @st.cache_data(show_spinner=False) @@ -200,13 +293,14 @@ def get_column_by_id( column_id: str, include_tags: bool = False, include_has_test_runs: bool = False, + include_active_tests: bool = False, include_scores: bool = False, ) -> dict | None: if not is_uuid4(column_id): return None - condition = f"column_chars.column_id = '{column_id}'" - return get_columns_by_condition(condition, include_tags, include_has_test_runs, include_scores)[0] + condition = f"WHERE column_chars.column_id = '{column_id}'" + return get_columns_by_condition(condition, include_tags, include_has_test_runs, include_active_tests, include_scores)[0] @st.cache_data(show_spinner="Loading data ...") @@ -216,33 +310,53 @@ def get_column_by_name( table_group_id: str, include_tags: bool = False, include_has_test_runs: bool = False, + include_active_tests: bool = False, include_scores: bool = False, ) -> dict | None: condition = f""" - column_chars.column_name = '{column_name}' + WHERE column_chars.column_name = '{column_name}' AND column_chars.table_name = '{table_name}' AND column_chars.table_groups_id = '{table_group_id}' """ - return get_columns_by_condition(condition, include_tags, include_has_test_runs, include_scores)[0] + return get_columns_by_condition(condition, 
include_tags, include_has_test_runs, include_active_tests, include_scores)[0] def get_columns_by_id( column_ids: list[str], include_tags: bool = False, include_has_test_runs: bool = False, + include_active_tests: bool = False, include_scores: bool = False, -) -> dict | None: - condition = f"column_chars.column_id IN ('{"', '".join([ col for col in column_ids if is_uuid4(col) ])}')" - return get_columns_by_condition(condition, include_tags, include_has_test_runs, include_scores) +) -> list[dict] | None: + condition = f""" + INNER JOIN ( + SELECT UNNEST(ARRAY [{", ".join([ f"'{col}'" for col in column_ids if is_uuid4(col) ])}]) AS id + ) selected ON (column_chars.column_id = selected.id::UUID)""" + return get_columns_by_condition(condition, include_tags, include_has_test_runs, include_active_tests, include_scores) + + +def get_columns_by_table_group( + table_group_id: str, + include_tags: bool = False, + include_has_test_runs: bool = False, + include_active_tests: bool = False, + include_scores: bool = False, +) -> list[dict] | None: + if not is_uuid4(table_group_id): + return None + + condition = f"WHERE column_chars.table_groups_id = '{table_group_id}'" + return get_columns_by_condition(condition, include_tags, include_has_test_runs, include_active_tests, include_scores) def get_columns_by_condition( filter_condition: str, include_tags: bool = False, include_has_test_runs: bool = False, + include_active_tests: bool = False, include_scores: bool = False, -) -> dict | None: +) -> list[dict] | None: schema: str = st.session_state["dbschema"] query = f""" @@ -273,11 +387,12 @@ def get_columns_by_condition( -- Table Groups Tags {", ".join([ f"table_groups.{tag} AS table_group_{tag}" for tag in TAG_FIELDS if tag != "aggregation_level" ])}, """ if include_tags else ""} - -- Profile & Test Runs + -- Profile Run column_chars.last_complete_profile_run_id::VARCHAR AS profile_run_id, run_date AS profile_run_date, TRUE AS is_latest_profile, {f""" + -- Has Test Runs EXISTS( 
SELECT 1 FROM {schema}.test_results @@ -286,6 +401,17 @@ def get_columns_by_condition( AND column_names = column_chars.column_name ) AS has_test_runs, """ if include_has_test_runs else ""} + {f""" + -- Test Definition Count + ( + SELECT COUNT(*) + FROM {schema}.test_definitions + WHERE table_groups_id = column_chars.table_groups_id + AND table_name = column_chars.table_name + AND column_name = column_chars.column_name + AND test_active = 'Y' + ) AS active_test_count, + """ if include_active_tests else ""} {""" -- Scores column_chars.dq_score_profiling, @@ -306,7 +432,8 @@ def get_columns_by_condition( AND column_chars.table_name = profile_results.table_name AND column_chars.column_name = profile_results.column_name ) - WHERE {filter_condition}; + {filter_condition} + ORDER BY table_name, ordinal_position; """ results = db.retrieve_data(query) diff --git a/testgen/ui/queries/profiling_run_queries.py b/testgen/ui/queries/profiling_run_queries.py index a2bfa805..ea40f93d 100644 --- a/testgen/ui/queries/profiling_run_queries.py +++ b/testgen/ui/queries/profiling_run_queries.py @@ -2,6 +2,7 @@ import testgen.ui.services.database_service as db from testgen.common import date_service +from testgen.common.models import get_current_session def update_status(profile_run_id: str, status: str) -> None: @@ -25,3 +26,22 @@ def cancel_all_running() -> None: SET status = 'Cancelled' WHERE status = 'Running'; """) + + +def cascade_delete_multiple_profiling_runs(profiling_run_ids: list[str]) -> None: + session = get_current_session() + + if not profiling_run_ids: + raise ValueError("No profiling run is specified.") + + params = {f"id_{idx}": value for idx, value in enumerate(profiling_run_ids)} + param_keys = [f":{slot}" for slot in params.keys()] + + with session.begin(): + session.execute(f"DELETE FROM profile_pair_rules WHERE profile_run_id IN ({', '.join(param_keys)})", params=params) + session.execute(f"DELETE FROM profile_anomaly_results WHERE profile_run_id IN ({', 
'.join(param_keys)})", params=params) + session.execute(f"DELETE FROM profile_results WHERE profile_run_id IN ({', '.join(param_keys)})", params=params) + session.execute(f"DELETE FROM profiling_runs WHERE id IN ({', '.join(param_keys)})", params=params) + session.commit() + + st.cache_data.clear() diff --git a/testgen/ui/queries/table_group_queries.py b/testgen/ui/queries/table_group_queries.py index 74c81c31..d69e54cb 100644 --- a/testgen/ui/queries/table_group_queries.py +++ b/testgen/ui/queries/table_group_queries.py @@ -7,17 +7,24 @@ def _get_select_statement(schema): return f""" - SELECT id::VARCHAR(50), project_code, connection_id, table_groups_name, - table_group_schema, - profiling_include_mask, profiling_exclude_mask, - profiling_table_set, - profile_id_column_mask, profile_sk_column_mask, - description, data_source, source_system, source_process, data_location, - business_domain, stakeholder_group, transform_level, data_product, - profile_use_sampling, profile_sample_percent, profile_sample_min_count, - profiling_delay_days, profile_flag_cdes - FROM {schema}.table_groups - """ + WITH table_groups AS ( + SELECT table_groups.*, connections.connection_name, connections.sql_flavor, + COALESCE(connections.sql_flavor_code, connections.sql_flavor) AS sql_flavor_code + FROM {schema}.table_groups + INNER JOIN {schema}.connections ON connections.connection_id = table_groups.connection_id + ) + SELECT id::VARCHAR(50), project_code, connection_id, connection_name, sql_flavor, sql_flavor_code, + table_groups_name, table_group_schema, + profiling_include_mask, profiling_exclude_mask, + profiling_table_set, + profile_id_column_mask, profile_sk_column_mask, + description, data_source, source_system, source_process, data_location, + business_domain, stakeholder_group, transform_level, data_product, + CASE WHEN profile_use_sampling = 'Y' THEN true ELSE false END AS profile_use_sampling, + profile_sample_percent, profile_sample_min_count, + profiling_delay_days, 
profile_flag_cdes + FROM table_groups + """ @st.cache_data(show_spinner=False) @@ -51,7 +58,6 @@ def get_test_suite_ids_by_table_group_names(schema, table_group_names): return db.retrieve_data(sql) - def get_table_group_dependencies(schema, table_group_names): if table_group_names is None or len(table_group_names) == 0: raise ValueError("No Table Group is specified.") @@ -82,6 +88,15 @@ def get_table_group_usage(schema, table_group_names): return db.retrieve_data(sql) +@st.cache_data(show_spinner=False) +def get_all(schema, project_code): + sql = _get_select_statement(schema) + sql += f"""WHERE project_code = '{project_code}' + ORDER BY table_groups_name + """ + return db.retrieve_data(sql) + + @st.cache_data(show_spinner=False) def get_by_connection(schema, project_code, connection_id): sql = _get_select_statement(schema) @@ -162,7 +177,7 @@ def add(schema, table_group) -> str: '{table_group["profiling_exclude_mask"]}', '{table_group["profile_id_column_mask"]}'::character varying(2000), '{table_group["profile_sk_column_mask"]}'::character varying, - '{'Y' if table_group["profile_use_sampling"]=='True' else 'N' }'::character varying, + '{'Y' if table_group["profile_use_sampling"] else 'N' }'::character varying, '{table_group["profile_sample_percent"]}'::character varying, {table_group["profile_sample_min_count"]}, '{table_group["profiling_delay_days"]}'::character varying, @@ -211,3 +226,21 @@ def cascade_delete(schema, table_group_names): delete from {schema}.table_groups where table_groups_name in ({",".join(table_group_items)});""" db.execute_sql(sql) st.cache_data.clear() + + +def get_test_suite_ids_by_table_group_id(schema, table_group_id: str) -> list[str]: + sql = f""" + SELECT ts.id::VARCHAR + FROM {schema}.test_suites ts + WHERE ts.table_groups_id = '{table_group_id}' + """ + return db.retrieve_data(sql) + + +def get_profiling_run_ids_by_table_group_id(schema, table_group_id: str) -> list[str]: + sql = f""" + SELECT pr.id::VARCHAR + FROM 
{schema}.profiling_runs pr + WHERE pr.table_groups_id = '{table_group_id}' + """ + return db.retrieve_data(sql) diff --git a/testgen/ui/queries/test_definition_queries.py b/testgen/ui/queries/test_definition_queries.py index ae16d4c6..161da4de 100644 --- a/testgen/ui/queries/test_definition_queries.py +++ b/testgen/ui/queries/test_definition_queries.py @@ -1,6 +1,8 @@ +import pandas as pd import streamlit as st import testgen.ui.services.database_service as db +from testgen.common.models import get_current_session, with_database_session def update_attribute(schema, test_definition_ids, attribute, value): @@ -19,72 +21,83 @@ def update_attribute(schema, test_definition_ids, attribute, value): @st.cache_data(show_spinner=False) -def get_test_definitions(schema, project_code, test_suite, table_name, column_name, test_definition_ids): - if table_name: - table_condition = f" AND d.table_name = '{table_name}'" - else: - table_condition = "" - if column_name: - column_condition = f" AND d.column_name = '{column_name}'" - else: - column_condition = "" - sql = f""" - SELECT - d.schema_name, d.table_name, d.column_name, t.test_name_short, t.test_name_long, - d.id::VARCHAR(50), - s.project_code, d.table_groups_id::VARCHAR(50), s.test_suite, d.test_suite_id::VARCHAR, - d.test_type, d.cat_test_id::VARCHAR(50), - d.test_active, - CASE WHEN d.test_active = 'Y' THEN 'Yes' ELSE 'No' END as test_active_display, - d.lock_refresh, - CASE WHEN d.lock_refresh = 'Y' THEN 'Yes' ELSE 'No' END as lock_refresh_display, - t.test_scope, - d.test_description, - d.profiling_as_of_date, - d.last_manual_update, - d.severity, COALESCE(d.severity, s.severity, t.default_severity) as urgency, - d.export_to_observability as export_to_observability_raw, - CASE - WHEN d.export_to_observability = 'Y' THEN 'Yes' - WHEN d.export_to_observability = 'N' THEN 'No' - WHEN d.export_to_observability IS NULL AND s.export_to_observability = 'Y' THEN 'Inherited (Yes)' - ELSE 'Inherited (No)' - END as 
export_to_observability, - -- test_action, - d.threshold_value, COALESCE(t.measure_uom_description, t.measure_uom) as export_uom, - d.baseline_ct, d.baseline_unique_ct, d.baseline_value, - d.baseline_value_ct, d.baseline_sum, d.baseline_avg, d.baseline_sd, - d.subset_condition, - d.groupby_names, d.having_condition, d.window_date_column, d.window_days, - d.match_schema_name, d.match_table_name, d.match_column_names, - d.match_subset_condition, d.match_groupby_names, d.match_having_condition, - d.skip_errors, d.custom_query, - COALESCE(d.test_description, t.test_description) as final_test_description, - t.default_parm_columns, t.selection_criteria, - d.profile_run_id::VARCHAR(50), d.test_action, d.test_definition_status, - d.watch_level, d.check_result, d.last_auto_gen_date, - d.test_mode - FROM {schema}.test_definitions d - INNER JOIN {schema}.test_types t ON (d.test_type = t.test_type) - INNER JOIN {schema}.test_suites s ON (d.test_suite_id = s.id) - WHERE True - """ +@with_database_session +def get_test_definitions(_, project_code, test_suite, table_name, column_name, test_definition_ids: list[str] | None): + db_session = get_current_session() + params = {} + order_by = "ORDER BY d.schema_name, d.table_name, d.column_name, d.test_type" + filters = "" if project_code: - sql += f""" AND s.project_code = '{project_code}' - """ + filters += " AND s.project_code = :project_code" + params["project_code"] = project_code if test_suite: - sql += f""" AND s.test_suite = '{test_suite}' {table_condition} {column_condition} - """ + filters += " AND s.test_suite = :test_suite" + params["test_suite"] = test_suite + if test_definition_ids: - sql += f""" AND d.id in ({"'" + "','".join(test_definition_ids) + "'"}) - """ + test_definition_params = {f"test_definition_id_{idx}": status for idx, status in enumerate(test_definition_ids)} + filters += f" AND d.id IN ({', '.join([f':{p}' for p in test_definition_params.keys()])})" + params.update(test_definition_params) - sql += """ORDER 
BY d.schema_name, d.table_name, d.column_name, d.test_type; + if table_name: + filters += " AND d.table_name = :table_name" + params["table_name"] = table_name + + if column_name: + filters += " AND d.column_name ILIKE :column_name" + params["column_name"] = column_name + + sql = f""" + SELECT + d.schema_name, d.table_name, d.column_name, t.test_name_short, t.test_name_long, + d.id::VARCHAR(50), + s.project_code, d.table_groups_id::VARCHAR(50), s.test_suite, d.test_suite_id::VARCHAR, + d.test_type, d.cat_test_id::VARCHAR(50), + d.test_active, + CASE WHEN d.test_active = 'Y' THEN 'Yes' ELSE 'No' END as test_active_display, + d.lock_refresh, + CASE WHEN d.lock_refresh = 'Y' THEN 'Yes' ELSE 'No' END as lock_refresh_display, + t.test_scope, + d.test_description, + d.profiling_as_of_date, + d.last_manual_update, + d.severity, COALESCE(d.severity, s.severity, t.default_severity) as urgency, + d.export_to_observability as export_to_observability_raw, + CASE + WHEN d.export_to_observability = 'Y' THEN 'Yes' + WHEN d.export_to_observability = 'N' THEN 'No' + WHEN d.export_to_observability IS NULL AND s.export_to_observability = 'Y' THEN 'Inherited (Yes)' + ELSE 'Inherited (No)' + END as export_to_observability, + -- test_action, + d.threshold_value, COALESCE(t.measure_uom_description, t.measure_uom) as export_uom, + d.baseline_ct, d.baseline_unique_ct, d.baseline_value, + d.baseline_value_ct, d.baseline_sum, d.baseline_avg, d.baseline_sd, + d.lower_tolerance, d.upper_tolerance, + d.subset_condition, + d.groupby_names, d.having_condition, d.window_date_column, d.window_days, + d.match_schema_name, d.match_table_name, d.match_column_names, + d.match_subset_condition, d.match_groupby_names, d.match_having_condition, + d.skip_errors, d.custom_query, + COALESCE(d.test_description, t.test_description) as final_test_description, + t.default_parm_columns, t.selection_criteria, + d.profile_run_id::VARCHAR(50), d.test_action, d.test_definition_status, + d.watch_level, d.check_result, 
d.last_auto_gen_date, + d.test_mode + FROM test_definitions d + INNER JOIN test_types t ON (d.test_type = t.test_type) + INNER JOIN test_suites s ON (d.test_suite_id = s.id) + WHERE True + {filters} + {order_by} """ - return db.retrieve_data(sql) + results = db_session.execute(sql, params=params) + columns = [column.name for column in results.cursor.description] + + return pd.DataFrame(list(results), columns=columns) def update(schema, test_definition): @@ -126,6 +139,8 @@ def update(schema, test_definition): baseline_sum = NULLIF('{test_definition["baseline_sum"]}', ''), baseline_avg = NULLIF('{test_definition["baseline_avg"]}', ''), baseline_sd = NULLIF('{test_definition["baseline_sd"]}', ''), + lower_tolerance = NULLIF('{test_definition["lower_tolerance"]}', ''), + upper_tolerance = NULLIF('{test_definition["upper_tolerance"]}', ''), subset_condition = NULLIF($${test_definition["subset_condition"]}$$, ''), groupby_names = NULLIF($${test_definition["groupby_names"]}$$, ''), having_condition = NULLIF($${test_definition["having_condition"]}$$, ''), @@ -179,6 +194,8 @@ def add(schema, test_definition): baseline_sum, baseline_avg, baseline_sd, + lower_tolerance, + upper_tolerance, subset_condition, groupby_names, having_condition, @@ -223,6 +240,8 @@ def add(schema, test_definition): NULLIF($${test_definition["baseline_sum"]}$$, '') as baseline_sum, NULLIF('{test_definition["baseline_avg"]}', '') as baseline_avg, NULLIF('{test_definition["baseline_sd"]}', '') as baseline_sd, + NULLIF('{test_definition["lower_tolerance"]}', '') as lower_tolerance, + NULLIF('{test_definition["upper_tolerance"]}', '') as upper_tolerance, NULLIF($${test_definition["subset_condition"]}$$, '') as subset_condition, NULLIF($${test_definition["groupby_names"]}$$, '') as groupby_names, NULLIF($${test_definition["having_condition"]}$$, '') as having_condition, @@ -262,13 +281,21 @@ def cascade_delete(schema, test_suite_ids): st.cache_data.clear() -def move(schema, test_definitions, 
target_table_group, target_test_suite): +def move(schema, test_definitions, target_table_group, target_test_suite, target_table_column=None): +    if target_table_column is not None: +        update_target_table_column = f""" +            column_name = '{target_table_column['column_name']}', +            table_name = '{target_table_column['table_name']}', +        """ +    else: +        update_target_table_column = "" sql = f""" WITH selected as ( SELECT UNNEST(ARRAY [{", ".join([ f"'{td['id']}'" for td in test_definitions ])}]) AS id ) UPDATE {schema}.test_definitions SET + {update_target_table_column} table_groups_id = '{target_table_group}'::UUID, test_suite_id = '{target_test_suite}'::UUID FROM {schema}.test_definitions td @@ -279,7 +306,13 @@ def move(schema, test_definitions, target_table_group, target_test_suite): st.cache_data.clear() -def copy(schema, test_definitions, target_table_group, target_test_suite): +def copy(schema, test_definitions, target_table_group, target_test_suite, target_table_column=None): +    if target_table_column is not None: +        update_target_column = f"'{target_table_column['column_name']}' as column_name" +        update_target_table = f"'{target_table_column['table_name']}' as table_name" +    else: +        update_target_column = "td.column_name" +        update_target_table = "td.table_name" test_definition_ids = [f"'{td['id']}'" for td in test_definitions] sql = f""" INSERT INTO {schema}.test_definitions @@ -314,6 +347,8 @@ def copy(schema, test_definitions, target_table_group, target_test_suite): baseline_sum, baseline_avg, baseline_sd, + lower_tolerance, + upper_tolerance, subset_condition, groupby_names, having_condition, @@ -333,7 +368,7 @@ def copy(schema, test_definitions, target_table_group, target_test_suite): td.custom_query, td.test_definition_status, td.export_to_observability, - td.column_name, + {update_target_column}, td.watch_level, '{target_table_group}'::UUID AS table_groups_id, CASE WHEN td.table_groups_id = '{target_table_group}' THEN td.profile_run_id ELSE NULL END AS profile_run_id,
@@ -345,7 +380,7 @@ def copy(schema, test_definitions, target_table_group, target_test_suite): td.lock_refresh, td.last_auto_gen_date, td.schema_name, - td.table_name, + {update_target_table}, td.test_active, td.severity, td.check_result, @@ -357,6 +392,8 @@ def copy(schema, test_definitions, target_table_group, target_test_suite): td.baseline_sum, td.baseline_avg, td.baseline_sd, + td.lower_tolerance, + td.upper_tolerance, td.subset_condition, td.groupby_names, td.having_condition, diff --git a/testgen/ui/queries/test_run_queries.py b/testgen/ui/queries/test_run_queries.py index 1ad15044..9259f3ed 100644 --- a/testgen/ui/queries/test_run_queries.py +++ b/testgen/ui/queries/test_run_queries.py @@ -8,6 +8,23 @@ from testgen.common.models import get_current_session +def is_running(test_run_id: str | tuple[str]) -> bool: + session = get_current_session() + + test_run_ids: tuple[str] = tuple(test_run_id) + if isinstance(test_run_id, str): + test_run_ids = (test_run_id,) + + query = """ + SELECT id + FROM test_runs + WHERE id::text IN :test_run_ids + AND status = 'Running' + """ + result = session.execute(query, params={"test_run_ids": test_run_ids}) + return result and len(result.all()) > 0 + + def cascade_delete(test_suite_ids: list[str]) -> None: if not test_suite_ids: raise ValueError("No Test Suite is specified.") @@ -28,6 +45,45 @@ def cascade_delete(test_suite_ids: list[str]) -> None: st.cache_data.clear() +def cascade_delete_test_run(test_run_id: str) -> None: + if not test_run_id: + raise ValueError("No Test Run is specified.") + + schema: str = st.session_state["dbschema"] + sql = f""" + DELETE + FROM {schema}.working_agg_cat_results + WHERE test_run_id = '{test_run_id}'; + DELETE + FROM {schema}.working_agg_cat_tests + WHERE test_run_id = '{test_run_id}'; + DELETE FROM {schema}.test_runs WHERE id = '{test_run_id}'; + DELETE FROM {schema}.test_results WHERE test_run_id = '{test_run_id}'; + """ + db.execute_sql(sql) + st.cache_data.clear() + + +def 
cascade_delete_multiple_test_runs(test_run_ids: list[str]) -> None: + if not test_run_ids: + raise ValueError("No Test Run is specified.") + + test_run_ids_str = ", ".join([f"'{run_id}'" for run_id in test_run_ids]) + schema: str = st.session_state["dbschema"] + sql = f""" + DELETE + FROM {schema}.working_agg_cat_results + WHERE test_run_id IN ({test_run_ids_str}); + DELETE + FROM {schema}.working_agg_cat_tests + WHERE test_run_id IN ({test_run_ids_str}); + DELETE FROM {schema}.test_runs WHERE id IN ({test_run_ids_str}); + DELETE FROM {schema}.test_results WHERE test_run_id IN ({test_run_ids_str}); + """ + db.execute_sql(sql) + st.cache_data.clear() + + def update_status(test_run_id: str, status: str) -> None: if not all([test_run_id, status]): raise ValueError("Missing query parameters.") diff --git a/testgen/ui/services/database_service.py b/testgen/ui/services/database_service.py index 98c09ed6..c779948f 100644 --- a/testgen/ui/services/database_service.py +++ b/testgen/ui/services/database_service.py @@ -263,8 +263,7 @@ def _start_target_db_engine(flavor, host, port, db_name, user, password, url, co flavor_service = get_flavor_service(flavor) flavor_service.init(connection_params) connection_string = flavor_service.get_connection_string(password) - connect_args = {"connect_timeout": 3600} - connect_args.update(flavor_service.get_connect_args()) + connect_args = flavor_service.get_connect_args() return create_engine(connection_string, connect_args=connect_args) diff --git a/testgen/ui/services/form_service.py b/testgen/ui/services/form_service.py index ee4bf4a1..b7112f4c 100644 --- a/testgen/ui/services/form_service.py +++ b/testgen/ui/services/form_service.py @@ -1,6 +1,6 @@ +import json import typing from builtins import float -from enum import Enum from pathlib import Path from time import sleep @@ -8,9 +8,7 @@ import streamlit as st from pandas.api.types import is_datetime64_any_dtype from st_aggrid import AgGrid, ColumnsAutoSizeMode, DataReturnMode, 
GridOptionsBuilder, GridUpdateMode, JsCode -from streamlit_extras.no_default_selectbox import selectbox -import testgen.ui.services.database_service as db from testgen.ui.navigation.router import Router """ @@ -21,153 +19,6 @@ help_icon = (Path(__file__).parent.parent / "assets/question_mark.png").as_posix() -class FormWidget(Enum): - text_md = 1 - text_input = 2 - text_area = 3 - number_input = 4 - selectbox = 5 - date_input = 6 - radio = 7 - checkbox = 8 - multiselect = 9 # TODO: implement - hidden = 99 - - -class FieldSpec: - field_label = None - column_name = None - widget = None - value_original = None - init_value = None - display_only = False - required = False - key_order = 0 - - # Entry Options - max_chars = None - num_min = None - num_max = None - text_multi_lines = 3 - - # Selectbox Options - df_options = None - show_column_name = None - return_column_name = None - - # Radio options - lst_option_text: typing.ClassVar = [] - lst_option_values: typing.ClassVar = [] - show_horizontal = True - - value = None - - def __init__( - self, - str_label, - str_column_name, - form_widget, - orig_val=None, - init_val=None, - read_only=False, - required=False, - int_key=0, - max_chars=None, - num_min=None, - num_max=None, - text_multi_lines=3, - ): - self.field_label = str_label - self.column_name = str_column_name - self.value_original = orig_val - self.init_value = init_val if init_val else orig_val - self.widget = form_widget - self.display_only = read_only - self.required = required - self.key_order = int_key - self.max_chars = max_chars - self.num_min = num_min - self.num_max = num_max - self.text_multi_lines = text_multi_lines - - def set_select_choices(self, df_options, str_show_column_name, str_return_column_name): - if self.widget in [FormWidget.selectbox, FormWidget.multiselect]: - self.df_options = df_options - self.show_column_name = str_show_column_name - self.return_column_name = str_return_column_name - else: - raise ValueError(f"Can't set Select Choices 
for widget {self.widget}") - - def render_widget(self, boo_form_display_only=False): - # if either form-level or field-level display-only is true, then widget is display-only - boo_display_only = boo_form_display_only or self.display_only - - match self.widget: - case FormWidget.text_md: - st.markdown(f"**{self.field_label}**") - st.markdown(self.init_value) - - case FormWidget.text_input: - self.value = st.text_input( - label=self.field_label, value=self.init_value, disabled=boo_display_only, max_chars=self.max_chars - ) - - case FormWidget.text_area: - box_height = 26 * self.text_multi_lines - self.value = st.text_area( - label=self.field_label, - value=self.init_value, - disabled=boo_display_only, - max_chars=self.max_chars, - height=box_height, - ) - - case FormWidget.number_input: - self.value = st.number_input( - label=self.field_label, - value=self.init_value, - min_value=self.num_min, - max_value=self.num_max, - disabled=boo_display_only, - ) - - case FormWidget.selectbox: - self.value = render_select( - self.field_label, - self.df_options, - self.show_column_name, - not self.return_column_name, - self.required, - self.init_value, - self.display_only, - ) - - case FormWidget.date_input: - self.value = render_select_date(self.field_label, boo_disabled=boo_display_only) - - case FormWidget.radio: - # If no init_value, or if init_value is None (NULL), the first value will be selected by default - self.value = render_radio( - self.field_label, - self.lst_option_text, - self.lst_option_values if self.lst_option_values else self.lst_option_text, - self.init_value, - boo_display_only, - self.show_horizontal, - ) - - case FormWidget.checkbox: - self.value = render_checkbox( - self.field_label, self.lst_option_values, self.init_value, boo_display_only - ) - - case FormWidget.hidden: - self.value = self.init_value - - case _: - raise ValueError(f"Widget {self.widget} is not supported.") - - def render_refresh_button(button_container): with button_container: 
do_refresh = st.button(":material/refresh:", help="Refresh page data", use_container_width=False) @@ -180,11 +31,6 @@ def show_prompt(str_prompt=None): st.markdown(f":blue[{str_prompt}]") -def show_header(str_header=None): - if str_header: - st.header(f":green[{str_header}]") - - def show_subheader(str_text=None): if str_text: st.subheader(f":green[{str_text}]") @@ -195,61 +41,6 @@ def _show_section_header(str_section_header=None): st.markdown(f":green[**{str_section_header}**]") -def render_form_by_field_specs( - str_form_name, str_table_name, lst_field_specs, str_text_display=None, boo_display_only=False, str_caption=None -): - show_header(str_form_name) - - if str_text_display: - layout_column_1, layout_column_2 = st.columns([0.7, 0.3]) - else: - layout_column_1, layout_column_2 = st.columns([0.95, 0.05]) - - if str_text_display: - with layout_column_2: - st.markdown(str_text_display) - - with layout_column_1: - # Render form - layout_container = st.container() if boo_display_only else st.form(str_form_name, clear_on_submit=True) - with layout_container: - if str_caption: - st.caption(f":green[{str_caption}]") - - # Render all widgets - for field in lst_field_specs: - field.render_widget(boo_display_only) - - submit = ( - False - if boo_display_only - else st.form_submit_button("Save Changes") - ) - - if submit and not boo_display_only: - # Process Results - changes = [] - keys = [] - - # Construct SQL UPDATE statement based on the changed values - lst_field_specs_by_key = sorted(lst_field_specs, key=lambda x: x.key_order) - for field in lst_field_specs_by_key: - if field.key_order > 0: - keys.append(f"{field.column_name} = '{field.value}'") - elif not field.display_only and field.value is None and field.value_original is not None: - changes.append(f"{field.column_name} = NULL") - elif not field.display_only and field.value != field.value_original: - changes.append(f"{field.column_name} = '{field.value}'") - # If there are any changes, construct and run the SQL 
statement - if changes: - str_schema = st.session_state["dbschema"] - str_sql = ( - f"UPDATE {str_schema}.{str_table_name} SET {', '.join(changes)} WHERE {' AND '.join(keys)};" - ) - db.execute_sql(str_sql) - reset_post_updates("Changes have been saved.") - - def ut_prettify_header(str_header, expand=False): # First drop underscores and make title-case str_new = str_header.replace("_", " ").title() @@ -288,64 +79,6 @@ def reset_post_updates(str_message=None, as_toast=False, clear_cache=True, lst_c st.rerun() -def render_select( - str_label, df_options, str_show_column, str_return_column, boo_required=True, str_default=None, boo_disabled=False -): - # Assemble conditional arguments for selectbox - kwargs = {"label": str_label, "options": df_options[str_show_column], "disabled": boo_disabled} - if str_default: - # Conditionally select index based on index of default value - if str_default not in df_options[str_show_column].values: - message = f"Label: {str_label} - Option: {str_default} not available. Click the refresh button." 
- st.markdown(f":orange[{message}]") - else: - kwargs["index"] = int(df_options[df_options[str_show_column] == str_default].index[0]) - str_choice_name = st.selectbox(**kwargs) if boo_required else selectbox(**kwargs) - # Assign return-value from selected show-value - if str_choice_name: - return df_options.loc[df_options[str_show_column] == str_choice_name, str_return_column].iloc[0] - - -def render_select_date(str_label, dt_min_date=None, dt_max_date=None, boo_disabled=False, dt_default=None): - dt_select = st.date_input( - label=str_label, - value=dt_default, - min_value=dt_min_date, - max_value=dt_max_date, - format="YYYY-MM-DD", - disabled=boo_disabled, - ) - return dt_select - - -def render_radio( - str_label, lst_option_text, lst_option_values=None, init_value=None, boo_disabled=False, boo_horizontal=True -): - if init_value: - # Lookup index for init value - i = next((i for i, x in enumerate(lst_option_values) if x == init_value), -1) - i = i if i > 0 else 0 - else: - # If no init_value, or if init_value is None (NULL), the first value will be selected by default - i = 0 - str_choice_text = st.radio( - str_label, options=lst_option_text, index=i, disabled=boo_disabled, horizontal=boo_horizontal - ) - if lst_option_values: - # Lookup choice -- get value - i = next((i for i, x in enumerate(lst_option_text) if x == str_choice_text), -1) - val_select = lst_option_values[i] - else: - val_select = str_choice_text - - return val_select - - -def render_checkbox(str_label, lst_true_false_values, boo_init_state=False, boo_disabled=False): - boo_value = st.checkbox(str_label, boo_init_state, disabled=boo_disabled) - return lst_true_false_values[0] if boo_value else lst_true_false_values[1] - - def render_html_list(dct_row, lst_columns, str_section_header=None, int_data_width=300, lst_labels=None): # Renders sets of values as vertical markdown list @@ -503,6 +236,7 @@ def render_grid_select( rendering_counter = st.session_state.get(f"{key}_counter") or 0 
previous_dataframe = st.session_state.get(f"{key}_dataframe") + df = df.copy() if previous_dataframe is not None: data_changed = not df.equals(previous_dataframe) @@ -599,4 +333,13 @@ def render_grid_select( if len(selected_rows) > 0: if bind_to_query_name and bind_to_query_prop: Router().set_query_params({bind_to_query_name: selected_rows[0][bind_to_query_prop]}) + + # We need to get the data from the original dataframe + # Otherwise changes to the dataframe (e.g., editing the current selection) do not get reflected in the returned rows + # Adding "modelUpdated" to AgGrid(update_on=...) does not work + # because it causes unnecessary reruns that cause dialogs to close abruptly + selected_props = [row[bind_to_query_prop] for row in selected_rows] + selected_df = df[df[bind_to_query_prop].isin(selected_props)] + selected_rows = json.loads(selected_df.to_json(orient="records")) + return selected_rows diff --git a/testgen/ui/services/hygiene_issues_service.py b/testgen/ui/services/hygiene_issues_service.py index 71a24fe7..3fba8539 100644 --- a/testgen/ui/services/hygiene_issues_service.py +++ b/testgen/ui/services/hygiene_issues_service.py @@ -4,7 +4,7 @@ from testgen.ui.services import database_service as db -def get_source_data(hi_data): +def get_source_data(hi_data, limit: int | None = None): str_schema = st.session_state["dbschema"] # Define the query str_sql = f""" @@ -83,6 +83,8 @@ def replace_parms(str_query): if df.empty: return "ND", "Data that violates Hygiene Issue criteria is not present in the current dataset.", str_sql, None else: + if limit: + df = df.sample(n=min(len(df), limit)).sort_index() return "OK", None, str_sql, df else: return "NA", "Source data lookup is not available for this Issue.", None, None diff --git a/testgen/ui/services/table_group_service.py b/testgen/ui/services/table_group_service.py index 98784fe1..ca6df20a 100644 --- a/testgen/ui/services/table_group_service.py +++ b/testgen/ui/services/table_group_service.py @@ -17,6 +17,11 @@ 
def get_by_connection(project_code, connection_id): return table_group_queries.get_by_connection(schema, project_code, connection_id) +def get_all(project_code: str): + schema = st.session_state["dbschema"] + return table_group_queries.get_all(schema, project_code) + + def edit(table_group): schema = st.session_state["dbschema"] table_group_queries.edit(schema, table_group) @@ -54,7 +59,7 @@ def table_group_has_dependencies(table_group_names): return not table_group_queries.get_table_group_dependencies(schema, table_group_names).empty -def are_table_groups_in_use(table_group_names): +def are_table_groups_in_use(table_group_names: list[str]): if not table_group_names: return False @@ -69,6 +74,14 @@ def are_table_groups_in_use(table_group_names): return test_suites_in_use or table_groups_in_use +def is_table_group_used(table_group_id: str) -> bool: + schema = st.session_state["dbschema"] + test_suite_ids = table_group_queries.get_test_suite_ids_by_table_group_id(schema, table_group_id) + profiling_run_ids = table_group_queries.get_profiling_run_ids_by_table_group_id(schema, table_group_id) + + return len(test_suite_ids) + len(profiling_run_ids) > 0 + + def get_test_suite_ids_by_table_group_names(table_group_names): if not table_group_names: return [] diff --git a/testgen/ui/services/test_definition_service.py b/testgen/ui/services/test_definition_service.py index 452e7cda..7c4eca29 100644 --- a/testgen/ui/services/test_definition_service.py +++ b/testgen/ui/services/test_definition_service.py @@ -24,21 +24,38 @@ def get_test_definitions( def get_test_definition(db_schema, test_def_id): str_sql = f""" - SELECT d.id::VARCHAR, tt.test_name_short as test_name, tt.test_name_long as full_name, - tt.test_description as description, tt.usage_notes, - d.column_name, - d.baseline_value, d.baseline_ct, d.baseline_avg, d.baseline_sd, d.threshold_value, - d.subset_condition, d.groupby_names, d.having_condition, d.match_schema_name, - d.match_table_name, d.match_column_names,
d.match_subset_condition, - d.match_groupby_names, d.match_having_condition, - d.window_date_column, d.window_days::VARCHAR as window_days, - d.custom_query, - d.severity, tt.default_severity, - d.test_active, d.lock_refresh, d.last_manual_update - FROM {db_schema}.test_definitions d - INNER JOIN {db_schema}.test_types tt - ON (d.test_type = tt.test_type) - WHERE d.id = '{test_def_id}'; + SELECT + d.id::VARCHAR, + tg.table_group_schema as schema_name, + ts.test_suite as test_suite_name, + d.export_to_observability as export_to_observability, + ts.export_to_observability as default_export_to_observability, + tt.test_name_short as test_name, + tt.test_name_long as full_name, + tt.test_description as description, + d.test_definition_status as status, + tt.usage_notes, + d.table_name, + d.column_name, + d.baseline_value, d.baseline_ct, d.baseline_unique_ct, d.baseline_value_ct, + d.baseline_avg, d.baseline_sd, d.threshold_value, d.baseline_sum, + d.lower_tolerance, d.upper_tolerance, + d.subset_condition, d.groupby_names, d.having_condition, d.match_schema_name, + d.match_table_name, d.match_column_names, d.match_subset_condition, + d.match_groupby_names, d.match_having_condition, + d.window_date_column, d.window_days::VARCHAR as window_days, + d.custom_query, d.test_mode, + d.severity, tt.default_severity, + d.test_active, d.lock_refresh, d.last_manual_update, + tt.default_parm_prompts, tt.default_parm_columns, tt.default_parm_help + FROM {db_schema}.test_definitions d + INNER JOIN {db_schema}.test_types tt + ON (d.test_type = tt.test_type) + INNER JOIN {db_schema}.test_suites ts + ON (ts.id = d.test_suite_id) + INNER JOIN {db_schema}.table_groups tg + ON (tg.id = d.table_groups_id) + WHERE d.id = '{test_def_id}'; """ return database_service.retrieve_data(str_sql) @@ -135,15 +152,15 @@ def validate_test(test_definition): ) -def move(test_definitions, target_table_group, target_test_suite): +def move(test_definitions, target_table_group, target_test_suite, 
target_table_column=None): schema = st.session_state["dbschema"] - test_definition_queries.move(schema, test_definitions, target_table_group, target_test_suite) + test_definition_queries.move(schema, test_definitions, target_table_group, target_test_suite, target_table_column) -def copy(test_definitions, target_table_group, target_test_suite): +def copy(test_definitions, target_table_group, target_test_suite, target_table_column=None): schema = st.session_state["dbschema"] - test_definition_queries.copy(schema, test_definitions, target_table_group, target_test_suite) + test_definition_queries.copy(schema, test_definitions, target_table_group, target_test_suite, target_table_column) def get_test_definitions_collision(test_definitions, target_table_group, target_test_suite): diff --git a/testgen/ui/services/test_results_service.py b/testgen/ui/services/test_results_service.py index 7f2d886b..2cdf327d 100644 --- a/testgen/ui/services/test_results_service.py +++ b/testgen/ui/services/test_results_service.py @@ -1,135 +1,154 @@ import pandas as pd from testgen.common import ConcatColumnList +from testgen.common.models import get_current_session, with_database_session from testgen.common.read_file import replace_templated_functions from testgen.ui.services import database_service as db from testgen.ui.services.string_service import empty_if_null from testgen.ui.services.test_definition_service import get_test_definition +@with_database_session def get_test_results( - schema: str, + _: str, run_id: str, - test_status: str | None = None, + test_status: str | list[str] | None = None, test_type_id: str | None = None, table_name: str | None = None, column_name: str | None = None, sorting_columns: list[str] | None = None, ) -> pd.DataFrame: # First visible row first, so multi-select checkbox will render + db_session = get_current_session() + params = {"run_id": run_id} + order_by = "ORDER BY " + (", ".join(" ".join(col) for col in sorting_columns)) if sorting_columns else "" 
filters = "" if test_status: - filters += f" AND r.result_status IN ({test_status})" + if isinstance(test_status, str): + test_status = [status.strip() for status in test_status.split(",")] + test_status_params = {f"test_status_{idx}": status for idx, status in enumerate(test_status)} + + filters += f" AND r.result_status IN ({', '.join([f':{p}' for p in test_status_params.keys()])})" + params.update(test_status_params) if test_type_id: - filters += f" AND r.test_type = '{test_type_id}'" + filters += " AND r.test_type = :test_type_id" + params["test_type_id"] = test_type_id if table_name: - filters += f" AND r.table_name = '{table_name}'" + filters += " AND r.table_name = :table_name" + params["table_name"] = table_name if column_name: - filters += f" AND r.column_names = '{column_name}'" + filters += " AND r.column_names ILIKE :column_name" + params["column_name"] = column_name sql = f""" - WITH run_results - AS (SELECT * - FROM {schema}.test_results r - WHERE - r.test_run_id = '{run_id}' - {filters} - ) - SELECT r.table_name, - p.project_name, ts.test_suite, tg.table_groups_name, cn.connection_name, cn.project_host, cn.sql_flavor, - tt.dq_dimension, tt.test_scope, - r.schema_name, r.column_names, r.test_time::DATE as test_date, r.test_type, tt.id as test_type_id, - tt.test_name_short, tt.test_name_long, r.test_description, tt.measure_uom, tt.measure_uom_description, - c.test_operator, r.threshold_value::NUMERIC(16, 5), r.result_measure::NUMERIC(16, 5), r.result_status, - CASE - WHEN r.result_code <> 1 THEN r.disposition - ELSE 'Passed' - END as disposition, - NULL::VARCHAR(1) as action, - r.input_parameters, r.result_message, CASE WHEN result_code <> 1 THEN r.severity END as severity, - r.result_code as passed_ct, - (1 - r.result_code)::INTEGER as exception_ct, - CASE - WHEN result_status = 'Warning' - AND result_message NOT ILIKE 'Inactivated%%' THEN 1 - END::INTEGER as warning_ct, - CASE - WHEN result_status = 'Failed' - AND result_message NOT ILIKE 
'Inactivated%%' THEN 1 - END::INTEGER as failed_ct, - CASE - WHEN result_message ILIKE 'Inactivated%%' THEN 1 - END as execution_error_ct, - p.project_code, r.table_groups_id::VARCHAR, - r.id::VARCHAR as test_result_id, r.test_run_id::VARCHAR, - c.id::VARCHAR as connection_id, r.test_suite_id::VARCHAR, - r.test_definition_id::VARCHAR as test_definition_id_runtime, - CASE - WHEN r.auto_gen = TRUE THEN d.id - ELSE r.test_definition_id - END::VARCHAR as test_definition_id_current, - r.auto_gen, - - -- These are used in the PDF report - tt.threshold_description, tt.usage_notes, r.test_time, - dcc.description as column_description, - COALESCE(dcc.critical_data_element, dtc.critical_data_element) as critical_data_element, - COALESCE(dcc.data_source, dtc.data_source, tg.data_source) as data_source, - COALESCE(dcc.source_system, dtc.source_system, tg.source_system) as source_system, - COALESCE(dcc.source_process, dtc.source_process, tg.source_process) as source_process, - COALESCE(dcc.business_domain, dtc.business_domain, tg.business_domain) as business_domain, - COALESCE(dcc.stakeholder_group, dtc.stakeholder_group, tg.stakeholder_group) as stakeholder_group, - COALESCE(dcc.transform_level, dtc.transform_level, tg.transform_level) as transform_level, - COALESCE(dcc.aggregation_level, dtc.aggregation_level) as aggregation_level, - COALESCE(dcc.data_product, dtc.data_product, tg.data_product) as data_product - - FROM run_results r - INNER JOIN {schema}.test_types tt - ON (r.test_type = tt.test_type) - LEFT JOIN {schema}.test_definitions rd - ON (r.test_definition_id = rd.id) - LEFT JOIN {schema}.test_definitions d - ON (r.test_suite_id = d.test_suite_id - AND r.table_name = d.table_name - AND r.column_names = COALESCE(d.column_name, 'N/A') - AND r.test_type = d.test_type - AND r.auto_gen = TRUE - AND d.last_auto_gen_date IS NOT NULL) - INNER JOIN {schema}.test_suites ts - ON r.test_suite_id = ts.id - INNER JOIN {schema}.projects p - ON (ts.project_code = p.project_code) - 
INNER JOIN {schema}.table_groups tg - ON (ts.table_groups_id = tg.id) - INNER JOIN {schema}.connections cn - ON (tg.connection_id = cn.connection_id) - LEFT JOIN {schema}.cat_test_conditions c - ON (cn.sql_flavor = c.sql_flavor - AND r.test_type = c.test_type) - LEFT JOIN {schema}.data_column_chars dcc - ON (tg.id = dcc.table_groups_id - AND r.schema_name = dcc.schema_name - AND r.table_name = dcc.table_name - AND r.column_names = dcc.column_name) - LEFT JOIN {schema}.data_table_chars dtc - ON dcc.table_id = dtc.table_id - {order_by} ; + WITH run_results AS ( + SELECT * + FROM test_results r + WHERE r.test_run_id = :run_id + {filters} + ) + SELECT r.table_name, + p.project_name, ts.test_suite, tg.table_groups_name, cn.connection_name, cn.project_host, cn.sql_flavor, + tt.dq_dimension, tt.test_scope, + r.schema_name, r.column_names, r.test_time::DATE as test_date, r.test_type, tt.id as test_type_id, + tt.test_name_short, tt.test_name_long, r.test_description, tt.measure_uom, tt.measure_uom_description, + c.test_operator, r.threshold_value::NUMERIC(16, 5), r.result_measure::NUMERIC(16, 5), r.result_status, + CASE + WHEN r.result_code <> 1 THEN r.disposition + ELSE 'Passed' + END as disposition, + NULL::VARCHAR(1) as action, + r.input_parameters, r.result_message, CASE WHEN result_code <> 1 THEN r.severity END as severity, + r.result_code as passed_ct, + (1 - r.result_code)::INTEGER as exception_ct, + CASE + WHEN result_status = 'Warning' + AND result_message NOT ILIKE 'Inactivated%%' THEN 1 + END::INTEGER as warning_ct, + CASE + WHEN result_status = 'Failed' + AND result_message NOT ILIKE 'Inactivated%%' THEN 1 + END::INTEGER as failed_ct, + CASE + WHEN result_message ILIKE 'Inactivated%%' THEN 1 + END as execution_error_ct, + p.project_code, r.table_groups_id::VARCHAR, + r.id::VARCHAR as test_result_id, r.test_run_id::VARCHAR, + c.id::VARCHAR as connection_id, r.test_suite_id::VARCHAR, + r.test_definition_id::VARCHAR as test_definition_id_runtime, + CASE + WHEN 
r.auto_gen = TRUE THEN d.id + ELSE r.test_definition_id + END::VARCHAR as test_definition_id_current, + r.auto_gen, + + -- These are used in the PDF report + tt.threshold_description, tt.usage_notes, r.test_time, + dcc.description as column_description, + COALESCE(dcc.critical_data_element, dtc.critical_data_element) as critical_data_element, + COALESCE(dcc.data_source, dtc.data_source, tg.data_source) as data_source, + COALESCE(dcc.source_system, dtc.source_system, tg.source_system) as source_system, + COALESCE(dcc.source_process, dtc.source_process, tg.source_process) as source_process, + COALESCE(dcc.business_domain, dtc.business_domain, tg.business_domain) as business_domain, + COALESCE(dcc.stakeholder_group, dtc.stakeholder_group, tg.stakeholder_group) as stakeholder_group, + COALESCE(dcc.transform_level, dtc.transform_level, tg.transform_level) as transform_level, + COALESCE(dcc.aggregation_level, dtc.aggregation_level) as aggregation_level, + COALESCE(dcc.data_product, dtc.data_product, tg.data_product) as data_product + + FROM run_results r + INNER JOIN test_types tt + ON (r.test_type = tt.test_type) + LEFT JOIN test_definitions rd + ON (r.test_definition_id = rd.id) + LEFT JOIN test_definitions d + ON (r.test_suite_id = d.test_suite_id + AND r.table_name = d.table_name + AND COALESCE(r.column_names, 'N/A') = COALESCE(d.column_name, 'N/A') + AND r.test_type = d.test_type + AND r.auto_gen = TRUE + AND d.last_auto_gen_date IS NOT NULL) + INNER JOIN test_suites ts + ON r.test_suite_id = ts.id + INNER JOIN projects p + ON (ts.project_code = p.project_code) + INNER JOIN table_groups tg + ON (ts.table_groups_id = tg.id) + INNER JOIN connections cn + ON (tg.connection_id = cn.connection_id) + LEFT JOIN cat_test_conditions c + ON (cn.sql_flavor = c.sql_flavor + AND r.test_type = c.test_type) + LEFT JOIN data_column_chars dcc + ON (tg.id = dcc.table_groups_id + AND r.schema_name = dcc.schema_name + AND r.table_name = dcc.table_name + AND r.column_names = 
dcc.column_name) + LEFT JOIN data_table_chars dtc + ON dcc.table_id = dtc.table_id + {order_by} """ - df = db.retrieve_data(sql) - # Clean Up + results = db_session.execute(sql, params=params) + columns = [column.name for column in results.cursor.description] + + df = pd.DataFrame(list(results), columns=columns) df["test_date"] = pd.to_datetime(df["test_date"]) return df -def get_test_result_history(db_schema, tr_data): +def get_test_result_history(db_schema, tr_data, limit: int | None = None): if tr_data["auto_gen"]: + if tr_data["column_names"]: + col_name_cond = f"column_names = '{tr_data["column_names"]}'" + else: + col_name_cond = "column_names IS NULL" + str_where = f""" WHERE test_suite_id = '{tr_data["test_suite_id"]}' AND table_name = '{tr_data["table_name"]}' - AND column_names = '{tr_data["column_names"]}' + AND {col_name_cond} AND test_type = '{tr_data["test_type"]}' AND auto_gen = TRUE """ @@ -143,7 +162,8 @@ def get_test_result_history(db_schema, tr_data): test_name_short, test_name_long, measure_uom, test_operator, threshold_value::NUMERIC, result_measure, result_status FROM {db_schema}.v_test_results {str_where} - ORDER BY test_date DESC; + ORDER BY test_date DESC + {'LIMIT ' + str(limit) if limit else ''}; """ df = db.retrieve_data(str_sql) @@ -153,7 +173,7 @@ def get_test_result_history(db_schema, tr_data): return df -def do_source_data_lookup_custom(db_schema, tr_data): +def do_source_data_lookup_custom(db_schema, tr_data, limit: int | None = None): # Define the query str_sql = f""" SELECT d.custom_query as lookup_query, tg.table_group_schema, @@ -193,6 +213,8 @@ def do_source_data_lookup_custom(db_schema, tr_data): if df.empty: return "ND", "Data that violates Test criteria is not present in the current dataset.", str_sql, None else: + if limit: + df = df.sample(n=min(len(df), limit)).sort_index() return "OK", None, str_sql, df else: return "NA", "Source data lookup is not available for this test.", None, None @@ -201,7 +223,7 @@ def 
do_source_data_lookup_custom(db_schema, tr_data): return "ERR", f"Source data lookup query caused an error:\n\n{e.args[0]}", str_sql, None -def do_source_data_lookup(db_schema, tr_data, sql_only=False): +def do_source_data_lookup(db_schema, tr_data, sql_only=False, limit: int | None = None): # Define the query str_sql = f""" SELECT t.lookup_query, tg.table_group_schema, @@ -235,6 +257,8 @@ def replace_parms(df_test, str_query): str_query = str_query.replace("{BASELINE_AVG}", empty_if_null(df_test.at[0, "baseline_avg"])) str_query = str_query.replace("{BASELINE_SD}", empty_if_null(df_test.at[0, "baseline_sd"])) str_query = str_query.replace("{THRESHOLD_VALUE}", empty_if_null(df_test.at[0, "threshold_value"])) + str_query = str_query.replace("{LOWER_TOLERANCE}", empty_if_null(df_test.at[0, "lower_tolerance"])) + str_query = str_query.replace("{UPPER_TOLERANCE}", empty_if_null(df_test.at[0, "upper_tolerance"])) str_substitute = empty_if_null(df_test.at[0, "subset_condition"]) str_substitute = "1=1" if str_substitute == "" else str_substitute @@ -298,6 +322,8 @@ def replace_parms(df_test, str_query): if df.empty: return "ND", "Data that violates Test criteria is not present in the current dataset.", str_sql, None else: + if limit: + df = df.sample(n=min(len(df), limit)).sort_index() return "OK", None, str_sql, df else: return "NA", "A source data lookup for this Test is not available.", None, None diff --git a/testgen/ui/session.py b/testgen/ui/session.py index b82cbc21..4ac634ed 100644 --- a/testgen/ui/session.py +++ b/testgen/ui/session.py @@ -1,5 +1,11 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Literal, TypeVar + +if TYPE_CHECKING: + from testgen.common.version_service import Version + from collections.abc import Callable -from typing import Any, Literal, TypeVar import streamlit as st from streamlit.runtime.state import SessionStateProxy @@ -31,7 +37,7 @@ class TestgenSession(Singleton): sidebar_project: str add_project: bool - 
latest_version: str | None + version: Version | None testgen_event_id: str | None sidebar_event_id: str | None diff --git a/testgen/ui/views/connections.py b/testgen/ui/views/connections.py index b30029ef..21c76ea4 100644 --- a/testgen/ui/views/connections.py +++ b/testgen/ui/views/connections.py @@ -28,15 +28,6 @@ CLEAR_SENTINEL = "" -@dataclass(frozen=True, slots=True, kw_only=True) -class ConnectionFlavor: - value: str - label: str - icon: str - flavor: str - connection_string: str - - class ConnectionsPage(Page): path = "connections" can_activate: typing.ClassVar = [ @@ -48,60 +39,9 @@ class ConnectionsPage(Page): icon="database", label=PAGE_TITLE, section="Data Configuration", - order=0, + order=1, roles=[ role for role in typing.get_args(user_session_service.RoleType) if role != "catalog" ], ) - flavor_options: typing.ClassVar[list[ConnectionFlavor]] = [ - ConnectionFlavor( - label="Amazon Redshift", - value="redshift", - flavor="redshift", - icon=get_asset_data_url("flavors/redshift.svg"), - connection_string=connection_service.get_connection_string("redshift"), - ), - ConnectionFlavor( - label="Azure SQL Database", - value="azure_mssql", - flavor="mssql", - icon=get_asset_data_url("flavors/azure_sql.svg"), - connection_string=connection_service.get_connection_string("mssql"), - ), - ConnectionFlavor( - label="Azure Synapse Analytics", - value="synapse_mssql", - flavor="mssql", - icon=get_asset_data_url("flavors/azure_synapse_table.svg"), - connection_string=connection_service.get_connection_string("mssql"), - ), - ConnectionFlavor( - label="Microsoft SQL Server", - value="mssql", - flavor="mssql", - icon=get_asset_data_url("flavors/mssql.svg"), - connection_string=connection_service.get_connection_string("mssql"), - ), - ConnectionFlavor( - label="PostgreSQL", - value="postgresql", - flavor="postgresql", - icon=get_asset_data_url("flavors/postgresql.svg"), - connection_string=connection_service.get_connection_string("postgresql"), - ), - ConnectionFlavor( - 
label="Snowflake", - value="snowflake", - flavor="snowflake", - icon=get_asset_data_url("flavors/snowflake.svg"), - connection_string=connection_service.get_connection_string("snowflake"), - ), - ConnectionFlavor( - label="Databricks", - value="databricks", - flavor="databricks", - icon=get_asset_data_url("flavors/databricks.svg"), - connection_string=connection_service.get_connection_string("databricks"), - ), - ] def render(self, project_code: str, **_kwargs) -> None: testgen.page_header( @@ -201,9 +141,10 @@ def on_test_connection_clicked(updated_connection: dict) -> None: return testgen.testgen_component( "connections", props={ + "project_code": project_code, "connection": self._format_connection(connection, should_test=should_check_status()), "has_table_groups": has_table_groups, - "flavors": [asdict(flavor) for flavor in self.flavor_options], + "flavors": [asdict(flavor) for flavor in FLAVOR_OPTIONS], "permissions": { "is_admin": user_is_admin, }, @@ -216,56 +157,16 @@ def on_test_connection_clicked(updated_connection: dict) -> None: }, ) - def _get_sql_flavor_from_value(self, value: str) -> ConnectionFlavor | None: - match = [f for f in self.flavor_options if f.value == value] + def _get_sql_flavor_from_value(self, value: str) -> "ConnectionFlavor | None": + match = [f for f in FLAVOR_OPTIONS if f.value == value] if match: return match[0] return None def _format_connection(self, connection: dict, should_test: bool = False) -> dict: - fields = [ - "project_code", - "connection_id", - "connection_name", - "sql_flavor", - "sql_flavor_code", - "project_host", - "project_port", - "project_db", - "project_user", - "password", - "max_threads", - "max_query_chars", - "connect_by_url", - "connect_by_key", - "private_key", - "private_key_passphrase", - "http_path", - "url", - ] - formatted_connection = {} - - for fieldname in fields: - formatted_connection[fieldname] = format_field(connection[fieldname]) - + formatted_connection = format_connection(connection) if 
should_test: formatted_connection["status"] = asdict(self.test_connection(connection)) - - if formatted_connection["password"]: - formatted_connection["password"] = "***" # noqa S105 - if formatted_connection["private_key"]: - formatted_connection["private_key"] = "***" # S105 - if formatted_connection["private_key_passphrase"]: - formatted_connection["private_key_passphrase"] = "***" # noqa S105 - - first_match = [f for f in self.flavor_options if f.flavor == formatted_connection.get("sql_flavor")] - if formatted_connection["sql_flavor"] and not formatted_connection.get("sql_flavor_code") and first_match: - formatted_connection["sql_flavor_code"] = first_match[0].flavor - - flavors = [f for f in self.flavor_options if f.value == formatted_connection["sql_flavor_code"]] - if flavors and (flavor := flavors[0]): - formatted_connection["flavor"] = asdict(flavor) - return formatted_connection def test_connection(self, connection: dict) -> "ConnectionStatus": @@ -415,3 +316,105 @@ def is_open_ssl_error(error: Exception): and len(error.args[1]) > 0 and type(error.args[1][0]).__name__ == "OpenSSLError" ) + + +def format_connection(connection: dict) -> dict: + fields = [ + "project_code", + "connection_id", + "connection_name", + "sql_flavor", + "sql_flavor_code", + "project_host", + "project_port", + "project_db", + "project_user", + "password", + "max_threads", + "max_query_chars", + "connect_by_url", + "connect_by_key", + "private_key", + "private_key_passphrase", + "http_path", + "url", + ] + formatted_connection = {} + + for fieldname in fields: + formatted_connection[fieldname] = format_field(connection[fieldname]) + + if formatted_connection["password"]: + formatted_connection["password"] = "***" # noqa S105 + if formatted_connection["private_key"]: + formatted_connection["private_key"] = "***" # S105 + if formatted_connection["private_key_passphrase"]: + formatted_connection["private_key_passphrase"] = "***" # noqa S105 + + flavors = [f for f in FLAVOR_OPTIONS if 
f.value == formatted_connection["sql_flavor_code"]] + if flavors and (flavor := flavors[0]): + formatted_connection["flavor"] = asdict(flavor) + + return formatted_connection + + +@dataclass(frozen=True, slots=True, kw_only=True) +class ConnectionFlavor: + value: str + label: str + icon: str + flavor: str + connection_string: str + + +FLAVOR_OPTIONS = [ + ConnectionFlavor( + label="Amazon Redshift", + value="redshift", + flavor="redshift", + icon=get_asset_data_url("flavors/redshift.svg"), + connection_string=connection_service.get_connection_string("redshift"), + ), + ConnectionFlavor( + label="Azure SQL Database", + value="azure_mssql", + flavor="mssql", + icon=get_asset_data_url("flavors/azure_sql.svg"), + connection_string=connection_service.get_connection_string("mssql"), + ), + ConnectionFlavor( + label="Azure Synapse Analytics", + value="synapse_mssql", + flavor="mssql", + icon=get_asset_data_url("flavors/azure_synapse_table.svg"), + connection_string=connection_service.get_connection_string("mssql"), + ), + ConnectionFlavor( + label="Microsoft SQL Server", + value="mssql", + flavor="mssql", + icon=get_asset_data_url("flavors/mssql.svg"), + connection_string=connection_service.get_connection_string("mssql"), + ), + ConnectionFlavor( + label="PostgreSQL", + value="postgresql", + flavor="postgresql", + icon=get_asset_data_url("flavors/postgresql.svg"), + connection_string=connection_service.get_connection_string("postgresql"), + ), + ConnectionFlavor( + label="Snowflake", + value="snowflake", + flavor="snowflake", + icon=get_asset_data_url("flavors/snowflake.svg"), + connection_string=connection_service.get_connection_string("snowflake"), + ), + ConnectionFlavor( + label="Databricks", + value="databricks", + flavor="databricks", + icon=get_asset_data_url("flavors/databricks.svg"), + connection_string=connection_service.get_connection_string("databricks"), + ), +] diff --git a/testgen/ui/views/data_catalog.py b/testgen/ui/views/data_catalog.py index 
8eff9142..52bd1b68 100644 --- a/testgen/ui/views/data_catalog.py +++ b/testgen/ui/views/data_catalog.py @@ -27,8 +27,11 @@ TAG_FIELDS, get_column_by_id, get_columns_by_id, + get_columns_by_table_group, get_hygiene_issues, get_table_by_id, + get_tables_by_id, + get_tables_by_table_group, ) from testgen.ui.services import user_session_service from testgen.ui.session import session, temp_value @@ -117,10 +120,10 @@ def render(self, project_code: str, table_group_id: str | None = None, selected: ), "TableGroupSelected": on_table_group_selected, "ItemSelected": on_item_selected, - "ExportClicked": lambda columns: download_dialog( + "ExportClicked": lambda items: download_dialog( dialog_title="Download Excel Report", file_content_func=get_excel_report_data, - args=(selected_table_group["table_groups_name"], columns), + args=(selected_table_group, items), ), "RemoveTableClicked": remove_table_dialog, "DataPreviewClicked": lambda item: data_preview_dialog( @@ -149,12 +152,37 @@ def on_item_selected(item_id: str | None) -> None: Router().set_query_params({ "selected": item_id }) -def get_excel_report_data(update_progress: PROGRESS_UPDATE_TYPE, table_group: str, columns: list[str]) -> None: - data = get_columns_by_id( - [ col.split("_")[1] for col in columns ], - include_tags=True, - ) - data = pd.DataFrame(data) +class ExportItem(typing.TypedDict): + id: str + type: typing.Literal["table", "column"] + +def get_excel_report_data(update_progress: PROGRESS_UPDATE_TYPE, table_group: dict, items: list[ExportItem] | None) -> None: + if items: + table_data = get_tables_by_id( + table_ids=[ item["id"] for item in items if item["type"] == "table" ], + include_tags=True, + include_active_tests=True, + ) + column_data = get_columns_by_id( + column_ids=[ item["id"] for item in items if item["type"] == "column" ], + include_tags=True, + include_active_tests=True, + ) + else: + table_data = get_tables_by_table_group( + table_group["id"], + include_tags=True, + include_active_tests=True, 
+ ) + column_data = get_columns_by_table_group( + table_group["id"], + include_tags=True, + include_active_tests=True, + ) + + + data = pd.DataFrame(table_data + column_data) + data = data.sort_values(by=["table_name", "ordinal_position"], na_position="first") for key in ["column_type", "datatype_suggestion"]: data[key] = data[key].apply(lambda val: val.lower() if not pd.isna(val) else None) @@ -169,7 +197,7 @@ def get_excel_report_data(update_progress: PROGRESS_UPDATE_TYPE, table_group: st for key in ["data_source", "source_system", "source_process", "business_domain", "stakeholder_group", "transform_level", "aggregation_level", "data_product"]: data[key] = data.apply( - lambda col: col[key] or col[f"table_{key}"] or col.get(f"table_group_{key}"), + lambda row: row[key] or row[f"table_{key}"] or row.get(f"table_group_{key}"), axis=1, ) @@ -177,17 +205,17 @@ def get_excel_report_data(update_progress: PROGRESS_UPDATE_TYPE, table_group: st data["general_type"] = data["general_type"].apply(lambda val: type_map.get(val)) data["critical_data_element"] = data.apply( - lambda col: "Yes" if col["critical_data_element"] or col["table_critical_data_element"] else None, + lambda row: "Yes" if row["critical_data_element"] == True or row["table_critical_data_element"] == True else None, axis=1, ) data["top_freq_values"] = data["top_freq_values"].apply( lambda val: "\n".join([ f"{part.split(" | ")[1]} | {part.split(" | ")[0]}" for part in val[2:].split("\n| ") ]) - if val + if not pd.isna(val) else None ) data["top_patterns"] = data["top_patterns"].apply( lambda val: "".join([ f"{part}{'\n' if index % 2 else ' | '}" for index, part in enumerate(val.split(" | ")) ]) - if val + if not pd.isna(val) else None ) @@ -196,6 +224,7 @@ def get_excel_report_data(update_progress: PROGRESS_UPDATE_TYPE, table_group: st "table_name": {"header": "Table"}, "column_name": {"header": "Column"}, "critical_data_element": {}, + "active_test_count": {"header": "Active tests"}, "ordinal_position": 
{"header": "Position"}, "general_type": {}, "column_type": {"header": "Data type"}, @@ -261,7 +290,7 @@ def get_excel_report_data(update_progress: PROGRESS_UPDATE_TYPE, table_group: st return get_excel_file_data( data, "Data Catalog Columns", - details={"Table group": table_group}, + details={"Table group": table_group["table_groups_name"]}, columns=file_columns, update_progress=update_progress, ) @@ -406,7 +435,7 @@ def get_selected_item(selected: str, table_group_id: str) -> dict | None: item_type, item_id = selected.split("_", 2) if item_type == "table": - item = get_table_by_id(item_id) + item = get_table_by_id(item_id, include_tags=True, include_has_test_runs=True, include_scores=True) elif item_type == "column": item = get_column_by_id(item_id, include_tags=True, include_has_test_runs=True, include_scores=True) else: diff --git a/testgen/ui/views/dialogs/data_preview_dialog.py b/testgen/ui/views/dialogs/data_preview_dialog.py index 6911c3d6..9d5beaea 100644 --- a/testgen/ui/views/dialogs/data_preview_dialog.py +++ b/testgen/ui/views/dialogs/data_preview_dialog.py @@ -20,7 +20,8 @@ def data_preview_dialog( f"Table: {table_name}" ) - data = get_preview_data(table_group_id, schema_name, table_name, column_name) + with st.spinner("Loading data ..."): + data = get_preview_data(table_group_id, schema_name, table_name, column_name) if data.empty: st.warning("The preview data could not be loaded.") @@ -32,7 +33,7 @@ def data_preview_dialog( ) -@st.cache_data(show_spinner="Loading data ...") +@st.cache_data(show_spinner=False) def get_preview_data( table_group_id: str, schema_name: str, diff --git a/testgen/ui/views/dialogs/run_profiling_dialog.py b/testgen/ui/views/dialogs/run_profiling_dialog.py index 4250f1e7..1b6cf22f 100644 --- a/testgen/ui/views/dialogs/run_profiling_dialog.py +++ b/testgen/ui/views/dialogs/run_profiling_dialog.py @@ -26,16 +26,19 @@ def run_profiling_dialog(project_code: str, table_group: pd.Series | None = None 
display_column="table_groups_name", default_value=default_table_group_id, required=True, + placeholder="Select table group to profile", ) - table_group_name: str = table_groups_df.loc[table_groups_df["id"] == table_group_id, "table_groups_name"].iloc[0] + if table_group_id: + table_group_name: str = table_groups_df.loc[table_groups_df["id"] == table_group_id, "table_groups_name"].iloc[0] testgen.whitespace(1) - with st.container(): - st.markdown(f"Execute profiling for the table group **{table_group_name}**?") - st.markdown(":material/info: _Profiling will be performed in a background process._") + if table_group_id: + with st.container(): + st.markdown(f"Execute profiling for the table group **{table_group_name}**?") + st.markdown(":material/info: _Profiling will be performed in a background process._") - if testgen.expander_toggle(expand_label="Show CLI command", key="test_suite:keys:run-tests-show-cli"): - st.code(f"testgen run-profile --table-group-id {table_group_id}", language="shellSession") + if testgen.expander_toggle(expand_label="Show CLI command", key="test_suite:keys:run-tests-show-cli"): + st.code(f"testgen run-profile --table-group-id {table_group_id}", language="shellSession") button_container = st.empty() status_container = st.empty() diff --git a/testgen/ui/views/dialogs/run_tests_dialog.py b/testgen/ui/views/dialogs/run_tests_dialog.py index 212c1361..93c89dbb 100644 --- a/testgen/ui/views/dialogs/run_tests_dialog.py +++ b/testgen/ui/views/dialogs/run_tests_dialog.py @@ -26,19 +26,22 @@ def run_tests_dialog(project_code: str, test_suite: pd.Series | None = None, def display_column="test_suite", default_value=default_test_suite_id, required=True, + placeholder="Select test suite to run", ) - test_suite_name: str = test_suites_df.loc[test_suites_df["id"] == test_suite_id, "test_suite"].iloc[0] + if test_suite_id: + test_suite_name: str = test_suites_df.loc[test_suites_df["id"] == test_suite_id, "test_suite"].iloc[0] testgen.whitespace(1) - with 
st.container(): - st.markdown(f"Run tests for the test suite **{test_suite_name}**?") - st.markdown(":material/info: _Test execution will be performed in a background process._") + if test_suite_id: + with st.container(): + st.markdown(f"Run tests for the test suite **{test_suite_name}**?") + st.markdown(":material/info: _Test execution will be performed in a background process._") - if testgen.expander_toggle(expand_label="Show CLI command", key="run_tests_dialog:keys:show-cli"): - st.code( - f"testgen run-tests --project-key {project_code} --test-suite-key {test_suite_name}", - language="shellSession" - ) + if testgen.expander_toggle(expand_label="Show CLI command", key="run_tests_dialog:keys:show-cli"): + st.code( + f"testgen run-tests --project-key {project_code} --test-suite-key {test_suite_name}", + language="shellSession" + ) button_container = st.empty() status_container = st.empty() @@ -47,7 +50,7 @@ def run_tests_dialog(project_code: str, test_suite: pd.Series | None = None, def with button_container: _, button_column = st.columns([.8, .2]) with button_column: - run_test_button = st.button("Run Tests", use_container_width=True) + run_test_button = st.button("Run Tests", use_container_width=True, disabled=not test_suite_id) if run_test_button: button_container.empty() diff --git a/testgen/ui/views/hygiene_issues.py b/testgen/ui/views/hygiene_issues.py index 3cf0fe3d..ed1f7991 100644 --- a/testgen/ui/views/hygiene_issues.py +++ b/testgen/ui/views/hygiene_issues.py @@ -12,6 +12,7 @@ from testgen.commands.run_rollup_scores import run_profile_rollup_scoring_queries from testgen.common import date_service from testgen.common.mixpanel_service import MixpanelService +from testgen.common.models import get_current_session from testgen.ui.components import widgets as testgen from testgen.ui.components.widgets.download_dialog import ( FILE_DATA_TYPE, @@ -20,6 +21,7 @@ get_excel_file_data, zip_multi_file_data, ) +from testgen.ui.components.widgets.page import 
css_class, flex_row_end from testgen.ui.navigation.page import Page from testgen.ui.pdf.hygiene_issue_report import create_report from testgen.ui.services import project_service, user_session_service @@ -68,7 +70,7 @@ def render( others_summary_column, pii_summary_column, score_column, actions_column = st.columns([.25, .25, .2, .3], vertical_alignment="bottom") (liklihood_filter_column, issue_type_filter_column, table_filter_column, column_filter_column, sort_column, export_button_column) = ( - st.columns([.15, .25, .2, .2, .1, .1], vertical_alignment="bottom") + st.columns([.15, .2, .2, .2, .1, .15], vertical_alignment="bottom") ) testgen.flex_row_end(actions_column) testgen.flex_row_end(export_button_column) @@ -77,7 +79,6 @@ def render( issue_class = testgen.select( options=["Definite", "Likely", "Possible", "Potential PII"], default_value=issue_class, - required=False, bind_to_query="issue_class", label="Issue Class", ) @@ -89,7 +90,6 @@ def render( default_value=None if issue_class == "Potential PII" else issue_type, value_column="id", display_column="anomaly_name", - required=False, bind_to_query="issue_type", label="Issue Type", disabled=issue_class == "Potential PII", @@ -113,6 +113,7 @@ def render( bind_to_query="column_name", label="Column Name", disabled=not table_name, + accept_new_options=True, ) with sort_column: @@ -142,57 +143,72 @@ def render( action_map = df_action.set_index("id")["action"].to_dict() df_pa["action"] = df_pa["id"].map(action_map).fillna(df_pa["action"]) - if not df_pa.empty: - summaries = get_profiling_anomaly_summary(run_id) - others_summary = [summary for summary in summaries if summary.get("type") != "PII"] - with others_summary_column: + summaries = get_profiling_anomaly_summary(run_id) + others_summary = [summary for summary in summaries if summary.get("type") != "PII"] + with others_summary_column: + testgen.summary_bar( + items=others_summary, + label="Hygiene Issues", + height=20, + width=400, + ) + + anomalies_pii_summary 
= [summary for summary in summaries if summary.get("type") == "PII"] + if anomalies_pii_summary: + with pii_summary_column: testgen.summary_bar( - items=others_summary, - label="Hygiene Issues", + items=anomalies_pii_summary, + label="Potential PII", height=20, width=400, ) - anomalies_pii_summary = [summary for summary in summaries if summary.get("type") == "PII"] - if anomalies_pii_summary: - with pii_summary_column: - testgen.summary_bar( - items=anomalies_pii_summary, - label="Potential PII", - height=20, - width=400, - ) + with score_column: + render_score(run_df["project_code"], run_id) + + lst_show_columns = [ + "table_name", + "column_name", + "issue_likelihood", + "action", + "anomaly_name", + "detail", + ] + + # Show main grid and retrieve selections + selected = fm.render_grid_select( + df_pa, + lst_show_columns, + int_height=400, + do_multi_select=do_multi_select, + bind_to_query_name="selected", + bind_to_query_prop="id", + ) - with score_column: - render_score(run_df["project_code"], run_id) + popover_container = export_button_column.empty() - lst_show_columns = [ - "table_name", - "column_name", - "issue_likelihood", - "action", - "anomaly_name", - "detail", - ] + def open_download_dialog(data: pd.DataFrame | None = None) -> None: + # Hack to programmatically close popover: https://github.com/streamlit/streamlit/issues/8265#issuecomment-3001655849 + with popover_container.container(): + flex_row_end() + st.button(label="Export", icon=":material/download:", disabled=True) - # Show main grid and retrieve selections - selected = fm.render_grid_select( - df_pa, - lst_show_columns, - int_height=400, - do_multi_select=do_multi_select, - bind_to_query_name="selected", - bind_to_query_prop="id", + download_dialog( + dialog_title="Download Excel Report", + file_content_func=get_excel_report_data, + args=(run_df["table_groups_name"], run_date, run_id, data), ) - with export_button_column: - if st.button(label=":material/download: Export", help="Download 
filtered hygiene issues to Excel"): - download_dialog( - dialog_title="Download Excel Report", - file_content_func=get_excel_report_data, - args=(df_pa, run_df["table_groups_name"], run_date), - ) + with popover_container.container(key="tg--export-popover"): + flex_row_end() + with st.popover(label="Export", icon=":material/download:", help="Download hygiene issues to Excel"): + css_class("tg--export-wrapper") + st.button(label="All issues", type="tertiary", on_click=open_download_dialog) + st.button(label="Filtered issues", type="tertiary", on_click=partial(open_download_dialog, df_pa)) + if selected: + st.button(label="Selected issues", type="tertiary", on_click=partial(open_download_dialog, pd.DataFrame(selected))) + if not df_pa.empty: if selected: # Always show details for last selected row selected_row = selected[len(selected) - 1] @@ -341,84 +357,103 @@ def get_profiling_run_columns(profiling_run_id: str) -> pd.DataFrame: @st.cache_data(show_spinner=False) def get_profiling_anomalies( profile_run_id: str, - likelihood: str | None, - issue_type_id: str | None, - table_name: str | None, - column_name: str | None, - sorting_columns: list[str] | None, + likelihood: str | None = None, + issue_type_id: str | None = None, + table_name: str | None = None, + column_name: str | None = None, + sorting_columns: list[str] | None = None, ): - schema: str = st.session_state["dbschema"] + db_session = get_current_session() criteria = "" order_by = "" + params = {"profile_run_id": profile_run_id} if likelihood: - criteria += f" AND t.issue_likelihood = '{likelihood}'" + criteria += " AND t.issue_likelihood = :likelihood" + params["likelihood"] = likelihood if issue_type_id: - criteria += f" AND t.id = '{issue_type_id}'" + criteria += " AND t.id = :issue_type_id" + params["issue_type_id"] = issue_type_id if table_name: - criteria += f" AND r.table_name = '{table_name}'" + criteria += " AND r.table_name = :table_name" + params["table_name"] = table_name if column_name: - 
criteria += f" AND r.column_name = '{column_name}'" + criteria += " AND r.column_name ILIKE :column_name" + params["column_name"] = column_name if sorting_columns: order_by = "ORDER BY " + (", ".join(" ".join(col) for col in sorting_columns)) # Define the query -- first visible column must be first, because will hold the multi-select box str_sql = f""" - SELECT r.table_name, r.column_name, r.schema_name, - r.column_type,t.anomaly_name, t.issue_likelihood, - r.disposition, null as action, - CASE - WHEN t.issue_likelihood = 'Possible' THEN 'Possible: speculative test that often identifies problems' - WHEN t.issue_likelihood = 'Likely' THEN 'Likely: typically indicates a data problem' - WHEN t.issue_likelihood = 'Definite' THEN 'Definite: indicates a highly-likely data problem' - WHEN t.issue_likelihood = 'Potential PII' - THEN 'Potential PII: may require privacy policies, standards and procedures for access, storage and transmission.' - END AS likelihood_explanation, - CASE - WHEN t.issue_likelihood = 'Potential PII' THEN 1 - WHEN t.issue_likelihood = 'Possible' THEN 2 - WHEN t.issue_likelihood = 'Likely' THEN 3 - WHEN t.issue_likelihood = 'Definite' THEN 4 - END AS likelihood_order, - t.anomaly_description, r.detail, t.suggested_action, - r.anomaly_id, r.table_groups_id::VARCHAR, r.id::VARCHAR, p.profiling_starttime, r.profile_run_id::VARCHAR, - tg.table_groups_name, - - -- These are used in the PDF report - dcc.functional_data_type, - dcc.description as column_description, - COALESCE(dcc.critical_data_element, dtc.critical_data_element) as critical_data_element, - COALESCE(dcc.data_source, dtc.data_source, tg.data_source) as data_source, - COALESCE(dcc.source_system, dtc.source_system, tg.source_system) as source_system, - COALESCE(dcc.source_process, dtc.source_process, tg.source_process) as source_process, - COALESCE(dcc.business_domain, dtc.business_domain, tg.business_domain) as business_domain, - COALESCE(dcc.stakeholder_group, dtc.stakeholder_group, 
tg.stakeholder_group) as stakeholder_group, - COALESCE(dcc.transform_level, dtc.transform_level, tg.transform_level) as transform_level, - COALESCE(dcc.aggregation_level, dtc.aggregation_level) as aggregation_level, - COALESCE(dcc.data_product, dtc.data_product, tg.data_product) as data_product - - FROM {schema}.profile_anomaly_results r - INNER JOIN {schema}.profile_anomaly_types t - ON r.anomaly_id = t.id - INNER JOIN {schema}.profiling_runs p - ON r.profile_run_id = p.id - INNER JOIN {schema}.table_groups tg - ON r.table_groups_id = tg.id - LEFT JOIN {schema}.data_column_chars dcc - ON (tg.id = dcc.table_groups_id - AND r.schema_name = dcc.schema_name - AND r.table_name = dcc.table_name - AND r.column_name = dcc.column_name) - LEFT JOIN {schema}.data_table_chars dtc - ON dcc.table_id = dtc.table_id - WHERE r.profile_run_id = '{profile_run_id}' - {criteria} - {order_by} + SELECT + r.table_name, + r.column_name, + r.schema_name, + r.column_type, + t.anomaly_name, + t.issue_likelihood, + r.disposition, + null as action, + CASE + WHEN t.issue_likelihood = 'Possible' THEN 'Possible: speculative test that often identifies problems' + WHEN t.issue_likelihood = 'Likely' THEN 'Likely: typically indicates a data problem' + WHEN t.issue_likelihood = 'Definite' THEN 'Definite: indicates a highly-likely data problem' + WHEN t.issue_likelihood = 'Potential PII' + THEN 'Potential PII: may require privacy policies, standards and procedures for access, storage and transmission.' 
+ END AS likelihood_explanation, + CASE + WHEN t.issue_likelihood = 'Potential PII' THEN 1 + WHEN t.issue_likelihood = 'Possible' THEN 2 + WHEN t.issue_likelihood = 'Likely' THEN 3 + WHEN t.issue_likelihood = 'Definite' THEN 4 + END AS likelihood_order, + t.anomaly_description, + r.detail, + t.suggested_action, + r.anomaly_id, + r.table_groups_id::VARCHAR, + r.id::VARCHAR, + p.profiling_starttime, + r.profile_run_id::VARCHAR, + tg.table_groups_name, + + -- These are used in the PDF report + dcc.functional_data_type, + dcc.description as column_description, + COALESCE(dcc.critical_data_element, dtc.critical_data_element) as critical_data_element, + COALESCE(dcc.data_source, dtc.data_source, tg.data_source) as data_source, + COALESCE(dcc.source_system, dtc.source_system, tg.source_system) as source_system, + COALESCE(dcc.source_process, dtc.source_process, tg.source_process) as source_process, + COALESCE(dcc.business_domain, dtc.business_domain, tg.business_domain) as business_domain, + COALESCE(dcc.stakeholder_group, dtc.stakeholder_group, tg.stakeholder_group) as stakeholder_group, + COALESCE(dcc.transform_level, dtc.transform_level, tg.transform_level) as transform_level, + COALESCE(dcc.aggregation_level, dtc.aggregation_level) as aggregation_level, + COALESCE(dcc.data_product, dtc.data_product, tg.data_product) as data_product + + FROM profile_anomaly_results r + INNER JOIN profile_anomaly_types t + ON r.anomaly_id = t.id + INNER JOIN profiling_runs p + ON r.profile_run_id = p.id + INNER JOIN table_groups tg + ON r.table_groups_id = tg.id + LEFT JOIN data_column_chars dcc + ON (tg.id = dcc.table_groups_id + AND r.schema_name = dcc.schema_name + AND r.table_name = dcc.table_name + AND r.column_name = dcc.column_name) + LEFT JOIN data_table_chars dtc + ON dcc.table_id = dtc.table_id + WHERE r.profile_run_id = :profile_run_id + {criteria} + {order_by} """ - # Retrieve data as df - df = db.retrieve_data(str_sql) + results = db_session.execute(str_sql, params=params) 
+ columns = [column.name for column in results.cursor.description] + + df = pd.DataFrame(list(results), columns=columns) dct_replace = {"Confirmed": "✓", "Dismissed": "✘", "Inactive": "🔇"} df["action"] = df["disposition"].replace(dct_replace) @@ -490,10 +525,14 @@ def get_profiling_anomaly_summary(str_profile_run_id): def get_excel_report_data( update_progress: PROGRESS_UPDATE_TYPE, - data: pd.DataFrame, table_group: str, run_date: str, + run_id: str, + data: pd.DataFrame | None = None, ) -> FILE_DATA_TYPE: + if data is None: + data = get_profiling_anomalies(run_id) + columns = { "schema_name": {"header": "Schema"}, "table_name": {"header": "Table"}, @@ -515,8 +554,8 @@ def get_excel_report_data( @st.cache_data(show_spinner=False) -def get_source_data(hi_data): - return get_source_data_uncached(hi_data) +def get_source_data(hi_data, limit): + return get_source_data_uncached(hi_data, limit) @st.dialog(title="Source Data") @@ -529,7 +568,7 @@ def source_data_dialog(selected_row): fm.render_html_list(selected_row, ["detail"], None, 700, ["Hygiene Issue Detail"]) with st.spinner("Retrieving source data..."): - bad_data_status, bad_data_msg, _, df_bad = get_source_data(selected_row) + bad_data_status, bad_data_msg, _, df_bad = get_source_data(selected_row, limit=500) if bad_data_status in {"ND", "NA"}: st.info(bad_data_msg) elif bad_data_status == "ERR": diff --git a/testgen/ui/views/login.py b/testgen/ui/views/login.py index 14bda7c0..d59ab817 100644 --- a/testgen/ui/views/login.py +++ b/testgen/ui/views/login.py @@ -32,7 +32,7 @@ def render(self, **_kwargs) -> None: _, login_column, links_column = st.columns([0.25, 0.5, 0.25]) with links_column: - testgen.page_links() + testgen.help_menu() with login_column: st.html(""" @@ -43,6 +43,7 @@ def render(self, **_kwargs) -> None: if authentication_status is False: st.error("Username or password is incorrect.") + MixpanelService().send_event("login-denied", username=username) if authentication_status is None: 
javascript_service.clear_component_states() @@ -52,4 +53,4 @@ def render(self, **_kwargs) -> None: if authentication_status: user_session_service.start_user_session(name, username) session.logging_in = True - MixpanelService().send_event("login") + MixpanelService().send_event("login", include_usage=True) diff --git a/testgen/ui/views/profiling_results.py b/testgen/ui/views/profiling_results.py index cc656ce0..dec8b4ab 100644 --- a/testgen/ui/views/profiling_results.py +++ b/testgen/ui/views/profiling_results.py @@ -1,6 +1,7 @@ import json import typing from datetime import datetime +from functools import partial import pandas as pd import streamlit as st @@ -9,6 +10,7 @@ import testgen.ui.services.database_service as db import testgen.ui.services.form_service as fm from testgen.common import date_service +from testgen.common.models import with_database_session from testgen.ui.components import widgets as testgen from testgen.ui.components.widgets.download_dialog import ( FILE_DATA_TYPE, @@ -16,6 +18,7 @@ download_dialog, get_excel_file_data, ) +from testgen.ui.components.widgets.page import css_class, flex_row_end from testgen.ui.components.widgets.testgen_component import testgen_component from testgen.ui.navigation.page import Page from testgen.ui.services import project_service, user_session_service @@ -79,6 +82,7 @@ def render(self, run_id: str, table_name: str | None = None, column_name: str | bind_to_query="column_name", label="Column Name", disabled=not table_name, + accept_new_options=bool(table_name), ) with sort_column: @@ -93,16 +97,15 @@ def render(self, run_id: str, table_name: str | None = None, column_name: str | default_sorting = [(sortable_columns[i][1], "ASC") for i in (0, 1, 2)] sorting_columns = testgen.sorting_selector(sortable_columns, default_sorting) - # Use SQL wildcard to match all values - if not table_name: - table_name = "%%" - if not column_name: - column_name = "%%" - # Display main results grid with st.container(): with 
st.spinner("Loading data ..."): - df = profiling_queries.get_profiling_results(run_id, table_name, column_name, sorting_columns) + df = profiling_queries.get_profiling_results( + run_id, + table_name=table_name, + column_name=column_name, + sorting_columns=sorting_columns, + ) show_columns = [ "schema_name", @@ -125,14 +128,29 @@ def render(self, run_id: str, table_name: str | None = None, column_name: str | bind_to_query_prop="id", ) - with export_button_column: - testgen.flex_row_end() - if st.button(label=":material/download: Export", help="Download filtered profiling results to Excel"): - download_dialog( - dialog_title="Download Excel Report", - file_content_func=get_excel_report_data, - args=(df, run_df["table_groups_name"], run_date), - ) + popover_container = export_button_column.empty() + + def open_download_dialog(data: pd.DataFrame | None = None) -> None: + # Hack to programmatically close popover: https://github.com/streamlit/streamlit/issues/8265#issuecomment-3001655849 + with popover_container.container(): + flex_row_end() + st.button(label="Export", icon=":material/download:", disabled=True) + + download_dialog( + dialog_title="Download Excel Report", + file_content_func=get_excel_report_data, + args=(run_df["table_groups_name"], run_date, run_id, data), + ) + + with popover_container.container(key="tg--export-popover"): + flex_row_end() + with st.popover(label="Export", icon=":material/download:", help="Download profiling results to Excel"): + css_class("tg--export-wrapper") + st.button(label="All results", type="tertiary", on_click=open_download_dialog) + st.button(label="Filtered results", type="tertiary", on_click=partial(open_download_dialog, df)) + if selected_row: + st.button(label="Selected results", type="tertiary", on_click=partial(open_download_dialog, pd.DataFrame(selected_row))) + # Display profiling for selected row if not selected_row: @@ -154,13 +172,19 @@ def render(self, run_id: str, table_name: str | None = None, column_name: str | 
) +@with_database_session def get_excel_report_data( update_progress: PROGRESS_UPDATE_TYPE, - data: pd.DataFrame, table_group: str, run_date: str, + run_id: str, + data: pd.DataFrame | None = None, ) -> FILE_DATA_TYPE: - data = data.copy() + if data is not None: + data = data.copy() + else: + data = profiling_queries.get_profiling_results(run_id) + date_service.accommodate_dataframe_to_timezone(data, st.session_state) for key in ["column_type", "datatype_suggestion"]: data[key] = data[key].apply(lambda val: val.lower() if not pd.isna(val) else None) @@ -170,7 +194,7 @@ def get_excel_report_data( for key in ["min_date", "max_date"]: data[key] = data[key].apply( - lambda val: datetime.strptime(val, "%Y-%m-%dT%H:%M:%S").strftime("%b %-d %Y, %-I:%M %p") if val != "NaT" else None + lambda val: datetime.strptime(val, "%Y-%m-%d %H:%M:%S").strftime("%b %-d %Y, %-I:%M %p") if not pd.isna(val) and val != "NaT" else None ) data["hygiene_issues"] = data["hygiene_issues"].apply(lambda val: "Yes" if val else None) diff --git a/testgen/ui/views/profiling_runs.py b/testgen/ui/views/profiling_runs.py index cd3c8fe0..263b4b4d 100644 --- a/testgen/ui/views/profiling_runs.py +++ b/testgen/ui/views/profiling_runs.py @@ -1,3 +1,4 @@ +import logging import typing from functools import partial @@ -8,17 +9,19 @@ import testgen.ui.services.database_service as db import testgen.ui.services.form_service as fm import testgen.ui.services.query_service as dq +from testgen.common.models import with_database_session from testgen.ui.components import widgets as testgen from testgen.ui.components.widgets import testgen_component from testgen.ui.navigation.menu import MenuItem from testgen.ui.navigation.page import Page from testgen.ui.queries import profiling_run_queries, project_queries from testgen.ui.services import user_session_service -from testgen.ui.session import session +from testgen.ui.session import session, temp_value from testgen.ui.views.dialogs.manage_schedules import ScheduleDialog 
from testgen.ui.views.dialogs.run_profiling_dialog import run_profiling_dialog from testgen.utils import friendly_score, to_int +LOG = logging.getLogger("testgen") FORM_DATA_WIDTH = 400 PAGE_SIZE = 50 PAGE_ICON = "data_thresholding" @@ -61,6 +64,7 @@ def render(self, project_code: str, table_group_id: str | None = None, **_kwargs default_value=table_group_id, bind_to_query="table_group_id", label="Table Group", + placeholder="---", ) with actions_column: @@ -68,7 +72,7 @@ def render(self, project_code: str, table_group_id: str | None = None, **_kwargs st.button( ":material/today: Profiling Schedules", - help="Manages when profiling should run for a given table group", + help="Manage when profiling should run for table groups", on_click=partial(ProfilingScheduleDialog().open, project_code) ) @@ -97,9 +101,13 @@ def render(self, project_code: str, table_group_id: str | None = None, **_kwargs "items": paginated_df.to_json(orient="records"), "permissions": { "can_run": user_can_run, + "can_edit": user_can_run, }, }, - event_handlers={ "RunCanceled": on_cancel_run } + event_handlers={ + "RunCanceled": on_cancel_run, + "RunsDeleted": partial(on_delete_runs, project_code, table_group_id), + } ) @@ -125,6 +133,7 @@ def arg_value_input(self) -> tuple[bool, list[typing.Any], dict[str, typing.Any] value_column="id", display_column="table_groups_name", required=True, + placeholder="Select table group", ) return bool(tg_id), [], {"table_group_id": tg_id} @@ -151,8 +160,11 @@ def render_empty_state(project_code: str, user_can_run: bool) -> bool: icon=PAGE_ICON, message=testgen.EmptyStateMessage.TableGroup, action_label="Go to Table Groups", - link_href="connections:table-groups", - link_params={ "connection_id": str(project_summary_df["default_connection_id"]) } + link_href="table-groups", + link_params={ + "project_code": project_code, + "connection_id": str(project_summary_df["default_connection_id"]), + }, ) else: testgen.empty_state( @@ -175,6 +187,60 @@ def 
on_cancel_run(profiling_run: pd.Series) -> None: fm.reset_post_updates(str_message=f":{'green' if process_status else 'red'}[{process_message}]", as_toast=True) +@st.dialog(title="Delete Profiling Runs") +@with_database_session +def on_delete_runs(project_code: str, table_group_id: str, profiling_run_ids: list[str]) -> None: + def on_delete_confirmed(*_args) -> None: + set_delete_confirmed(True) + + message = f"Are you sure you want to delete the {len(profiling_run_ids)} selected profiling runs?" + constraint = { + "warning": "Any running processes will be canceled.", + "confirmation": "Yes, cancel and delete the profiling runs.", + } + if len(profiling_run_ids) == 1: + message = "Are you sure you want to delete the selected profiling run?" + constraint["confirmation"] = "Yes, cancel and delete the profiling run." + + result, set_result = temp_value("profiling-runs:result-value", default=None) + delete_confirmed, set_delete_confirmed = temp_value("profiling-runs:confirm-delete", default=False) + + testgen.testgen_component( + "confirm_dialog", + props={ + "project_code": project_code, + "message": message, + "constraint": constraint, + "button_label": "Delete", + "button_color": "warn", + "result": result(), + }, + on_change_handlers={ + "ActionConfirmed": on_delete_confirmed, + }, + ) + + if delete_confirmed(): + try: + with st.spinner("Deleting runs ..."): + profiling_runs = get_db_profiling_runs(project_code, table_group_id, profiling_run_ids=profiling_run_ids) + for _, profiling_run in profiling_runs.iterrows(): + profiling_run_id = profiling_run["profiling_run_id"] + if profiling_run["status"] == "Running": + process_status, process_message = process_service.kill_profile_run(to_int(profiling_run["process_id"])) + if process_status: + profiling_run_queries.update_status(profiling_run_id, "Cancelled") + profiling_run_queries.cascade_delete_multiple_profiling_runs(profiling_run_ids) + st.rerun() + except Exception: + LOG.exception("Failed to delete profiling 
runs") + set_result({ + "success": False, + "message": "Unable to delete the selected profiling runs, try again.", + }) + st.rerun(scope="fragment") + + @st.cache_data(show_spinner=False) def get_db_table_group_choices(project_code: str) -> pd.DataFrame: schema = st.session_state["dbschema"] @@ -182,9 +248,19 @@ def get_db_table_group_choices(project_code: str) -> pd.DataFrame: @st.cache_data(show_spinner="Loading data ...") -def get_db_profiling_runs(project_code: str, table_group_id: str | None = None) -> pd.DataFrame: +def get_db_profiling_runs( + project_code: str, + table_group_id: str | None = None, + profiling_run_ids: list[str] | None = None, +) -> pd.DataFrame: schema = st.session_state["dbschema"] table_group_condition = f" AND v_profiling_runs.table_groups_id = '{table_group_id}' " if table_group_id else "" + + profiling_runs_condition = "" + if profiling_run_ids and len(profiling_run_ids) > 0: + profiling_run_ids_ = [f"'{run_id}'" for run_id in profiling_run_ids] + profiling_runs_condition = f" AND v_profiling_runs.profiling_run_id::VARCHAR IN ({', '.join(profiling_run_ids_)})" + sql = f""" WITH profile_anomalies AS ( SELECT profile_anomaly_results.profile_run_id, @@ -242,6 +318,7 @@ def get_db_profiling_runs(project_code: str, table_group_id: str | None = None) LEFT JOIN profile_anomalies ON (v_profiling_runs.profiling_run_id = profile_anomalies.profile_run_id) WHERE project_code = '{project_code}' {table_group_condition} + {profiling_runs_condition} ORDER BY start_time DESC; """ diff --git a/testgen/ui/views/project_dashboard.py b/testgen/ui/views/project_dashboard.py index 3515e68c..68c5ef9b 100644 --- a/testgen/ui/views/project_dashboard.py +++ b/testgen/ui/views/project_dashboard.py @@ -12,7 +12,6 @@ from testgen.ui.session import session from testgen.utils import format_field, friendly_score, score -STALE_PROFILE_DAYS = 30 PAGE_TITLE = "Project Dashboard" PAGE_ICON = "home" @@ -34,11 +33,12 @@ class ProjectDashboardPage(Page): def render(self, 
project_code: str, **_kwargs): testgen.page_header( PAGE_TITLE, - "introduction-to-dataops-testgen", ) - table_groups = get_table_groups_summary(project_code) - project_summary_df = project_queries.get_summary_by_code(project_code) + with st.spinner("Loading data ..."): + table_groups = get_table_groups_summary(project_code) + test_suites = test_suite_service.get_by_project(project_code) + project_summary_df = project_queries.get_summary_by_code(project_code) table_groups_fields: list[str] = [ "id", @@ -52,20 +52,11 @@ def render(self, project_code: str, **_kwargs): "latest_anomalies_likely_ct", "latest_anomalies_possible_ct", "latest_anomalies_dismissed_ct", - "latest_tests_start", - "latest_tests_suite_ct", - "latest_tests_ct", - "latest_tests_passed_ct", - "latest_tests_warning_ct", - "latest_tests_failed_ct", - "latest_tests_error_ct", - "latest_tests_dismissed_ct", ] test_suite_fields: list[str] = [ "id", "test_suite", "test_ct", - "latest_auto_gen_date", "latest_run_start", "latest_run_id", "last_run_test_ct", @@ -77,16 +68,12 @@ def render(self, project_code: str, **_kwargs): ] table_groups_sort = st.session_state.get("overview_table_groups_sort") or "latest_activity_date" - expanded_table_groups = st.session_state.get("overview_table_groups_expanded", []) testgen.testgen_component( "project_dashboard", props={ "project": { "project_code": project_code, - "table_groups_count": len(table_groups.index), - "test_suites_count": int(table_groups["latest_tests_suite_ct"].sum()), - "test_definitions_count": int(table_groups["latest_tests_ct"].sum()), "test_runs_count": int(project_summary_df["test_runs_ct"]), "profiling_runs_count": int(project_summary_df["profiling_runs_ct"]), "connections_count": int(project_summary_df["connections_ct"]), @@ -97,9 +84,9 @@ def render(self, project_code: str, **_kwargs): **{field: format_field(table_group[field]) for field in table_groups_fields}, "test_suites": [ { field: format_field(test_suite[field]) for field in 
test_suite_fields} - for _, test_suite in test_suite_service.get_by_project(project_code, table_group_id).iterrows() - ] if table_group_id in expanded_table_groups else None, - "expanded": table_group_id in expanded_table_groups, + for _, test_suite in test_suites[test_suites["table_groups_id"] == table_group_id].iterrows() + ], + "latest_tests_start": format_field(test_suites[test_suites["table_groups_id"] == table_group_id]["latest_run_start"].max()), "dq_score": friendly_score(score(table_group["dq_score_profiling"], table_group["dq_score_testing"])), "dq_score_profiling": friendly_score(table_group["dq_score_profiling"]), "dq_score_testing": friendly_score(table_group["dq_score_testing"]), @@ -125,28 +112,10 @@ def render(self, project_code: str, **_kwargs): }, ], }, - on_change_handlers={ - "TableGroupExpanded": on_table_group_expanded, - "TableGroupCollapsed": on_table_group_collapsed, - }, - event_handlers={}, ) -def on_table_group_expanded(table_group_id: str) -> None: - expanded_table_groups = st.session_state.get("overview_table_groups_expanded", []) - expanded_table_groups.append(table_group_id) - st.session_state["overview_table_groups_expanded"] = expanded_table_groups - - -def on_table_group_collapsed(table_group_id: str) -> None: - expanded_table_groups = st.session_state.get("overview_table_groups_expanded", []) - try: - expanded_table_groups.remove(table_group_id) - except ValueError: ... 
- st.session_state["overview_table_groups_expanded"] = expanded_table_groups - - +@st.cache_data(show_spinner=False) def get_table_groups_summary(project_code: str) -> pd.DataFrame: schema = st.session_state["dbschema"] sql = f""" @@ -196,54 +165,6 @@ def get_table_groups_summary(project_code: str) -> pd.DataFrame: anomaly_types.id = latest_anomalies.anomaly_id ) GROUP BY latest_run.id - ), - latest_tests AS ( - SELECT suites.table_groups_id, - MAX(latest_run.test_starttime) AS test_starttime, - COUNT(DISTINCT latest_run.test_suite_id) as test_suite_ct, - COUNT(latest_results.id) as test_ct, - SUM( - CASE - WHEN COALESCE(latest_results.disposition, 'Confirmed') = 'Confirmed' - AND latest_results.result_status = 'Passed' THEN 1 - ELSE 0 - END - ) as passed_ct, - SUM( - CASE - WHEN COALESCE(latest_results.disposition, 'Confirmed') = 'Confirmed' - AND latest_results.result_status = 'Warning' THEN 1 - ELSE 0 - END - ) as warning_ct, - SUM( - CASE - WHEN COALESCE(latest_results.disposition, 'Confirmed') = 'Confirmed' - AND latest_results.result_status = 'Failed' THEN 1 - ELSE 0 - END - ) as failed_ct, - SUM( - CASE - WHEN COALESCE(latest_results.disposition, 'Confirmed') = 'Confirmed' - AND latest_results.result_status = 'Error' THEN 1 - ELSE 0 - END - ) as error_ct, - SUM( - CASE - WHEN COALESCE(latest_results.disposition, 'Confirmed') IN ('Dismissed', 'Inactive') THEN 1 - ELSE 0 - END - ) as dismissed_ct - FROM {schema}.test_suites suites - LEFT JOIN {schema}.test_runs latest_run ON ( - suites.last_complete_test_run_id = latest_run.id - ) - LEFT JOIN {schema}.test_results latest_results ON ( - latest_run.id = latest_results.test_run_id - ) - GROUP BY suites.table_groups_id ) SELECT groups.id::VARCHAR(50), groups.table_groups_name, @@ -257,18 +178,9 @@ def get_table_groups_summary(project_code: str) -> pd.DataFrame: latest_profile.definite_ct as latest_anomalies_definite_ct, latest_profile.likely_ct as latest_anomalies_likely_ct, latest_profile.possible_ct as 
latest_anomalies_possible_ct, - latest_profile.dismissed_ct as latest_anomalies_dismissed_ct, - latest_tests.test_starttime as latest_tests_start, - latest_tests.test_suite_ct as latest_tests_suite_ct, - latest_tests.test_ct as latest_tests_ct, - latest_tests.passed_ct as latest_tests_passed_ct, - latest_tests.warning_ct as latest_tests_warning_ct, - latest_tests.failed_ct as latest_tests_failed_ct, - latest_tests.error_ct as latest_tests_error_ct, - latest_tests.dismissed_ct as latest_tests_dismissed_ct + latest_profile.dismissed_ct as latest_anomalies_dismissed_ct FROM {schema}.table_groups as groups LEFT JOIN latest_profile ON (groups.id = latest_profile.table_groups_id) - LEFT JOIN latest_tests ON (groups.id = latest_tests.table_groups_id) WHERE groups.project_code = '{project_code}'; """ diff --git a/testgen/ui/views/table_groups.py b/testgen/ui/views/table_groups.py new file mode 100644 index 00000000..aff424c7 --- /dev/null +++ b/testgen/ui/views/table_groups.py @@ -0,0 +1,321 @@ +import typing +from dataclasses import asdict +from functools import partial + +import streamlit as st +from sqlalchemy.exc import IntegrityError + +import testgen.ui.services.connection_service as connection_service +import testgen.ui.services.table_group_service as table_group_service +from testgen.commands.run_profiling_bridge import run_profiling_in_background +from testgen.common.models import with_database_session +from testgen.ui.components import widgets as testgen +from testgen.ui.navigation.menu import MenuItem +from testgen.ui.navigation.page import Page +from testgen.ui.services import user_session_service +from testgen.ui.session import session, temp_value +from testgen.ui.views.connections import FLAVOR_OPTIONS, format_connection +from testgen.ui.views.profiling_runs import ProfilingScheduleDialog + +PAGE_TITLE = "Table Groups" + + +class TableGroupsPage(Page): + path = "table-groups" + can_activate: typing.ClassVar = [ + lambda: session.authentication_status, + 
lambda: not user_session_service.user_has_catalog_role(), + ] + menu_item = MenuItem( + icon="table_view", + label=PAGE_TITLE, + section="Data Configuration", + order=0, + roles=[ role for role in typing.get_args(user_session_service.RoleType) if role != "catalog" ], + ) + + def render(self, project_code: str, connection_id: str | None = None, **_kwargs) -> None: + testgen.page_header(PAGE_TITLE, "create-a-table-group") + + user_can_edit = user_session_service.user_can_edit() + if connection_id: + table_groups = table_group_service.get_by_connection(project_code, connection_id) + else: + table_groups = table_group_service.get_all(project_code) + + return testgen.testgen_component( + "table_group_list", + props={ + "project_code": project_code, + "connection_id": connection_id, + "permissions": { + "can_edit": user_can_edit, + }, + "connections": self._get_connections(project_code), + "table_groups": self._format_table_group_list([ + table_group.to_dict() for _, table_group in table_groups.iterrows() + ]), + }, + on_change_handlers={ + "RunSchedulesClicked": lambda *_: ProfilingScheduleDialog().open(project_code), + "AddTableGroupClicked": partial(self.add_table_group_dialog, project_code), + "EditTableGroupClicked": partial(self.edit_table_group_dialog, project_code), + "DeleteTableGroupClicked": partial(self.delete_table_group_dialog, project_code), + "RunProfilingClicked": partial(self.run_profiling_dialog, project_code), + "ConnectionSelected": lambda inner_connection_id: self.router.queue_navigation( + to="table-groups", + with_args={"project_code": project_code, "connection_id": inner_connection_id}, + ), + }, + ) + + @st.dialog(title="Add Table Group") + @with_database_session + def add_table_group_dialog(self, project_code, *_args): + def on_preview_table_group_clicked(table_group: dict): + mark_for_preview(True) + set_table_group(table_group) + + def on_save_table_group_clicked(table_group: dict): + set_save(True) + set_table_group(table_group) + + 
should_preview, mark_for_preview = temp_value("table_groups:preview:new", default=False) + should_save, set_save = temp_value("table_groups:save:new", default=False) + get_table_group, set_table_group = temp_value("table_groups:updated:new", default={}) + + connections = self._get_connections(project_code) + table_group = { + "project_code": project_code, + **get_table_group(), + } + table_group_preview = {} + result = None + + if len(connections) == 1: + table_group["connection_id"] = connections[0]["connection_id"] + + if should_save(): + try: + table_group_service.add(table_group) + st.rerun() + except IntegrityError: + result = {"success": False, "message": "A Table Group with the same name already exists."} + + if should_preview(): + table_group_preview = self._get_table_group_preview(project_code, table_group["connection_id"], {"id": "temp", **table_group}) + + return testgen.testgen_component( + "table_group", + props={ + "project_code": project_code, + "connections": connections, + "table_group": table_group, + "table_group_preview": table_group_preview, + "result": result, + }, + on_change_handlers={ + "PreviewTableGroupClicked": on_preview_table_group_clicked, + "TableGroupSaveClicked": on_save_table_group_clicked, + }, + ) + + @st.dialog(title="Edit Table Group") + def edit_table_group_dialog(self, project_code: str, table_group_id: str): + def on_preview_table_group_clicked(table_group: dict): + mark_for_preview(True) + set_updated_table_group(table_group) + + def on_save_table_group_clicked(table_group: dict): + set_update(True) + set_updated_table_group(table_group) + + should_preview, mark_for_preview = temp_value( + f"table_groups:preview:{table_group_id}", + default=False, + ) + should_update, set_update = temp_value( + f"table_groups:save:{table_group_id}", + default=False, + ) + get_updated_table_group, set_updated_table_group = temp_value( + f"table_groups:updated:{table_group_id}", + default={}, + ) + + original_table_group = 
table_group_service.get_by_id(table_group_id=table_group_id).to_dict() + is_table_group_used = table_group_service.is_table_group_used(table_group_id) + table_group = { + **original_table_group, + **get_updated_table_group(), + } + if is_table_group_used: + table_group["table_group_schema"] = original_table_group["table_group_schema"] + + table_group_preview = { + "schema": table_group["table_group_schema"], + } + result = None + + if should_update(): + try: + table_group_service.edit(table_group) + st.rerun() + except IntegrityError: + result = {"success": False, "message": "A Table Group with the same name already exists."} + + if should_preview(): + table_group_preview = self._get_table_group_preview(project_code, table_group["connection_id"], table_group) + + return testgen.testgen_component( + "table_group", + props={ + "project_code": project_code, + "connections": self._get_connections(project_code, connection_id=table_group["connection_id"]), + "table_group": table_group, + "in_used": is_table_group_used, + "table_group_preview": table_group_preview, + "result": result, + }, + on_change_handlers={ + "PreviewTableGroupClicked": on_preview_table_group_clicked, + "TableGroupSaveClicked": on_save_table_group_clicked, + }, + ) + + def _get_connections(self, project_code: str, connection_id: str | None = None) -> list[dict]: + if connection_id: + connections = [connection_service.get_by_id(connection_id, hide_passwords=True)] + else: + connections = [ + connection for _, connection in connection_service.get_connections( + project_code, hide_passwords=True + ).iterrows() + ] + return [ format_connection(connection) for connection in connections ] + + def _format_table_group_list(self, table_groups: list[dict]) -> list[dict]: + for table_group in table_groups: + flavors = [f for f in FLAVOR_OPTIONS if f.value == table_group["sql_flavor_code"]] + if flavors and (flavor := flavors[0]): + table_group["connection"] = { + "name": table_group["connection_name"], + 
"flavor": asdict(flavor), + } + return table_groups + + def _get_table_group_preview(self, project_code: str, connection_id: str | None, table_group: dict) -> dict: + table_group_preview = { + "schema": table_group["table_group_schema"], + "tables": set(), + "column_count": 0, + "success": True, + "message": None, + } + if connection_id: + try: + table_group_results = table_group_service.test_table_group(table_group, connection_id, project_code) + + for column in table_group_results: + table_group_preview["schema"] = column["table_schema"] + table_group_preview["tables"].add(column["table_name"]) + table_group_preview["column_count"] += 1 + + if len(table_group_results) <= 0: + table_group_preview["success"] = False + table_group_preview["message"] = ( + "No tables found matching the criteria. Please check the Table Group configuration." + ) + except Exception as error: + table_group_preview["success"] = False + table_group_preview["message"] = error.args[0] + else: + table_group_preview["success"] = False + table_group_preview["message"] = "No connection selected. Please select a connection to preview the Table Group." 
+ + table_group_preview["tables"] = list(table_group_preview["tables"]) + return table_group_preview + + @st.dialog(title="Run Profiling") + def run_profiling_dialog(self, project_code: str, table_group_id: str) -> None: + def on_go_to_profiling_runs_clicked(table_group_id: str) -> None: + set_navigation_params({ "project_code": project_code, "table_group_id": table_group_id }) + + def on_run_profiling_confirmed(*_args) -> None: + set_run_profiling(True) + + get_navigation_params, set_navigation_params = temp_value( + f"table_groups:{table_group_id}:go_to_profiling_run", + default=None, + ) + if (params := get_navigation_params()): + self.router.navigate(to="profiling-runs", with_args=params) + + should_run_profiling, set_run_profiling = temp_value( + f"table_groups:{table_group_id}:run_profiling", + default=False, + ) + + table_group = table_group_service.get_by_id(table_group_id).to_dict() + result = None + if should_run_profiling(): + success = True + message = "Profiling run started" + + try: + run_profiling_in_background(table_group_id) + except Exception as error: + success = False + message = f"Profiling run encountered errors: {error!s}." 
+ result = {"success": success, "message": message} + + return testgen.testgen_component( + "run_profiling_dialog", + props={ + "project_code": project_code, + "table_group": table_group, + "result": result, + }, + on_change_handlers={ + "GoToProfilingRunsClicked": on_go_to_profiling_runs_clicked, + "RunProfilingConfirmed": on_run_profiling_confirmed, + }, + ) + + @st.dialog(title="Delete Table Group") + def delete_table_group_dialog(self, project_code: str, table_group_id: str): + def on_delete_confirmed(*_args): + confirm_deletion(True) + + table_group = table_group_service.get_by_id(table_group_id=table_group_id) + table_group_name = table_group["table_groups_name"] + can_be_deleted = table_group_service.cascade_delete([table_group_name], dry_run=True) + is_deletion_confirmed, confirm_deletion = temp_value( + f"table_groups:confirm_delete:{table_group_id}", + default=False, + ) + success = False + message = None + + result = None + if is_deletion_confirmed(): + if not table_group_service.are_table_groups_in_use([table_group_name]): + table_group_service.cascade_delete([table_group_name]) + message = f"Table Group {table_group_name} has been deleted. " + st.rerun() + else: + message = "This Table Group is in use by a running process and cannot be deleted." 
+ result = {"success": success, "message": message} + + testgen.testgen_component( + "table_group_delete", + props={ + "project_code": project_code, + "table_group": table_group.to_dict(), + "can_be_deleted": can_be_deleted, + "result": result, + }, + on_change_handlers={ + "DeleteTableGroupConfirmed": on_delete_confirmed, + }, + ) diff --git a/testgen/ui/views/table_groups/__init__.py b/testgen/ui/views/table_groups/__init__.py deleted file mode 100644 index 77b50278..00000000 --- a/testgen/ui/views/table_groups/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# ruff: noqa: F401 - -from testgen.ui.views.table_groups.forms import TableGroupForm -from testgen.ui.views.table_groups.page import TableGroupsPage diff --git a/testgen/ui/views/table_groups/forms.py b/testgen/ui/views/table_groups/forms.py deleted file mode 100644 index 5f2151ba..00000000 --- a/testgen/ui/views/table_groups/forms.py +++ /dev/null @@ -1,191 +0,0 @@ -# type: ignore -import typing - -from streamlit.delta_generator import DeltaGenerator - -from testgen.ui.forms import BaseForm, Field, ManualRender - -SQLFlavor = typing.Literal["redshift", "snowflake", "mssql", "postgresql"] - - -class TableGroupForm(BaseForm, ManualRender): - table_groups_name: str = Field( - default="", - min_length=1, - max_length=40, - st_kwargs_label="Table Group Name", - st_kwargs_max_chars=40, - st_kwargs_help="A unique name to describe the table group", - ) - profiling_include_mask: str = Field( - default="%", - max_length=40, - st_kwargs_label="Tables to Include Mask", - st_kwargs_max_chars=40, - st_kwargs_help="A SQL filter supported by your database's LIKE operator for table names to include", - ) - profiling_exclude_mask: str = Field( - default="tmp%", - st_kwargs_label="Tables to Exclude Mask", - st_kwargs_max_chars=40, - st_kwargs_help="A SQL filter supported by your database's LIKE operator for table names to exclude", - ) - profiling_table_set: str = Field( - default="", - st_kwargs_label="Explicit Table List", 
st_kwargs_max_chars=2000, - st_kwargs_help="A list of specific table names to include, separated by commas", - ) - table_group_schema: str = Field( - default="", - min_length=1, - max_length=40, - st_kwargs_label="Schema", - st_kwargs_max_chars=40, - st_kwargs_help="The database schema containing the tables in the Table Group", - ) - profile_id_column_mask: str = Field( - default="%_id", - st_kwargs_label="Profiling ID column mask", - st_kwargs_max_chars=40, - st_kwargs_help="A SQL filter supported by your database's LIKE operator representing ID columns (optional)", - ) - profile_sk_column_mask: str = Field( - default="%_sk", - st_kwargs_label="Profiling Surrogate Key column mask", - st_kwargs_max_chars=40, - st_kwargs_help="A SQL filter supported by your database's LIKE operator representing surrogate key columns (optional)", - ) - profiling_delay_days: int = Field( - default=0, - st_kwargs_label="Min Profiling Age, Days", - st_kwargs_min_value=0, - st_kwargs_max_value=999, - st_kwargs_help="The number of days to wait before new profiling will be available to generate tests", - ) - profile_flag_cdes: bool = Field( - default=True, - st_kwargs_label="Detect critical data elements (CDEs) during profiling", - ) - add_scorecard_definition: bool = Field( - default=True, - st_kwargs_label="Add scorecard for table group", - st_kwargs_help="Add a new scorecard to the Quality Dashboard upon creation of this table group", - ) - profile_use_sampling: bool = Field( - default=True, - st_kwargs_label="Use profile sampling", - st_kwargs_help="Toggle on to base profiling on a sample of records instead of the full table", - ) - profile_sample_percent: int = Field( - default=30, - st_kwargs_label="Sample percent", - st_kwargs_min_value=1, - st_kwargs_max_value=100, - st_kwargs_help="Percent of records to include in the sample, unless the calculated count falls below the specified minimum.", - ) - profile_sample_min_count: int = Field( - default=15000, - st_kwargs_label="Min Sample 
Record Count", - st_kwargs_min_value=1, - st_kwargs_max_value=1000000, - st_kwargs_help="The minimum number of records to be included in any sample (if available)", - ) - description: str = Field( - default="", - st_kwargs_label="Description", - st_kwargs_max_chars=1000, - st_kwargs_help="", - ) - data_source: str = Field( - default="", - st_kwargs_label="Data Source", - st_kwargs_max_chars=40, - st_kwargs_help="Original source of the dataset", - ) - source_system: str = Field( - default="", - st_kwargs_label="Source System", - st_kwargs_max_chars=40, - st_kwargs_help="Enterprise system source for the dataset", - ) - source_process: str = Field( - default="", - st_kwargs_label="Source Process", - st_kwargs_max_chars=40, - st_kwargs_help="Process, program, or data flow that produced the dataset", - ) - data_location: str = Field( - default="", - st_kwargs_label="Location", - st_kwargs_max_chars=40, - st_kwargs_help="Physical or virtual location of the dataset, e.g., Headquarters, Cloud", - ) - business_domain: str = Field( - default="", - st_kwargs_label="Business Domain", - st_kwargs_max_chars=40, - st_kwargs_help="Business division responsible for the dataset, e.g., Finance, Sales, Manufacturing", - ) - stakeholder_group: str = Field( - default="", - st_kwargs_label="Stakeholder Group", - st_kwargs_max_chars=40, - st_kwargs_help="Data owners or stakeholders responsible for the dataset", - ) - transform_level: str = Field( - default="", - st_kwargs_label="Transform Level", - st_kwargs_max_chars=40, - st_kwargs_help="Data warehouse processing stage, e.g., Raw, Conformed, Processed, Reporting, or Medallion level (bronze, silver, gold)", - ) - data_product: str = Field( - default="", - st_kwargs_label="Data Product", - st_kwargs_max_chars=40, - st_kwargs_help="Data domain that comprises the dataset", - ) - table_group_id: int | None = Field(default=None) - - def form_key(self): - return f"table_group_form:{self.table_group_id or 'new'}" - - def render_input_ui(self, 
container: DeltaGenerator, _: dict) -> "TableGroupForm": - left_column, right_column = container.columns([.5, .5]) - - self.render_field("table_groups_name", left_column) - self.render_field("profiling_include_mask", left_column) - self.render_field("profiling_exclude_mask", left_column) - self.render_field("profiling_table_set", left_column) - self.render_field("profile_flag_cdes", left_column) - - self.render_field("table_group_schema", right_column) - self.render_field("profile_id_column_mask", right_column) - self.render_field("profile_sk_column_mask", right_column) - self.render_field("profiling_delay_days", right_column) - - if not self.table_group_id: - self.render_field("add_scorecard_definition", right_column) - - self.render_field("profile_use_sampling", container) - profile_sampling_expander = container.expander("Sampling Parameters", expanded=False) - with profile_sampling_expander: - expander_left_column, expander_right_column = profile_sampling_expander.columns([0.50, 0.50]) - self.render_field("profile_sample_percent", expander_left_column) - self.render_field("profile_sample_min_count", expander_right_column) - - tags_expander = container.expander("Table Group Tags", expanded=False) - with tags_expander: - self.render_field("description", tags_expander) - tags_left_column, tags_right_column = tags_expander.columns([0.50, 0.50]) - - self.render_field("data_source", tags_left_column) - self.render_field("source_system", tags_right_column) - self.render_field("source_process", tags_left_column) - self.render_field("data_location", tags_right_column) - self.render_field("business_domain", tags_left_column) - self.render_field("stakeholder_group", tags_right_column) - self.render_field("transform_level", tags_left_column) - self.render_field("data_product", tags_right_column) - - return self diff --git a/testgen/ui/views/table_groups/page.py b/testgen/ui/views/table_groups/page.py deleted file mode 100644 index 5d4a50d7..00000000 --- 
a/testgen/ui/views/table_groups/page.py +++ /dev/null @@ -1,475 +0,0 @@ -import time -import typing -from functools import partial - -import pandas as pd -import streamlit as st -from sqlalchemy.exc import IntegrityError - -import testgen.ui.services.connection_service as connection_service -import testgen.ui.services.form_service as fm -import testgen.ui.services.table_group_service as table_group_service -from testgen.common.models import with_database_session -from testgen.ui.components import widgets as testgen -from testgen.ui.navigation.page import Page -from testgen.ui.services import project_service, user_session_service -from testgen.ui.services.string_service import empty_if_null -from testgen.ui.session import session -from testgen.ui.views.dialogs.run_profiling_dialog import run_profiling_dialog - - -class TableGroupsPage(Page): - path = "connections:table-groups" - can_activate: typing.ClassVar = [ - lambda: session.authentication_status, - lambda: not user_session_service.user_has_catalog_role(), - lambda: "connection_id" in st.query_params or "connections", - ] - - def render(self, connection_id: str, **_kwargs) -> None: - connection = connection_service.get_by_id(connection_id, hide_passwords=False) - if not connection: - return self.router.navigate_with_warning( - f"Connection with ID '{connection_id}' does not exist. 
Redirecting to list of Connections ...", - "connections", - ) - - project_code = connection["project_code"] - project_service.set_sidebar_project(project_code) - user_can_edit = user_session_service.user_can_edit() - - testgen.page_header( - "Table Groups", - "create-a-table-group", - breadcrumbs=[ # type: ignore - { "label": "Connections", "path": "connections", "params": { "project_code": project_code } }, - { "label": connection["connection_name"] }, - ], - ) - - df = table_group_service.get_by_connection(project_code, connection_id) - - if df.empty: - testgen.whitespace(3) - testgen.empty_state( - label="No table groups yet", - icon="table_view", - message=testgen.EmptyStateMessage.TableGroup, - action_label="Add Table Group", - action_disabled=not user_can_edit, - button_onclick=partial(self.add_table_group_dialog, project_code, connection), - ) - return - - testgen.whitespace(0.3) - _, actions_column = st.columns([.1, .9], vertical_alignment="bottom") - testgen.flex_row_end(actions_column) - - if user_can_edit: - actions_column.button( - ":material/add: Add Table Group", - on_click=partial(self.add_table_group_dialog, project_code, connection) - ) - - for _, table_group in df.iterrows(): - with testgen.card(title=table_group["table_groups_name"]) as table_group_card: - if user_can_edit: - with table_group_card.actions: - testgen.button( - type_="icon", - icon="edit", - tooltip="Edit table group", - tooltip_position="right", - on_click=partial(self.edit_table_group_dialog, project_code, connection, table_group), - key=f"tablegroups:keys:edit:{table_group['id']}", - ) - testgen.button( - type_="icon", - icon="delete", - tooltip="Delete table group", - tooltip_position="right", - on_click=partial(self.delete_table_group_dialog, table_group), - key=f"tablegroups:keys:delete:{table_group['id']}", - ) - - main_section, actions_section = st.columns([.8, .2]) - - with main_section: - testgen.link( - label="Test Suites", - href="test-suites", - params={ 
"project_code": project_code, "table_group_id": table_group["id"] }, - right_icon="chevron_right", - key=f"tablegroups:keys:go-to-tsuites:{table_group['id']}", - ) - - col1, col2, col3 = st.columns([1/3] * 3, vertical_alignment="bottom") - col4, col5, col6 = st.columns([1/3] * 3, vertical_alignment="bottom") - - with col1: - testgen.no_flex_gap() - testgen.caption("DB Schema") - st.markdown(table_group["table_group_schema"] or "--") - with col2: - testgen.no_flex_gap() - testgen.caption("Tables to Include Mask") - st.markdown(table_group["profiling_include_mask"] or "--") - with col3: - testgen.no_flex_gap() - testgen.caption("Tables to Exclude Mask") - st.markdown(table_group["profiling_exclude_mask"] or "--") - with col4: - testgen.no_flex_gap() - testgen.caption("Explicit Table List") - st.markdown(table_group["profiling_table_set"] or "--") - with col5: - testgen.no_flex_gap() - testgen.caption("Uses Record Sampling") - st.markdown(table_group["profile_use_sampling"] or "N") - with col6: - testgen.no_flex_gap() - testgen.caption("Min Profiling Age (Days)") - st.markdown(table_group["profiling_delay_days"] or "0") - - if user_can_edit: - with actions_section: - testgen.button( - type_="stroked", - label="Run Profiling", - on_click=partial(run_profiling_dialog, project_code, table_group), - key=f"tablegroups:keys:runprofiling:{table_group['id']}", - ) - - @st.dialog(title="Add Table Group") - @with_database_session - def add_table_group_dialog(self, project_code, connection): - show_table_group_form("add", project_code, connection) - - @st.dialog(title="Edit Table Group") - def edit_table_group_dialog(self, project_code: str, connection: dict, table_group: pd.Series): - show_table_group_form("edit", project_code, connection, table_group) - - @st.dialog(title="Delete Table Group") - def delete_table_group_dialog(self, table_group: pd.Series): - table_group_name = table_group["table_groups_name"] - can_be_deleted = 
table_group_service.cascade_delete([table_group_name], dry_run=True) - - fm.render_html_list( - table_group, - [ - "id", - "table_groups_name", - "table_group_schema", - ], - "Table Group Information", - int_data_width=700, - ) - - if not can_be_deleted: - st.html( - """ -
- - This Table Group has related data, which may include profiling, test definitions and test results. - If you proceed, all related data will be permanently deleted. - -
- Are you sure you want to proceed? -
- """ - ) - accept_cascade_delete = st.toggle("I accept deletion of this Table Group and all related TestGen data.") - - with st.form("Delete Table Group", clear_on_submit=True, border=False): - _, button_column = st.columns([.85, .15]) - with button_column: - delete = st.form_submit_button( - "Delete", - disabled=not can_be_deleted and not accept_cascade_delete, - type="primary", - use_container_width=True, - ) - - if delete: - if table_group_service.are_table_groups_in_use([table_group_name]): - st.error("This Table Group is in use by a running process and cannot be deleted.") - else: - table_group_service.cascade_delete([table_group_name]) - success_message = f"Table Group {table_group_name} has been deleted. " - st.success(success_message) - time.sleep(1) - st.rerun() - - -def show_table_group_form(mode, project_code: str, connection: dict, table_group: pd.Series | None = None): - connection_id = connection["connection_id"] - table_groups_settings_tab, table_groups_preview_tab = st.tabs(["Table Group Settings", "Test"]) - - table_group_id = None - table_groups_name = "" - table_group_schema = "" - profiling_table_set = "" - profiling_include_mask = "%" - profiling_exclude_mask = "tmp%" - profile_id_column_mask = "%_id" - profile_sk_column_mask = "%_sk" - profile_use_sampling = False - profile_sample_percent = 30 - profile_sample_min_count = 15000 - profiling_delay_days = 0 - profile_flag_cdes = True - - with table_groups_settings_tab: - selected_table_group = table_group if mode == "edit" else None - - if selected_table_group is not None: - # establish default values - table_group_id = selected_table_group["id"] - table_groups_name = selected_table_group["table_groups_name"] - table_group_schema = selected_table_group["table_group_schema"] - profiling_table_set = selected_table_group["profiling_table_set"] or "" - profiling_include_mask = selected_table_group["profiling_include_mask"] - profiling_exclude_mask = selected_table_group["profiling_exclude_mask"] - 
profile_id_column_mask = selected_table_group["profile_id_column_mask"] - profile_sk_column_mask = selected_table_group["profile_sk_column_mask"] - profile_use_sampling = selected_table_group["profile_use_sampling"] == "Y" - profile_sample_percent = int(selected_table_group["profile_sample_percent"]) - profile_sample_min_count = int(selected_table_group["profile_sample_min_count"]) - profiling_delay_days = int(selected_table_group["profiling_delay_days"]) - profile_flag_cdes = selected_table_group["profile_flag_cdes"] - - left_column, right_column = st.columns([0.50, 0.50]) - - profile_sampling_expander = st.expander("Sampling Parameters", expanded=False) - with profile_sampling_expander: - expander_left_column, expander_right_column = st.columns([0.50, 0.50]) - - table_group_tags_expander = st.expander("Table Group Tags", expanded=False) - with table_group_tags_expander: - full_width_column = st.container() - tags_left_column, tags_right_column = st.columns([0.5, 0.5], vertical_alignment="bottom") - - with st.form("Table Group Add / Edit", clear_on_submit=True, border=False): - entity = { - "id": table_group_id, - "project_code": project_code, - "connection_id": connection["connection_id"], - "table_groups_name": left_column.text_input( - label="Name", - max_chars=40, - value=table_groups_name, - help="A unique name to describe the table group", - ), - "profiling_include_mask": left_column.text_input( - label="Tables to Include Mask", - max_chars=40, - value=profiling_include_mask, - help="A SQL filter supported by your database's LIKE operator for table names to include", - ), - "profiling_exclude_mask": left_column.text_input( - label="Tables to Exclude Mask", - max_chars=40, - value=profiling_exclude_mask, - help="A SQL filter supported by your database's LIKE operator for table names to exclude", - ), - "profiling_table_set": left_column.text_input( - label="Explicit Table List", - max_chars=2000, - value=profiling_table_set, - help="A list of specific table 
names to include, separated by commas", - ), - "table_group_schema": right_column.text_input( - label="Schema", - max_chars=40, - value=table_group_schema, - help="The database schema containing the tables in the Table Group", - ), - "profile_id_column_mask": right_column.text_input( - label="Profiling ID column mask", - max_chars=40, - value=profile_id_column_mask, - help="A SQL filter supported by your database's LIKE operator representing ID columns (optional)", - ), - "profile_sk_column_mask": right_column.text_input( - label="Profiling Surrogate Key column mask", - max_chars=40, - value=profile_sk_column_mask, - help="A SQL filter supported by your database's LIKE operator representing surrogate key columns (optional)", - ), - "profiling_delay_days": right_column.number_input( - label="Min Profiling Age, Days", - min_value=0, - max_value=999, - value=profiling_delay_days, - help="The number of days to wait before new profiling will be available to generate tests", - ), - "profile_flag_cdes": left_column.checkbox( - "Detect critical data elements (CDEs) during profiling", - value=profile_flag_cdes, - ), - "add_scorecard_definition": right_column.checkbox( - "Add scorecard for table group", - value=True, - help="Add a new scorecard to the Quality Dashboard upon creation of this table group", - ) if mode != "edit" else None, - "profile_use_sampling": left_column.checkbox( - "Use profile sampling", - value=profile_use_sampling, - help="Toggle on to base profiling on a sample of records instead of the full table", - ), - "profile_sample_percent": str( - expander_left_column.number_input( - label="Sample percent", - min_value=1, - max_value=100, - value=profile_sample_percent, - help="Percent of records to include in the sample, unless the calculated count falls below the specified minimum.", - ) - ), - "profile_sample_min_count": expander_right_column.number_input( - label="Min Sample Record Count", - min_value=1, - max_value=1000000, - 
value=profile_sample_min_count, - help="The minimum number of records to be included in any sample (if available)", - ), - "description": full_width_column.text_input( - label="Description", - max_chars=1000, - value=empty_if_null(selected_table_group["description"]) - if mode == "edit" and selected_table_group is not None else "", - ), - "data_source": tags_left_column.text_input( - label="Data Source", - max_chars=40, - value=empty_if_null(selected_table_group["data_source"]) - if mode == "edit" and selected_table_group is not None else "", - help="Original source of the dataset", - ), - "source_system": tags_right_column.text_input( - label="Source System", - max_chars=40, - value=empty_if_null(selected_table_group["source_system"]) - if mode == "edit" and selected_table_group is not None else "", - help="Enterprise system source for the dataset", - ), - "source_process": tags_left_column.text_input( - label="Source Process", - max_chars=40, - value=empty_if_null(selected_table_group["source_process"]) - if mode == "edit" and selected_table_group is not None else "", - help="Process, program, or data flow that produced the dataset", - ), - "data_location": tags_right_column.text_input( - label="Data Location", - max_chars=40, - value=empty_if_null(selected_table_group["data_location"]) - if mode == "edit" and selected_table_group is not None else "", - help="Physical or virtual location of the dataset, e.g., Headquarters, Cloud", - ), - "business_domain": tags_left_column.text_input( - label="Business Domain", - max_chars=40, - value=empty_if_null(selected_table_group["business_domain"]) - if mode == "edit" and selected_table_group is not None else "", - help="Business division responsible for the dataset, e.g., Finance, Sales, Manufacturing", - ), - "stakeholder_group": tags_right_column.text_input( - label="Stakeholder Group", - max_chars=40, - value=empty_if_null(selected_table_group["stakeholder_group"]) - if mode == "edit" and selected_table_group is not 
None else "", - help="Data owners or stakeholders responsible for the dataset", - ), - "transform_level": tags_left_column.text_input( - label="Transform Level", - max_chars=40, - value=empty_if_null(selected_table_group["transform_level"]) - if mode == "edit" and selected_table_group is not None else "", - help="Data warehouse processing stage, e.g., Raw, Conformed, Processed, Reporting, or Medallion level (bronze, silver, gold)", - ), - "data_product": tags_right_column.text_input( - label="Data Product", - max_chars=40, - value=empty_if_null(selected_table_group["data_product"]) - if mode == "edit" and selected_table_group is not None else "", - help="Data domain that comprises the dataset" - ), - } - - _, button_column = st.columns([.85, .15]) - with button_column: - submit = st.form_submit_button( - "Save" if mode == "edit" else "Add", - use_container_width=True, - ) - - if submit: - if not entity["table_groups_name"]: - st.error("'Name' is required. ") - return - - try: - if mode == "edit": - table_group_service.edit(entity) - success_message = "Changes have been saved successfully. " - else: - table_group_service.add(entity) - success_message = "New table group added successfully. " - except IntegrityError: - st.error("A Table Group with the same name already exists. ") - return - else: - st.success(success_message) - time.sleep(1) - st.rerun() - - with table_groups_preview_tab: - if mode == "edit": - preview_left_column, preview_right_column = st.columns([0.5, 0.5]) - status_preview = preview_right_column.empty() - preview = preview_left_column.button("Test Table Group") - if preview: - table_group_preview(entity, connection_id, project_code, status_preview) - else: - st.write("No preview available while adding a Table Group. 
Save the configuration first.") - - -def table_group_preview(entity, connection_id, project_code, status): - status.empty() - status.info("Connecting to the Table Group ...") - try: - table_group_results = table_group_service.test_table_group(entity, connection_id, project_code) - if len(table_group_results) > 0: - tables = set() - columns = [] - schemas = set() - for result in table_group_results: - schemas.add(result["table_schema"]) - tables.add(result["table_name"]) - columns.append(result["column_name"]) - - show_test_results(schemas, tables, columns) - - status.empty() - status.success("Operation has finished successfully.") - else: - status.empty() - status.error("Operation was unsuccessful.") - error_message = "" - if len(table_group_results) == 0: - error_message = "Result is empty." - st.text_area("Table Group Error Details", value=error_message) - except Exception as e: - status.empty() - status.error("Error testing the Table Group.") - error_message = e.args[0] - st.text_area("Table Group Error Details", value=error_message) - - -def show_test_results(schemas, tables, columns): - st.markdown(f"**Schema**: {schemas.pop()}") - st.markdown(f"**Column Count**: {len(columns)}") - - tables_df = pd.DataFrame({"[tables]": list(tables)}) - fm.render_grid_select(tables_df, ["[tables]"]) diff --git a/testgen/ui/views/test_definitions.py b/testgen/ui/views/test_definitions.py index 8f4e6ad8..15157e92 100644 --- a/testgen/ui/views/test_definitions.py +++ b/testgen/ui/views/test_definitions.py @@ -2,9 +2,11 @@ import time import typing from datetime import datetime +from functools import partial import pandas as pd import streamlit as st +from streamlit.delta_generator import DeltaGenerator from streamlit_extras.no_default_selectbox import selectbox import testgen.ui.services.database_service as db @@ -21,6 +23,7 @@ download_dialog, get_excel_file_data, ) +from testgen.ui.components.widgets.page import css_class, flex_row_end from testgen.ui.navigation.page import 
Page from testgen.ui.services import project_service, user_session_service from testgen.ui.services.string_service import empty_if_null, snake_case_to_title_case @@ -70,22 +73,24 @@ def render(self, test_suite_id: str, table_name: str | None = None, column_name: with table_filter_column: columns_df = get_test_suite_columns(test_suite_id) + table_options = list(columns_df["table_name"].unique()) table_name = testgen.select( - options=list(columns_df["table_name"].unique()), + options=table_options, value_column="table_name", - default_value=table_name, + default_value=table_name or (table_options[0] if table_options else None), bind_to_query="table_name", required=True, label="Table Name", ) with column_filter_column: - column_options = list(columns_df.loc[columns_df["table_name"] == table_name]["column_name"].unique()) + column_options = columns_df.loc[columns_df["table_name"] == table_name]["column_name"].dropna().unique().tolist() column_name = testgen.select( options=column_options, default_value=column_name, bind_to_query="column_name", label="Column Name", disabled=not table_name, + accept_new_options=True, ) with disposition_column: @@ -121,12 +126,16 @@ def render(self, test_suite_id: str, table_name: str | None = None, column_name: # This has to be done as a second loop - otherwise, the rest of the buttons after the clicked one are not displayed briefly while refreshing for action in disposition_actions: if action["button"]: - fm.reset_post_updates( - update_test_definition(selected, action["attribute"], action["value"], action["message"]), - as_toast=True, - clear_cache=True, - lst_cached_functions=[], - ) + is_unlocking = action["attribute"] == "lock_refresh" and not action["value"] + if is_unlocking: + confirm_unlocking_test_definition(selected) + else: + fm.reset_post_updates( + update_test_definition(selected, action["attribute"], action["value"], action["message"]), + as_toast=True, + clear_cache=True, + lst_cached_functions=[], + ) if selected: 
selected_test_def = selected[0] @@ -262,6 +271,9 @@ def show_test_form( test_definition_status = selected_test_def["test_definition_status"] if mode == "edit" else "" check_result = selected_test_def["check_result"] if mode == "edit" else None column_name = empty_if_null(selected_test_def["column_name"]) if mode == "edit" else "" + last_auto_gen_date = empty_if_null(selected_test_def["last_auto_gen_date"]) if mode == "edit" else "" + profiling_as_of_date = empty_if_null(selected_test_def["profiling_as_of_date"]) if mode == "edit" else "" + profile_run_id = empty_if_null(selected_test_def["profile_run_id"]) if mode == "edit" else "" # dynamic attributes custom_query = empty_if_null(selected_test_def["custom_query"]) if mode == "edit" else "" @@ -273,6 +285,8 @@ def show_test_form( baseline_sum = empty_if_null(selected_test_def["baseline_sum"]) if mode == "edit" else "" baseline_avg = empty_if_null(selected_test_def["baseline_avg"]) if mode == "edit" else "" baseline_sd = empty_if_null(selected_test_def["baseline_sd"]) if mode == "edit" else "" + lower_tolerance = empty_if_null(selected_test_def["lower_tolerance"]) if mode == "edit" else 0 + upper_tolerance = empty_if_null(selected_test_def["upper_tolerance"]) if mode == "edit" else 0 subset_condition = empty_if_null(selected_test_def["subset_condition"]) if mode == "edit" else "" groupby_names = empty_if_null(selected_test_def["groupby_names"]) if mode == "edit" else "" having_condition = empty_if_null(selected_test_def["having_condition"]) if mode == "edit" else "" @@ -367,9 +381,6 @@ def show_test_form( value=lock_refresh, help="Protects test parameters from being overwritten when tests in this Test Suite are regenerated.", ), - "schema_name": right_column.text_input( - label="Schema Name", max_chars=100, value=schema_name, disabled=True - ), "test_active": left_column.toggle(label="Test Active", value=test_active), "check_result": check_result, "custom_query": custom_query, @@ -381,6 +392,8 @@ def show_test_form( 
"baseline_sum": baseline_sum, "baseline_avg": baseline_avg, "baseline_sd": baseline_sd, + "lower_tolerance": lower_tolerance, + "upper_tolerance": upper_tolerance, "subset_condition": subset_condition, "groupby_names": groupby_names, "having_condition": having_condition, @@ -421,10 +434,45 @@ def show_test_form( help=severity_help, ) + if mode == "edit": + columns = st.columns([0.5, 0.5]) + if profiling_as_of_date and profile_run_id and (container := columns.pop()): + if isinstance(profiling_as_of_date, str): + formatted_time = datetime.strptime(profiling_as_of_date, "%Y-%m-%d %H:%M:%S").strftime("%b %d, %I:%M %p") + else: + formatted_time = profiling_as_of_date.strftime("%b %d, %I:%M %p") + testgen.caption("Based on Profiling", container=container) + with container: + testgen.link( + href="profiling-runs:results", + params={"run_id": profile_run_id}, + label=formatted_time, + open_new=True, + ) + + if last_auto_gen_date and (container := columns.pop()): + if isinstance(last_auto_gen_date, str): + formatted_time = datetime.strptime(last_auto_gen_date, "%Y-%m-%d %H:%M:%S").strftime("%b %d, %I:%M %p") + else: + formatted_time = last_auto_gen_date.strftime("%b %d, %I:%M %p") + testgen.caption("Auto-generated at", container=container) + testgen.text( + formatted_time, + container=container, + ) + st.divider() + has_match_attributes = any(attribute.startswith("match_") for attribute in dynamic_attributes) + left_column, right_column = st.columns([0.5, 0.5]) if has_match_attributes else (st.container(), None) + + # schema_name + test_definition["schema_name"] = left_column.text_input( + label="Schema Name", max_chars=100, value=schema_name, disabled=True + ) + # table_name - test_definition["table_name"] = st.text_input( + test_definition["table_name"] = left_column.text_input( label="Table Name", max_chars=100, value=table_name, disabled=False ) @@ -443,7 +491,7 @@ def show_test_form( column_name_label = None elif test_scope == "referential": column_name_disabled = 
False - test_definition["column_name"] = st.text_input( + test_definition["column_name"] = left_column.text_input( label=column_name_label, value=column_name, max_chars=500, @@ -464,7 +512,7 @@ def show_test_form( else: # query edit not-present column_name_disabled = False - test_definition["column_name"] = st.text_input( + test_definition["column_name"] = left_column.text_input( label=column_name_label, value=column_name, max_chars=100, @@ -498,72 +546,93 @@ def show_test_form( disabled=column_name_disabled, ) - st.divider() + leftover_attributes = dynamic_attributes.copy() - # dynamic attributes - mid_left_column, mid_right_column = st.columns([0.5, 0.5]) + def render_dynamic_attribute(attribute: str, container: DeltaGenerator): + if not attribute in dynamic_attributes: + return + + numeric_attributes = ["threshold_value", "lower_tolerance", "upper_tolerance"] - current_column = mid_left_column - show_custom_query = False - dynamic_attributes_length = len(dynamic_attributes) - dynamic_attributes_half_length = max(round((dynamic_attributes_length + 0.5) / 2), 1) - for i, dynamic_attribute in enumerate(dynamic_attributes): - if i >= dynamic_attributes_half_length: - current_column = mid_right_column + default_value = 0 if attribute in numeric_attributes else "" + value = empty_if_null(selected_test_def[attribute]) if mode == "edit" else default_value - default_value = "" if dynamic_attribute != "threshold_value" else 0 - value = empty_if_null(selected_test_def[dynamic_attribute]) if mode == "edit" else default_value + index = dynamic_attributes.index(attribute) + leftover_attributes.remove(attribute) - actual_dynamic_attributes_labels = ( - dynamic_attributes_labels[i] - if dynamic_attributes_labels and len(dynamic_attributes_labels) > i - else "Help text is not available." 
+ label_text = ( + dynamic_attributes_labels[index] + if dynamic_attributes_labels and len(dynamic_attributes_labels) > index + else snake_case_to_title_case(attribute) ) - - actual_dynamic_attributes_help = ( - dynamic_attributes_help[i] - if dynamic_attributes_help and len(dynamic_attributes_help) > i - else snake_case_to_title_case(dynamic_attribute) + help_text = ( + dynamic_attributes_help[index] + if dynamic_attributes_help and len(dynamic_attributes_help) > index + else "Help text is not available." ) - if dynamic_attribute in ["custom_query"]: - show_custom_query = True - elif dynamic_attribute in ["threshold_value"]: - test_definition[dynamic_attribute] = current_column.number_input( - label=actual_dynamic_attributes_labels, + if attribute == "custom_query": + custom_query_placeholder = None + if test_type == "Condition_Flag": + custom_query_placeholder = "EXAMPLE: status = 'SHIPPED' and qty_shipped = 0" + elif test_type == "CUSTOM": + custom_query_placeholder = "EXAMPLE: SELECT product, SUM(qty_sold) as sum_sold, SUM(qty_shipped) as qty_shipped \n FROM {DATA_SCHEMA}.sales_history \n GROUP BY product \n HAVING SUM(qty_shipped) > SUM(qty_sold)" + + test_definition[attribute] = st.text_area( + label=label_text, + value=custom_query, + placeholder=custom_query_placeholder, + height=150 if test_type == "CUSTOM" else 75, + help=help_text, + ) + elif attribute in numeric_attributes: + test_definition[attribute] = container.number_input( + label=label_text, value=float(value), - help=actual_dynamic_attributes_help, + step=1.0, + help=help_text, ) else: - test_definition[dynamic_attribute] = current_column.text_input( - label=actual_dynamic_attributes_labels, - max_chars=4000 if dynamic_attribute in ["match_column_names", "match_groupby_names", "groupby_names"] else 1000, + test_definition[attribute] = container.text_input( + label=label_text, + max_chars=4000 if attribute in ["match_column_names", "match_groupby_names", "groupby_names"] else 1000, value=value, - 
help=actual_dynamic_attributes_help, + help=help_text, ) - # Custom Query - if show_custom_query: - if test_type == "Condition_Flag": - custom_query_default = "EXAMPLE: status = 'SHIPPED' and qty_shipped = 0" - custom_query_height = 75 - elif test_type == "CUSTOM": - custom_query_default = "EXAMPLE: SELECT product, SUM(qty_sold) as sum_sold, SUM(qty_shipped) as qty_shipped \n FROM {DATA_SCHEMA}.sales_history \n GROUP BY product \n HAVING SUM(qty_shipped) > SUM(qty_sold)" - custom_query_height = 150 - else: - custom_query_default = None - custom_query_height = 75 - test_definition["custom_query"] = st.text_area( - label=actual_dynamic_attributes_labels, - value=custom_query, - placeholder=custom_query_default, - height=custom_query_height, - help=actual_dynamic_attributes_help, + if has_match_attributes: + for attribute in ["match_schema_name", "match_table_name", "match_column_names"]: + render_dynamic_attribute(attribute, right_column) + + st.divider() + + mid_left_column, mid_right_column = st.columns([0.5, 0.5]) + + if has_match_attributes: + for attribute in ["subset_condition", "groupby_names", "having_condition"]: + if attribute in dynamic_attributes and f"match_{attribute}" in dynamic_attributes: + render_dynamic_attribute(attribute, mid_left_column) + render_dynamic_attribute(f"match_{attribute}", mid_right_column) + + if "custom_query" in dynamic_attributes: + render_dynamic_attribute("custom_query", mid_left_column) + + total_length = len(leftover_attributes) + half_length = round(total_length / 2) + for index, attribute in enumerate(leftover_attributes.copy()): + render_dynamic_attribute( + attribute, + mid_left_column if index == 0 or index < half_length else mid_right_column, ) # skip_errors if run_type == "QUERY": - test_definition["skip_errors"] = left_column.number_input(label="Threshold Error Count", value=skip_errors) + container = mid_right_column if total_length % 2 else mid_left_column + test_definition["skip_errors"] = container.number_input( 
+ label="Threshold Error Count", + value=skip_errors, + step=1, + ) else: test_definition["skip_errors"] = skip_errors @@ -614,7 +683,7 @@ def edit_test_dialog(project_code, table_group, test_suite, str_table_name, str_ def copy_move_test_dialog(project_code, origin_table_group, origin_test_suite, selected_test_definitions): st.text(f"Selected tests: {len(selected_test_definitions)}") - group_filter_column, suite_filter_column = st.columns([.5, .5], vertical_alignment="bottom") + group_filter_column, suite_filter_column, table_filter_column = st.columns([.33, .33, .33], vertical_alignment="bottom") with group_filter_column: table_groups_df = run_table_groups_lookup_query(project_code) @@ -623,24 +692,45 @@ def copy_move_test_dialog(project_code, origin_table_group, origin_test_suite, s value_column="id", display_column="table_groups_name", default_value=origin_table_group["id"], + required=True, label="Target Table Group", ) with suite_filter_column: test_suites_df = run_test_suite_lookup_query(target_table_group_id) - try: - origin_index = test_suites_df[test_suites_df["id"] == origin_test_suite["id"]].index - test_suites_df.drop(origin_index, inplace=True) - except KeyError: - pass target_test_suite_id = testgen.select( options=test_suites_df, value_column="id", display_column="test_suite", default_value=None, + required=True, label="Target Test Suite", ) + target_table_column = None + if target_test_suite_id == origin_test_suite["id"]: + with table_filter_column: + columns_df = get_test_suite_columns(origin_test_suite["id"]) + table_name = testgen.select( + options=list(columns_df["table_name"].unique()), + value_column="table_name", + default_value=None, + required=True, + label="Target Table Name", + ) + column_options = list(columns_df.loc[columns_df["table_name"] == table_name]["column_name"].unique()) + column_name = testgen.select( + options=column_options, + default_value=None, + required=True, + label="Column Name", + disabled=not table_name, + ) + 
target_table_column = { + "table_name": table_name, + "column_name":column_name + } + movable_test_definitions = [] if target_table_group_id and target_test_suite_id: collision_test_definitions = test_definition_service.get_test_definitions_collision(selected_test_definitions, target_table_group_id, target_test_suite_id) @@ -673,13 +763,13 @@ def copy_move_test_dialog(project_code, origin_table_group, origin_test_suite, s ) if move: - test_definition_service.move(movable_test_definitions, target_table_group_id, target_test_suite_id) + test_definition_service.move(movable_test_definitions, target_table_group_id, target_test_suite_id, target_table_column) success_message = "Test Definitions have been moved." st.success(success_message) time.sleep(1) st.rerun() elif copy: - test_definition_service.copy(movable_test_definitions, target_table_group_id, target_test_suite_id) + test_definition_service.copy(movable_test_definitions, target_table_group_id, target_test_suite_id, target_table_column) success_message = "Test Definitions have been copied." st.success(success_message) time.sleep(1) @@ -740,6 +830,37 @@ def prompt_for_test_type(): return str_value, row_selected +@st.dialog(title="Unlock Test Definition") +def confirm_unlocking_test_definition(test_definitions: list[dict]): + unlock_confirmed, set_unlock_confirmed = temp_value("test-definitions:confirm-unlock-tests") + + st.warning( + """Unlocked tests subject to auto-generation will be overwritten during the next test generation run.""" + ) + + st.html(f""" + Are you sure you want to unlock + {f"{len(test_definitions)} selected test definitions?" 
+ if len(test_definitions) > 1 + else "the selected test definition?"} + """) + + if unlock_confirmed(): + update_test_definition(test_definitions, "lock_refresh", False, "Test definitions have been unlocked.") + time.sleep(1) + st.rerun() + + _, button_column = st.columns([.85, .15]) + with button_column: + testgen.button( + label="Unlock", + type_="stroked", + color="basic", + key="test-definitions:confirm-unlock-tests-btn", + on_click=lambda: set_unlock_confirmed(True), + ) + + def update_test_definition(selected, attribute, value, message): result = None test_definition_ids = [row["id"] for row in selected if "id" in row] @@ -799,13 +920,28 @@ def show_test_defs_grid( bind_to_query_prop="id", ) - with export_container: - if st.button(label=":material/download: Export", help="Download filtered test definitions to Excel"): - download_dialog( - dialog_title="Download Excel Report", - file_content_func=get_excel_report_data, - args=(df, str_test_suite), - ) + popover_container = export_container.empty() + + def open_download_dialog(data: pd.DataFrame | None = None) -> None: + # Hack to programmatically close popover: https://github.com/streamlit/streamlit/issues/8265#issuecomment-3001655849 + with popover_container.container(): + flex_row_end() + st.button(label="Export", icon=":material/download:", disabled=True) + + download_dialog( + dialog_title="Download Excel Report", + file_content_func=get_excel_report_data, + args=(str_project_code, str_test_suite, data), + ) + + with popover_container.container(key="tg--export-popover"): + flex_row_end() + with st.popover(label="Export", icon=":material/download:", help="Download test definitions to Excel"): + css_class("tg--export-wrapper") + st.button(label="All tests", type="tertiary", on_click=open_download_dialog) + st.button(label="Filtered tests", type="tertiary", on_click=partial(open_download_dialog, df)) + if dct_selected_row: + st.button(label="Selected tests", type="tertiary", 
on_click=partial(open_download_dialog, pd.DataFrame(dct_selected_row))) if dct_selected_row: st.html("

 
") @@ -872,8 +1008,17 @@ def show_test_defs_grid( return dct_selected_row -def get_excel_report_data(update_progress: PROGRESS_UPDATE_TYPE, data: pd.DataFrame, test_suite: str) -> FILE_DATA_TYPE: - data = data.copy() +def get_excel_report_data( + update_progress: PROGRESS_UPDATE_TYPE, + project_code: str, + test_suite: str, + data: pd.DataFrame | None = None, +) -> FILE_DATA_TYPE: + if data is not None: + data = data.copy() + else: + data = test_definition_service.get_test_definitions(project_code, test_suite) + date_service.accommodate_dataframe_to_timezone(data, st.session_state) for key in ["test_active_display", "lock_refresh_display"]: data[key] = data[key].apply(lambda val: val if val == "Yes" else None) diff --git a/testgen/ui/views/test_results.py b/testgen/ui/views/test_results.py index 39373a40..8053babf 100644 --- a/testgen/ui/views/test_results.py +++ b/testgen/ui/views/test_results.py @@ -1,7 +1,8 @@ import typing -from datetime import date from functools import partial from io import BytesIO +from itertools import zip_longest +from operator import itemgetter import pandas as pd import plotly.express as px @@ -23,10 +24,11 @@ get_excel_file_data, zip_multi_file_data, ) +from testgen.ui.components.widgets.page import css_class, flex_row_end from testgen.ui.navigation.page import Page from testgen.ui.pdf.test_result_report import create_report from testgen.ui.services import project_service, test_definition_service, test_results_service, user_session_service -from testgen.ui.services.string_service import empty_if_null +from testgen.ui.services.string_service import empty_if_null, snake_case_to_title_case from testgen.ui.session import session from testgen.ui.views.dialogs.profiling_results_dialog import view_profiling_button from testgen.ui.views.test_definitions import show_test_form_by_id @@ -75,7 +77,7 @@ def render( summary_column, score_column, actions_column = st.columns([.4, .2, .4], vertical_alignment="bottom") status_filter_column, 
test_type_filter_column, table_filter_column, column_filter_column, sort_column, export_button_column = st.columns( - [.2, .2, .2, .2, .1, .1], vertical_alignment="bottom" + [.175, .175, .2, .2, .1, .15], vertical_alignment="bottom" ) testgen.flex_row_end(actions_column) @@ -94,11 +96,11 @@ def render( "Failed", "Warning", "Passed", + "Error", ] status = testgen.select( options=status_options, default_value=status or "Failed + Warning", - required=False, bind_to_query="status", bind_empty_value=True, label="Result Status", @@ -110,7 +112,6 @@ def render( value_column="test_type", display_column="test_name_short", default_value=test_type, - required=False, bind_to_query="test_type", label="Test Type", ) @@ -125,7 +126,9 @@ def render( ) with column_filter_column: - column_options = list(run_columns_df.loc[run_columns_df["table_name"] == table_name]["column_name"].unique()) + column_options = run_columns_df.loc[ + run_columns_df["table_name"] == table_name + ]["column_name"].dropna().unique().tolist() column_name = testgen.select( options=column_options, value_column="column_name", @@ -133,6 +136,7 @@ def render( bind_to_query="column_name", label="Column Name", disabled=not table_name, + accept_new_options=True, ) with sort_column: @@ -154,13 +158,15 @@ def render( match status: case "Failed + Warning": - status = "'Failed','Warning'" + status = ["Failed", "Warning"] case "Failed": - status = "'Failed'" + status = "Failed" case "Warning": - status = "'Warning'" + status = "Warning" case "Passed": - status = "'Passed'" + status = "Passed" + case "Error": + status = "Error" # Display main grid and retrieve selection selected = show_result_detail( @@ -289,7 +295,7 @@ def get_test_run_columns(test_run_id: str) -> pd.DataFrame: @st.cache_data(show_spinner=False) def get_test_results( run_id: str, - test_status: str | None = None, + test_status: str | list[str] | None = None, test_type_id: str | None = None, table_name: str | None = None, column_name: str | None = None, 
@@ -380,13 +386,13 @@ def get_test_definition(str_test_def_id): @st.cache_data(show_spinner=False) def do_source_data_lookup(selected_row): schema = st.session_state["dbschema"] - return test_results_service.do_source_data_lookup(schema, selected_row) + return test_results_service.do_source_data_lookup(schema, selected_row, limit=500) @st.cache_data(show_spinner=False) def do_source_data_lookup_custom(selected_row): schema = st.session_state["dbschema"] - return test_results_service.do_source_data_lookup_custom(schema, selected_row) + return test_results_service.do_source_data_lookup_custom(schema, selected_row, limit=500) @st.cache_data(show_spinner=False) @@ -395,79 +401,73 @@ def get_test_result_history(selected_row): return test_results_service.get_test_result_history(schema, selected_row) -def show_test_def_detail(str_test_def_id): - if not str_test_def_id: +def show_test_def_detail(test_def_id: str): + def readable_boolean(v: typing.Literal["Y", "N"]): + return "Yes" if v == "Y" else "No" + + if not test_def_id: st.warning("Test definition no longer exists.") return - df = get_test_definition(str_test_def_id) + df = get_test_definition(test_def_id) specs = [] if not df.empty: - # Get First Row - row = df.iloc[0] - - specs.append( - fm.FieldSpec( - "Usage Notes", - "usage_notes", - fm.FormWidget.text_area, - row["usage_notes"], - read_only=True, - text_multi_lines=7, - ) - ) - specs.append( - fm.FieldSpec( - "Threshold Value", - "threshold_value", - fm.FormWidget.number_input, - float(row["threshold_value"]) if row["threshold_value"] else None, - required=True, - ) - ) - - default_severity_choice = f"Test Default ({row['default_severity']})" - - spec = fm.FieldSpec("Test Result Urgency", "severity", fm.FormWidget.radio, row["severity"], required=True) - spec.lst_option_text = [default_severity_choice, "Warning", "Fail", "Log"] - spec.lst_option_values = [None, "Warning", "Fail", "Ignore"] - spec.show_horizontal = True - specs.append(spec) - - spec = 
fm.FieldSpec( - "Perform Test in Future Runs", "test_active", fm.FormWidget.radio, row["test_active"], required=True - ) - spec.lst_option_text = ["Yes", "No"] - spec.lst_option_values = ["Y", "N"] - spec.show_horizontal = True - specs.append(spec) - - spec = fm.FieldSpec( - "Lock from Refresh", "lock_refresh", fm.FormWidget.radio, row["lock_refresh"], required=True - ) - spec.lst_option_text = ["Unlocked", "Locked"] - spec.lst_option_values = ["N", "Y"] - spec.show_horizontal = True - specs.append(spec) - - specs.append(fm.FieldSpec("", "id", form_widget=fm.FormWidget.hidden, int_key=1, init_val=row["id"])) - - specs.append( - fm.FieldSpec( - "Last Manual Update", - "last_manual_update", - fm.FormWidget.date_input, - row["last_manual_update"], - date.today().strftime("%Y-%m-%d hh:mm:ss"), - read_only=True, - ) - ) - fm.render_form_by_field_specs( - None, - "test_definitions", - specs, - boo_display_only=True, + test_definition = df.iloc[0] + row = test_definition + + dynamic_attributes_labels_raw: str = test_definition["default_parm_prompts"] + if not dynamic_attributes_labels_raw: + dynamic_attributes_labels_raw = "" + dynamic_attributes_labels = dynamic_attributes_labels_raw.split(",") + + dynamic_attributes_raw: str = test_definition["default_parm_columns"] + dynamic_attributes_fields = dynamic_attributes_raw.split(",") + dynamic_attributes_values = itemgetter(*dynamic_attributes_fields)(test_definition)\ + if len(dynamic_attributes_fields) > 1\ + else (test_definition[dynamic_attributes_fields[0]],) + + for field_name in dynamic_attributes_fields[len(dynamic_attributes_labels):]: + dynamic_attributes_labels.append(snake_case_to_title_case(field_name)) + + dynamic_attributes_help_raw: str = test_definition["default_parm_help"] + if not dynamic_attributes_help_raw: + dynamic_attributes_help_raw = "" + dynamic_attributes_help = dynamic_attributes_help_raw.split("|") + + testgen.testgen_component( + "test_definition_summary", + props={ + "test_definition": { + 
"schema": test_definition["schema_name"], + "test_suite_name": test_definition["test_suite_name"], + "table_name": test_definition["table_name"], + "test_focus": test_definition["column_name"], + "export_to_observability": readable_boolean(test_definition["export_to_observability"]) + if test_definition["export_to_observability"] + else f"Inherited ({readable_boolean(test_definition["default_export_to_observability"])})", + "severity": test_definition["severity"] or f"Test Default ({test_definition['default_severity']})", + "locked": readable_boolean(test_definition["lock_refresh"]), + "active": readable_boolean(test_definition["test_active"]), + "status": test_definition["status"], + "usage_notes": test_definition["usage_notes"], + "last_manual_update": test_definition["last_manual_update"].isoformat() + if test_definition["last_manual_update"] + else None, + "custom_query": test_definition["custom_query"] + if "custom_query" in dynamic_attributes_fields + else None, + "attributes": [ + {"label": label, "value": value, "help": help_} + for label, value, help_ in zip_longest( + dynamic_attributes_labels, + dynamic_attributes_values, + dynamic_attributes_help, + ) + if label and value + ], + }, + }, ) @@ -502,6 +502,7 @@ def show_result_detail( "measure_uom", "result_status", "action", + "result_message", ] lst_show_headers = [ @@ -512,6 +513,7 @@ def show_result_detail( "UOM", "Status", "Action", + "Details", ] selected_rows = fm.render_grid_select( @@ -523,13 +525,28 @@ def show_result_detail( bind_to_query_prop="test_result_id", ) - with export_container: - if st.button(label=":material/download: Export", help="Download filtered test results to Excel"): - download_dialog( - dialog_title="Download Excel Report", - file_content_func=get_excel_report_data, - args=(df, test_suite, run_date), - ) + popover_container = export_container.empty() + + def open_download_dialog(data: pd.DataFrame | None = None) -> None: + # Hack to programmatically close popover: 
https://github.com/streamlit/streamlit/issues/8265#issuecomment-3001655849 + with popover_container.container(): + flex_row_end() + st.button(label="Export", icon=":material/download:", disabled=True) + + download_dialog( + dialog_title="Download Excel Report", + file_content_func=get_excel_report_data, + args=(test_suite, run_date, run_id, data), + ) + + with popover_container.container(key="tg--export-popover"): + flex_row_end() + with st.popover(label="Export", icon=":material/download:", help="Download test results to Excel"): + css_class("tg--export-wrapper") + st.button(label="All tests", type="tertiary", on_click=open_download_dialog) + st.button(label="Filtered tests", type="tertiary", on_click=partial(open_download_dialog, df)) + if selected_rows: + st.button(label="Selected tests", type="tertiary", on_click=partial(open_download_dialog, pd.DataFrame(selected_rows))) # Display history and detail for selected row if not selected_rows: @@ -628,10 +645,14 @@ def show_result_detail( def get_excel_report_data( update_progress: PROGRESS_UPDATE_TYPE, - data: pd.DataFrame, test_suite: str, run_date: str, + run_id: str, + data: pd.DataFrame | None = None, ) -> FILE_DATA_TYPE: + if data is None: + data = get_test_results(run_id) + columns = { "schema_name": {"header": "Schema"}, "table_name": {"header": "Table"}, diff --git a/testgen/ui/views/test_runs.py b/testgen/ui/views/test_runs.py index 5bd8888f..0b50d649 100644 --- a/testgen/ui/views/test_runs.py +++ b/testgen/ui/views/test_runs.py @@ -1,3 +1,4 @@ +import logging import typing from functools import partial @@ -8,13 +9,14 @@ import testgen.ui.services.database_service as db import testgen.ui.services.form_service as fm import testgen.ui.services.query_service as dq +from testgen.common.models import with_database_session from testgen.ui.components import widgets as testgen from testgen.ui.components.widgets import testgen_component from testgen.ui.navigation.menu import MenuItem from testgen.ui.navigation.page 
import Page from testgen.ui.queries import project_queries, test_run_queries from testgen.ui.services import user_session_service -from testgen.ui.session import session +from testgen.ui.session import session, temp_value from testgen.ui.views.dialogs.manage_schedules import ScheduleDialog from testgen.ui.views.dialogs.run_tests_dialog import run_tests_dialog from testgen.utils import friendly_score, to_int @@ -22,6 +24,7 @@ PAGE_SIZE = 50 PAGE_ICON = "labs" PAGE_TITLE = "Test Runs" +LOG = logging.getLogger("testgen") class TestRunsPage(Page): @@ -60,6 +63,7 @@ def render(self, project_code: str, table_group_id: str | None = None, test_suit default_value=table_group_id, bind_to_query="table_group_id", label="Table Group", + placeholder="---", ) with suite_filter_column: @@ -71,6 +75,7 @@ def render(self, project_code: str, table_group_id: str | None = None, test_suit default_value=test_suite_id, bind_to_query="test_suite_id", label="Test Suite", + placeholder="---", ) with actions_column: @@ -78,7 +83,7 @@ def render(self, project_code: str, table_group_id: str | None = None, test_suit st.button( ":material/today: Test Run Schedules", - help="Manages when a test suite should run.", + help="Manage when test suites should run", on_click=partial(TestRunScheduleDialog().open, project_code) ) @@ -106,9 +111,13 @@ def render(self, project_code: str, table_group_id: str | None = None, test_suit "items": paginated_df.to_json(orient="records"), "permissions": { "can_run": user_can_run, + "can_edit": user_can_run, }, }, - event_handlers={ "RunCanceled": on_cancel_run } + event_handlers={ + "RunCanceled": on_cancel_run, + "RunsDeleted": partial(on_delete_runs, project_code, table_group_id, test_suite_id), + } ) @@ -132,6 +141,7 @@ def arg_value_input(self) -> tuple[bool, list[typing.Any], dict[str, typing.Any] value_column="test_suite", display_column="test_suite", required=True, + placeholder="Select test suite", ) return bool(ts_name), [], {"project_key": self.project_code, 
"test_suite_key": ts_name} @@ -158,8 +168,11 @@ def render_empty_state(project_code: str, user_can_run: bool) -> bool: icon=PAGE_ICON, message=testgen.EmptyStateMessage.TableGroup, action_label="Go to Table Groups", - link_href="connections:table-groups", - link_params={ "connection_id": str(project_summary_df["default_connection_id"]) } + link_href="table-groups", + link_params={ + "project_code": project_code, + "connection_id": str(project_summary_df["default_connection_id"]), + } ) elif not project_summary_df["test_suites_ct"] or not project_summary_df["test_definitions_ct"]: testgen.empty_state( @@ -187,10 +200,61 @@ def on_cancel_run(test_run: pd.Series) -> None: process_status, process_message = process_service.kill_test_run(to_int(test_run["process_id"])) if process_status: test_run_queries.update_status(test_run["test_run_id"], "Cancelled") - fm.reset_post_updates(str_message=f":{'green' if process_status else 'red'}[{process_message}]", as_toast=True) +@st.dialog(title="Delete Test Runs") +@with_database_session +def on_delete_runs(project_code: str, table_group_id: str, test_suite_id: str, test_run_ids: list[str]) -> None: + def on_delete_confirmed(*_args) -> None: + set_delete_confirmed(True) + + message = f"Are you sure you want to delete the {len(test_run_ids)} selected test runs?" + constraint = { + "warning": "Any running processes will be canceled.", + "confirmation": "Yes, cancel and delete the test runs.", + } + if len(test_run_ids) == 1 and (test_run_id := test_run_ids[0]): + message = "Are you sure you want to delete the selected test run?" + constraint["confirmation"] = "Yes, cancel and delete the test run." 
+ + if not test_run_queries.is_running(test_run_ids): + constraint = None + + result = None + delete_confirmed, set_delete_confirmed = temp_value("test-runs:confirm-delete", default=False) + testgen.testgen_component( + "confirm_dialog", + props={ + "project_code": project_code, + "message": message, + "constraint": constraint, + "button_label": "Delete", + "button_color": "warn", + "result": result, + }, + on_change_handlers={ + "ActionConfirmed": on_delete_confirmed, + }, + ) + + if delete_confirmed(): + try: + with st.spinner("Deleting runs ..."): + test_runs = _get_db_test_runs(project_code, table_group_id, test_suite_id, test_runs_ids=test_run_ids) + for _, test_run in test_runs.iterrows(): + test_run_id = test_run["test_run_id"] + if test_run["status"] == "Running": + process_status, _ = process_service.kill_test_run(to_int(test_run["process_id"])) + if process_status: + test_run_queries.update_status(test_run_id, "Cancelled") + test_run_queries.cascade_delete_multiple_test_runs(test_run_ids) + st.rerun() + except Exception: + LOG.exception("Failed to delete test run") + result = {"success": False, "message": "Unable to delete the test run, try again."} + + @st.cache_data(show_spinner=False) def run_test_suite_lookup_query(schema: str, project_code: str, table_groups_id: str | None = None) -> pd.DataFrame: table_group_condition = f" AND test_suites.table_groups_id = '{table_groups_id}' " if table_groups_id else "" @@ -219,10 +283,32 @@ def get_db_test_suite_choices(project_code: str, table_groups_id: str | None = N @st.cache_data(show_spinner="Loading data ...") -def get_db_test_runs(project_code: str, table_groups_id: str | None = None, test_suite_id: str | None = None) -> pd.DataFrame: +def get_db_test_runs( + project_code: str, + table_groups_id: str | None = None, + test_suite_id: str | None = None, + test_runs_ids: list[str] | None = None, +) -> pd.DataFrame: + return _get_db_test_runs( + project_code, table_groups_id=table_groups_id, 
test_suite_id=test_suite_id, test_runs_ids=test_runs_ids + ) + + +def _get_db_test_runs( + project_code: str, + table_groups_id: str | None = None, + test_suite_id: str | None = None, + test_runs_ids: list[str] | None = None, +) -> pd.DataFrame: schema = st.session_state["dbschema"] table_group_condition = f" AND test_suites.table_groups_id = '{table_groups_id}' " if table_groups_id else "" test_suite_condition = f" AND test_suites.id = '{test_suite_id}' " if test_suite_id else "" + + test_runs_conditions = "" + if test_runs_ids and len(test_runs_ids) > 0: + test_runs_ids_ = [f"'{run_id}'" for run_id in test_runs_ids] + test_runs_conditions = f" AND test_runs.id::VARCHAR IN ({', '.join(test_runs_ids_)})" + sql = f""" WITH run_results AS ( SELECT test_run_id, @@ -286,6 +372,7 @@ def get_db_test_runs(project_code: str, table_groups_id: str | None = None, test WHERE test_suites.project_code = '{project_code}' {table_group_condition} {test_suite_condition} + {test_runs_conditions} ORDER BY test_runs.test_starttime DESC; """ diff --git a/testgen/ui/views/test_suites.py b/testgen/ui/views/test_suites.py index 524c74f5..13250493 100644 --- a/testgen/ui/views/test_suites.py +++ b/testgen/ui/views/test_suites.py @@ -18,6 +18,7 @@ from testgen.ui.session import session from testgen.ui.views.dialogs.generate_tests_dialog import generate_tests_dialog from testgen.ui.views.dialogs.run_tests_dialog import run_tests_dialog +from testgen.ui.views.test_runs import TestRunScheduleDialog from testgen.utils import format_field PAGE_ICON = "rule" @@ -96,6 +97,7 @@ def render(self, project_code: str, table_group_id: str | None = None, **_kwargs }, on_change_handlers={ "FilterApplied": on_test_suites_filtered, + "RunSchedulesClicked": lambda *_: TestRunScheduleDialog().open(project_code), "AddTestSuiteClicked": lambda *_: add_test_suite_dialog(project_code, table_groups), "ExportActionClicked": observability_export_dialog, "EditActionClicked": partial(edit_test_suite_dialog, 
project_code, table_groups), diff --git a/tests/unit/test_profiling_query.py b/tests/unit/test_profiling_query.py index 6bfb010b..826faad1 100644 --- a/tests/unit/test_profiling_query.py +++ b/tests/unit/test_profiling_query.py @@ -7,7 +7,7 @@ def test_include_exclude_mask_basic(): # test configuration project_code = "dummy_project_code" - flavor = "redshift" + flavor = "postgresql" profiling_query = CProfilingSQL(project_code, flavor) profiling_query.parm_table_set = "" profiling_query.parm_table_include_mask = "important%, %useful%" @@ -18,9 +18,9 @@ def test_include_exclude_mask_basic(): # test assertions assert "SELECT 'dummy_project_code'" in query - assert "AND ((c.table_name LIKE 'important%') OR (c.table_name LIKE '%useful%'))" in query + assert r"AND ((c.table_name LIKE 'important%' ) OR (c.table_name LIKE '%useful%' ))" in query assert ( - "AND NOT ((c.table_name LIKE 'temp%') OR (c.table_name LIKE 'tmp%') OR (c.table_name LIKE 'raw_slot_utilization%') OR (c.table_name LIKE 'gps_product_step_change_log'))" + r"AND NOT ((c.table_name LIKE 'temp%' ) OR (c.table_name LIKE 'tmp%' ) OR (c.table_name LIKE 'raw\_slot\_utilization%' ) OR (c.table_name LIKE 'gps\_product\_step\_change\_log' ))" in query ) @@ -30,7 +30,7 @@ def test_include_exclude_mask_basic(): def test_include_empty_exclude_mask(mask): # test configuration project_code = "dummy_project_code" - flavor = "redshift" + flavor = "snowflake" profiling_query = CProfilingSQL(project_code, flavor) profiling_query.parm_table_set = "" profiling_query.parm_table_include_mask = mask @@ -41,7 +41,7 @@ def test_include_empty_exclude_mask(mask): # test assertions assert ( - "AND NOT ((c.table_name LIKE 'temp%') OR (c.table_name LIKE 'tmp%') OR (c.table_name LIKE 'raw_slot_utilization%') OR (c.table_name LIKE 'gps_product_step_change_log'))" + r"AND NOT ((c.table_name LIKE 'temp%' ESCAPE '\\') OR (c.table_name LIKE 'tmp%' ESCAPE '\\') OR (c.table_name LIKE 'raw\\_slot\\_utilization%' ESCAPE '\\') OR (c.table_name 
LIKE 'gps\\_product\\_step\\_change\\_log' ESCAPE '\\')" in query ) @@ -51,14 +51,14 @@ def test_include_empty_exclude_mask(mask): def test_include_empty_include_mask(mask): # test configuration project_code = "dummy_project_code" - flavor = "redshift" + flavor = "mssql" profiling_query = CProfilingSQL(project_code, flavor) profiling_query.parm_table_set = "" - profiling_query.parm_table_include_mask = "important%, %useful%" + profiling_query.parm_table_include_mask = "important%, %useful_%" profiling_query.parm_table_exclude_mask = mask # test run query = profiling_query.GetDDFQuery() # test assertions - assert "AND ((c.table_name LIKE 'important%') OR (c.table_name LIKE '%useful%'))" in query + assert r"AND ((c.table_name LIKE 'important%' ) OR (c.table_name LIKE '%useful[_]%' ))" in query diff --git a/tests/unit/test_version_service.py b/tests/unit/test_version_service.py deleted file mode 100644 index b97890cf..00000000 --- a/tests/unit/test_version_service.py +++ /dev/null @@ -1,150 +0,0 @@ -from unittest import mock - -import pytest - -from testgen.common.version_service import get_latest_version - - -@pytest.mark.unit -@mock.patch("testgen.common.version_service.settings") -@mock.patch("testgen.common.version_service.requests") -def test_calls_pypi_api(requests: mock.Mock, settings: mock.Mock): - settings.CHECK_FOR_LATEST_VERSION = "pypi" - get_latest_version() - requests.get.assert_called_with("https://pypi.org/pypi/dataops-testgen/json", timeout=3) - - -@pytest.mark.unit -@mock.patch("testgen.common.version_service.settings") -@mock.patch("testgen.common.version_service.requests") -def test_return_unknown_when_pypi_request_fails(requests: mock.Mock, settings: mock.Mock): - response = mock.Mock() - response.status_code = 400 - requests.get.return_value = response - settings.CHECK_FOR_LATEST_VERSION = "pypi" - - assert get_latest_version() == "unknown" - - -@pytest.mark.unit -@mock.patch("testgen.common.version_service.settings") 
-@mock.patch("testgen.common.version_service.requests") -def test_get_the_latest_version_from_pypi(requests: mock.Mock, settings: mock.Mock): - response = mock.Mock() - response.status_code = 200 - requests.get.return_value = response - response.json.return_value = { - "releases": { - "0.0.1": "", - "0.1.0": "", - "1.0.0": "", - "1.1.0": "", - "v1.2.3": "", - "v1.2.0": "", - } - } - settings.CHECK_FOR_LATEST_VERSION = "pypi" - - assert get_latest_version() == "1.2.3" - - -@pytest.mark.unit -@mock.patch("testgen.common.version_service.settings") -@mock.patch("testgen.common.version_service.requests") -def test_calls_docker_tags_api(requests: mock.Mock, settings: mock.Mock): - settings.DOCKER_HUB_USERNAME = None - settings.DOCKER_HUB_PASSWORD = None - settings.DOCKER_HUB_REPOSITORY = "datakitchen/testgen-a" - settings.CHECK_FOR_LATEST_VERSION = "docker" - get_latest_version() - - requests.get.assert_called_with( - "https://hub.docker.com/v2/repositories/datakitchen/testgen-a/tags", - headers={}, - params={"page_size": 25, "page": 1, "ordering": "last_updated"}, - timeout=3, - ) - - -@pytest.mark.unit -@mock.patch("testgen.common.version_service.settings") -@mock.patch("testgen.common.version_service.requests") -def test_return_unknown_when_docker_request_fails(requests: mock.Mock, settings: mock.Mock): - response = mock.Mock() - response.status_code = 400 - requests.get.return_value = response - settings.DOCKER_HUB_USERNAME = None - settings.DOCKER_HUB_PASSWORD = None - settings.CHECK_FOR_LATEST_VERSION = "docker" - - assert get_latest_version() == "unknown" - - -@pytest.mark.unit -@mock.patch("testgen.common.version_service.settings") -@mock.patch("testgen.common.version_service.requests") -def test_get_the_latest_version_from_dockerhub(requests: mock.Mock, settings: mock.Mock): - settings.DOCKER_HUB_USERNAME = None - settings.DOCKER_HUB_PASSWORD = None - settings.CHECK_FOR_LATEST_VERSION = "docker" - - response = mock.Mock() - response.status_code = 200 - 
requests.get.return_value = response - response.json.return_value = { - "results": [ - {"name": "v0.0.1"}, - {"name": "v0.1.0"}, - {"name": "v1.0.0"}, - {"name": "v1.1.0"}, - {"name": "v1.2.0"}, - {"name": "v1.2.3-experimental"}, - ], - } - - assert get_latest_version() == "1.2.0" - -@pytest.mark.unit -@mock.patch("testgen.common.version_service.settings") -@mock.patch("testgen.common.version_service.requests") -def test_authenticates_docker_request(requests: mock.Mock, settings: mock.Mock): - username = settings.DOCKER_HUB_USERNAME = "docker-username" - password = settings.DOCKER_HUB_PASSWORD = "docker-password" # noqa: S105 - docker_auth_token = "docker-auth-token" # noqa: S105 - settings.CHECK_FOR_LATEST_VERSION = "docker" - settings.DOCKER_HUB_REPOSITORY = "datakitchen/testgen-b" - - response = mock.Mock() - response.status_code = 200 - response.json.return_value = {"token": docker_auth_token} - requests.post.return_value = response - - get_latest_version() - - requests.post.assert_called_with( - "https://hub.docker.com/v2/users/login", - json={"username": username, "password": password}, - timeout=5, - ) - requests.get.assert_called_with( - "https://hub.docker.com/v2/repositories/datakitchen/testgen-b/tags", - headers={"Authorization": f"Bearer {docker_auth_token}"}, - params={"page_size": 25, "page": 1, "ordering": "last_updated"}, - timeout=3, - ) - - -@pytest.mark.unit -@mock.patch("testgen.common.version_service.settings") -@mock.patch("testgen.common.version_service.requests") -def test_return_unknown_when_docker_auth_request_fails(requests: mock.Mock, settings: mock.Mock): - settings.DOCKER_HUB_USERNAME = "docker-username" - settings.DOCKER_HUB_PASSWORD = "docker-password" # noqa: S105 - settings.CHECK_FOR_LATEST_VERSION = "docker" - settings.DOCKER_HUB_REPOSITORY = "datakitchen/testgen-b" - - response = mock.Mock() - response.status_code = 400 - requests.post.return_value = response - - assert get_latest_version() == "unknown"