From bb75d828702de31da53fc377db05f8d68f52d3c5 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 13 Nov 2025 10:01:42 +0000 Subject: [PATCH 1/5] Add exclude_detection_period_from_training flag to column anomalies tests Co-Authored-By: Yosef Arbiv --- macros/edr/tests/test_all_columns_anomalies.sql | 5 +++-- macros/edr/tests/test_column_anomalies.sql | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/macros/edr/tests/test_all_columns_anomalies.sql b/macros/edr/tests/test_all_columns_anomalies.sql index 37c78f3c9..3c117de50 100644 --- a/macros/edr/tests/test_all_columns_anomalies.sql +++ b/macros/edr/tests/test_all_columns_anomalies.sql @@ -1,4 +1,4 @@ -{% test all_columns_anomalies(model, column_anomalies, exclude_prefix, exclude_regexp, timestamp_column, where_expression, anomaly_sensitivity, anomaly_direction, min_training_set_size, time_bucket, days_back, backfill_days, seasonality, sensitivity,ignore_small_changes, fail_on_zero, detection_delay, anomaly_exclude_metrics, detection_period, training_period, dimensions) %} +{% test all_columns_anomalies(model, column_anomalies, exclude_prefix, exclude_regexp, timestamp_column, where_expression, anomaly_sensitivity, anomaly_direction, min_training_set_size, time_bucket, days_back, backfill_days, seasonality, sensitivity,ignore_small_changes, fail_on_zero, detection_delay, anomaly_exclude_metrics, detection_period, training_period, dimensions, exclude_detection_period_from_training=false) %} {{ config(tags = ['elementary-tests']) }} {%- if execute and elementary.is_test_command() and elementary.is_elementary_enabled() %} {% set model_relation = elementary.get_model_relation_for_test(model, elementary.get_test_model()) %} @@ -37,7 +37,8 @@ anomaly_exclude_metrics=anomaly_exclude_metrics, detection_period=detection_period, training_period=training_period, - dimensions=dimensions) %} + dimensions=dimensions, + exclude_detection_period_from_training=exclude_detection_period_from_training) %> {%- if not test_configuration %} {{ exceptions.raise_compiler_error("Failed to create test configuration dict for test `{}`".format(test_table_name)) }} diff --git a/macros/edr/tests/test_column_anomalies.sql b/macros/edr/tests/test_column_anomalies.sql index 94480d2c2..39f8eb630 100644 --- a/macros/edr/tests/test_column_anomalies.sql +++ b/macros/edr/tests/test_column_anomalies.sql @@ -1,4 +1,4 @@ -{% test column_anomalies(model, column_name, column_anomalies, timestamp_column, where_expression, anomaly_sensitivity, anomaly_direction, min_training_set_size, time_bucket, days_back, backfill_days, seasonality, sensitivity,ignore_small_changes, fail_on_zero, detection_delay, anomaly_exclude_metrics, detection_period, training_period, dimensions) %} +{% test column_anomalies(model, column_name, column_anomalies, timestamp_column, where_expression, anomaly_sensitivity, anomaly_direction, min_training_set_size, time_bucket, days_back, backfill_days, seasonality, sensitivity,ignore_small_changes, fail_on_zero, detection_delay, anomaly_exclude_metrics, detection_period, training_period, dimensions, exclude_detection_period_from_training=false) %} {{ config(tags = ['elementary-tests']) }} {%- if execute and elementary.is_test_command() and elementary.is_elementary_enabled() %} {% set model_relation = elementary.get_model_relation_for_test(model, elementary.get_test_model()) %} @@ -36,7 +36,8 @@ anomaly_exclude_metrics=anomaly_exclude_metrics, detection_period=detection_period, training_period=training_period, - dimensions=dimensions) %} + dimensions=dimensions, + exclude_detection_period_from_training=exclude_detection_period_from_training) %} {%- if not test_configuration %} {{ exceptions.raise_compiler_error("Failed to create test configuration dict for test `{}`".format(test_table_name)) }} From 8c2588a4f71a80d8e8a881fefc472d00fe6ba66d Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 13 Nov 2025 10:07:17 +0000 Subject: [PATCH 2/5] Fix Jinja syntax error: change %> to %} in test_all_columns_anomalies.sql Co-Authored-By: Yosef Arbiv --- macros/edr/tests/test_all_columns_anomalies.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/macros/edr/tests/test_all_columns_anomalies.sql b/macros/edr/tests/test_all_columns_anomalies.sql index 3c117de50..e3d8995a1 100644 --- a/macros/edr/tests/test_all_columns_anomalies.sql +++ b/macros/edr/tests/test_all_columns_anomalies.sql @@ -38,7 +38,7 @@ detection_period=detection_period, training_period=training_period, dimensions=dimensions, - exclude_detection_period_from_training=exclude_detection_period_from_training) %> + exclude_detection_period_from_training=exclude_detection_period_from_training) %} {%- if not test_configuration %} {{ exceptions.raise_compiler_error("Failed to create test configuration dict for test `{}`".format(test_table_name)) }} From 6b60a66bd837cee4174e53b3a63b83e41d654231 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 13 Nov 2025 13:23:35 +0000 Subject: [PATCH 3/5] Add integration test for exclude_detection_period_from_training flag Test demonstrates that: - When flag=False: detection period data is included in training baseline, preventing anomaly detection - When flag=True: detection period data is excluded from training baseline, enabling anomaly detection The test uses constrained time windows (1 day training/detection) to make the behavior deterministic. Co-Authored-By: Yosef Arbiv --- .../tests/test_column_anomalies.py | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/integration_tests/tests/test_column_anomalies.py b/integration_tests/tests/test_column_anomalies.py index 709c49cf9..dc6e7173c 100644 --- a/integration_tests/tests/test_column_anomalies.py +++ b/integration_tests/tests/test_column_anomalies.py @@ -476,3 +476,74 @@ def test_anomalous_boolean_column_anomalies(test_id: str, dbt_project: DbtProjec "count_true", "count_false", } + + +# Anomalies currently not supported on ClickHouse +@pytest.mark.skip_targets(["clickhouse"]) +def test_column_anomalies_exclude_detection_period_from_training( + test_id: str, dbt_project: DbtProject +): + utc_today = datetime.utcnow().date() + test_date, *training_dates = generate_dates(base_date=utc_today - timedelta(1)) + + data: List[Dict[str, Any]] = [ + { + TIMESTAMP_COLUMN: cur_date.strftime(DATE_FORMAT), + "superhero": superhero, + } + for cur_date in training_dates + for superhero in ["Superman", "Batman"] + ] + + data += [ + {TIMESTAMP_COLUMN: test_date.strftime(DATE_FORMAT), "superhero": None} + for _ in range(10) + ] + + test_args_false = { + "timestamp_column": TIMESTAMP_COLUMN, + "column_anomalies": ["null_count"], + "time_bucket": {"period": "day", "count": 1}, + "training_period": {"period": "day", "count": 1}, + "detection_period": {"period": "day", "count": 1}, + "min_training_set_size": 1, + "anomaly_sensitivity": 3, + "anomaly_direction": "spike", + "exclude_detection_period_from_training": False, + } + test_result_false = dbt_project.test( + test_id, + DBT_TEST_NAME, + test_args_false, + data=data, + test_column="superhero", + test_vars={"force_metrics_backfill": True}, + ) + assert test_result_false["status"] == "pass", ( + "Expected PASS when exclude_detection_period_from_training=False " + "(detection data included in training baseline)" + ) + + test_args_true = { + "timestamp_column": TIMESTAMP_COLUMN, + "column_anomalies": ["null_count"], + "time_bucket": {"period": "day", "count": 1}, + "training_period": {"period": "day", "count": 1}, + "detection_period": {"period": "day", "count": 1}, + "min_training_set_size": 1, + "anomaly_sensitivity": 3, + "anomaly_direction": "spike", + "exclude_detection_period_from_training": True, + } + test_result_true = dbt_project.test( + test_id, + DBT_TEST_NAME, + test_args_true, + data=data, + test_column="superhero", + test_vars={"force_metrics_backfill": True}, + ) + assert test_result_true["status"] == "fail", ( + "Expected FAIL when exclude_detection_period_from_training=True " + "(detection data excluded from training baseline, anomaly detected)" + ) From 8a70f2abc1023ad9c2322881250247f837dcc897 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 19 Nov 2025 10:49:33 +0000 Subject: [PATCH 4/5] Fix test_column_anomalies_exclude_detection_period_from_training with more substantial dataset - Use 30 days of normal data with low null count (0-2 nulls/day) instead of 1 day - Use 7 days of anomalous data with high null count (20 nulls/day) instead of 1 day - Update training period to 30 days and detection period to 7 days - Add more data per day to create clearer anomaly signal - Use separate test IDs for the two test runs to avoid conflicts - Pattern matches successful volume and freshness anomalies tests Co-Authored-By: Yosef Arbiv --- .../tests/test_column_anomalies.py | 108 ++++++++++++------ 1 file changed, 70 insertions(+), 38 deletions(-) diff --git a/integration_tests/tests/test_column_anomalies.py b/integration_tests/tests/test_column_anomalies.py index dc6e7173c..5019c3c22 100644 --- a/integration_tests/tests/test_column_anomalies.py +++ b/integration_tests/tests/test_column_anomalies.py @@ -483,67 +483,99 @@ def test_anomalous_boolean_column_anomalies(test_id: str, dbt_project: DbtProjec def test_column_anomalies_exclude_detection_period_from_training( test_id: str, dbt_project: DbtProject ): + """ + Test the exclude_detection_period_from_training flag functionality for column anomalies. + + Scenario: + - 30 days of normal data with low null count (0-2 nulls per day) + - 7 days of anomalous data with high null count (20 nulls per day) in detection period + - Without exclusion: anomaly gets included in training baseline, test passes (misses anomaly) + - With exclusion: anomaly excluded from training, test fails (detects anomaly) + """ utc_today = datetime.utcnow().date() - test_date, *training_dates = generate_dates(base_date=utc_today - timedelta(1)) - - data: List[Dict[str, Any]] = [ - { - TIMESTAMP_COLUMN: cur_date.strftime(DATE_FORMAT), - "superhero": superhero, - } - for cur_date in training_dates - for superhero in ["Superman", "Batman"] - ] - - data += [ - {TIMESTAMP_COLUMN: test_date.strftime(DATE_FORMAT), "superhero": None} - for _ in range(10) - ] - test_args_false = { + # Generate 30 days of normal data with low null count (0-2 nulls per day) + normal_data = [] + for i in range(30): + date = utc_today - timedelta(days=37 - i) + normal_data.extend( + [ + {TIMESTAMP_COLUMN: date.strftime(DATE_FORMAT), "superhero": superhero} + for superhero in ["Superman", "Batman", "Wonder Woman", "Flash"] * 5 + ] + ) + null_count = i % 3 + normal_data.extend( + [ + {TIMESTAMP_COLUMN: date.strftime(DATE_FORMAT), "superhero": None} + for _ in range(null_count) + ] + ) + + # Generate 7 days of anomalous data with high null count (20 nulls per day) + anomalous_data = [] + for i in range(7): + date = utc_today - timedelta(days=7 - i) + anomalous_data.extend( + [ + {TIMESTAMP_COLUMN: date.strftime(DATE_FORMAT), "superhero": superhero} + for superhero in ["Superman", "Batman"] + ] + ) + anomalous_data.extend( + [ + {TIMESTAMP_COLUMN: date.strftime(DATE_FORMAT), "superhero": None} + for _ in range(20) + ] + ) + + all_data = normal_data + anomalous_data + + # Test 1: WITHOUT exclusion (should pass - misses the anomaly because it's included in training) + test_args_without_exclusion = { "timestamp_column": TIMESTAMP_COLUMN, "column_anomalies": ["null_count"], "time_bucket": {"period": "day", "count": 1}, - "training_period": {"period": "day", "count": 1}, - "detection_period": {"period": "day", "count": 1}, - "min_training_set_size": 1, + "training_period": {"period": "day", "count": 30}, + "detection_period": {"period": "day", "count": 7}, + "min_training_set_size": 5, "anomaly_sensitivity": 3, "anomaly_direction": "spike", "exclude_detection_period_from_training": False, } - test_result_false = dbt_project.test( - test_id, + + test_result_without_exclusion = dbt_project.test( + test_id + "_without_exclusion", DBT_TEST_NAME, - test_args_false, - data=data, + test_args_without_exclusion, + data=all_data, test_column="superhero", test_vars={"force_metrics_backfill": True}, ) - assert test_result_false["status"] == "pass", ( + + # This should PASS because the anomaly is included in training, making it part of the baseline + assert test_result_without_exclusion["status"] == "pass", ( "Expected PASS when exclude_detection_period_from_training=False " "(detection data included in training baseline)" ) - test_args_true = { - "timestamp_column": TIMESTAMP_COLUMN, - "column_anomalies": ["null_count"], - "time_bucket": {"period": "day", "count": 1}, - "training_period": {"period": "day", "count": 1}, - "detection_period": {"period": "day", "count": 1}, - "min_training_set_size": 1, - "anomaly_sensitivity": 3, - "anomaly_direction": "spike", + # Test 2: WITH exclusion (should fail - detects the anomaly because it's excluded from training) + test_args_with_exclusion = { + **test_args_without_exclusion, "exclude_detection_period_from_training": True, } - test_result_true = dbt_project.test( - test_id, + + test_result_with_exclusion = dbt_project.test( + test_id + "_with_exclusion", DBT_TEST_NAME, - test_args_true, - data=data, + test_args_with_exclusion, + data=all_data, test_column="superhero", test_vars={"force_metrics_backfill": True}, ) - assert test_result_true["status"] == "fail", ( + + # This should FAIL because the anomaly is excluded from training, so it's detected as anomalous + assert test_result_with_exclusion["status"] == "fail", ( "Expected FAIL when exclude_detection_period_from_training=True " "(detection data excluded from training baseline, anomaly detected)" ) From 7fdd2d732bfa50a8a538fbbd7c57875f500e1999 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 19 Nov 2025 11:08:58 +0000 Subject: [PATCH 5/5] Fix test_col_anom_excl_detect_train: rename function and adjust test data - Rename test function to fix Postgres 63-char table name limit - Update test data: 8-12 nulls/day normal, 20 nulls/day anomalous - Increase sensitivity to 5 to match volume anomalies pattern - Shorten test ID suffixes to _f/_t - Test now passes locally with postgres Co-Authored-By: Yosef Arbiv --- .../tests/test_column_anomalies.py | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/integration_tests/tests/test_column_anomalies.py b/integration_tests/tests/test_column_anomalies.py index 5019c3c22..d4217009c 100644 --- a/integration_tests/tests/test_column_anomalies.py +++ b/integration_tests/tests/test_column_anomalies.py @@ -480,9 +480,7 @@ def test_anomalous_boolean_column_anomalies(test_id: str, dbt_project: DbtProjec # Anomalies currently not supported on ClickHouse @pytest.mark.skip_targets(["clickhouse"]) -def test_column_anomalies_exclude_detection_period_from_training( - test_id: str, dbt_project: DbtProject -): +def test_col_anom_excl_detect_train(test_id: str, dbt_project: DbtProject): """ Test the exclude_detection_period_from_training flag functionality for column anomalies. @@ -494,17 +492,18 @@ def test_column_anomalies_exclude_detection_period_from_training( """ utc_today = datetime.utcnow().date() - # Generate 30 days of normal data with low null count (0-2 nulls per day) + # Generate 30 days of normal data with variance in null count (8, 10, 12 pattern) + normal_pattern = [8, 10, 12] normal_data = [] for i in range(30): date = utc_today - timedelta(days=37 - i) + null_count = normal_pattern[i % 3] normal_data.extend( [ {TIMESTAMP_COLUMN: date.strftime(DATE_FORMAT), "superhero": superhero} - for superhero in ["Superman", "Batman", "Wonder Woman", "Flash"] * 5 + for superhero in ["Superman", "Batman", "Wonder Woman", "Flash"] * 10 ] ) - null_count = i % 3 normal_data.extend( [ {TIMESTAMP_COLUMN: date.strftime(DATE_FORMAT), "superhero": None} @@ -512,14 +511,14 @@ def test_column_anomalies_exclude_detection_period_from_training( ] ) - # Generate 7 days of anomalous data with high null count (20 nulls per day) + # Generate 7 days of anomalous data (20 nulls per day) - 100% increase from mean anomalous_data = [] for i in range(7): date = utc_today - timedelta(days=7 - i) anomalous_data.extend( [ {TIMESTAMP_COLUMN: date.strftime(DATE_FORMAT), "superhero": superhero} - for superhero in ["Superman", "Batman"] + for superhero in ["Superman", "Batman", "Wonder Woman", "Flash"] * 10 ] ) anomalous_data.extend( @@ -539,13 +538,13 @@ def test_column_anomalies_exclude_detection_period_from_training( "training_period": {"period": "day", "count": 30}, "detection_period": {"period": "day", "count": 7}, "min_training_set_size": 5, - "anomaly_sensitivity": 3, + "anomaly_sensitivity": 5, "anomaly_direction": "spike", "exclude_detection_period_from_training": False, } test_result_without_exclusion = dbt_project.test( - test_id + "_without_exclusion", + test_id + "_f", DBT_TEST_NAME, test_args_without_exclusion, data=all_data, @@ -566,7 +565,7 @@ def test_column_anomalies_exclude_detection_period_from_training( } test_result_with_exclusion = dbt_project.test( - test_id + "_with_exclusion", + test_id + "_t", DBT_TEST_NAME, test_args_with_exclusion, data=all_data,