From f83265cae6549d9640819ebb10e19e04f1d366fd Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 13 Nov 2025 13:10:06 +0000 Subject: [PATCH 1/6] Add exclude_detection_period_from_training flag to dimension anomaly test - Added exclude_detection_period_from_training parameter to test_dimension_anomalies macro signature with default value false - Passed the parameter through to get_anomalies_test_configuration - This brings dimension anomalies in line with table/volume anomalies which already support this flag - The underlying logic in get_anomaly_scores_query.sql already handles this parameter for all anomaly types Co-Authored-By: Yosef Arbiv --- macros/edr/tests/test_dimension_anomalies.sql | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/macros/edr/tests/test_dimension_anomalies.sql b/macros/edr/tests/test_dimension_anomalies.sql index 6412973a2..350e6e925 100644 --- a/macros/edr/tests/test_dimension_anomalies.sql +++ b/macros/edr/tests/test_dimension_anomalies.sql @@ -1,4 +1,4 @@ -{% test dimension_anomalies(model, dimensions, timestamp_column, where_expression, anomaly_sensitivity, anomaly_direction, min_training_set_size, time_bucket, days_back, backfill_days, seasonality, sensitivity,ignore_small_changes, fail_on_zero, detection_delay, anomaly_exclude_metrics, detection_period, training_period, exclude_final_results) %} +{% test dimension_anomalies(model, dimensions, timestamp_column, where_expression, anomaly_sensitivity, anomaly_direction, min_training_set_size, time_bucket, days_back, backfill_days, seasonality, sensitivity,ignore_small_changes, fail_on_zero, detection_delay, anomaly_exclude_metrics, detection_period, training_period, exclude_final_results, exclude_detection_period_from_training=false) %} {{ config(tags = ['elementary-tests']) }} {%- if execute and elementary.is_test_command() and elementary.is_elementary_enabled() %} {% set model_relation = 
elementary.get_model_relation_for_test(model, elementary.get_test_model()) %} @@ -39,7 +39,8 @@ anomaly_exclude_metrics=anomaly_exclude_metrics, detection_period=detection_period, training_period=training_period, - exclude_final_results=exclude_final_results) %} + exclude_final_results=exclude_final_results, + exclude_detection_period_from_training=exclude_detection_period_from_training) %} {%- if not test_configuration %} {{ exceptions.raise_compiler_error("Failed to create test configuration dict for test `{}`".format(test_table_name)) }} From 9872781e735042eec778fbe9f8ba212f1bb30fbd Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 13 Nov 2025 13:24:03 +0000 Subject: [PATCH 2/6] Add integration test for exclude_detection_period_from_training in dimension anomalies - Added test_dimension_exclude_detection_from_training to demonstrate the flag's behavior - Test shows that without exclusion, anomaly is missed (test passes) because training includes the detection period - Test shows that with exclusion, anomaly is detected (test fails) because training excludes the detection period - Uses 30 days of normal data with variance (45/50/55 pattern) and 7 days of anomalous data (72/28 distribution) - Follows the same pattern as test_exclude_detection_from_training in test_volume_anomalies.py Co-Authored-By: Yosef Arbiv --- .../tests/test_dimension_anomalies.py | 106 ++++++++++++++++++ 1 file changed, 106 insertions(+) diff --git a/integration_tests/tests/test_dimension_anomalies.py b/integration_tests/tests/test_dimension_anomalies.py index 69e9e8637..7cd7c18fd 100644 --- a/integration_tests/tests/test_dimension_anomalies.py +++ b/integration_tests/tests/test_dimension_anomalies.py @@ -218,3 +218,109 @@ def test_dimension_anomalies_with_timestamp_exclude_final_results( test_result = dbt_project.test(test_id, DBT_TEST_NAME, test_args, data=data) assert test_result["status"] == "fail" assert 
test_result["failures"] == 1 + + +# Test for exclude_detection_period_from_training functionality +# This test demonstrates the use case where: +# 1. Detection period contains anomalous distribution data that would normally be included in training +# 2. With exclude_detection_period_from_training=False: anomaly is missed (test passes) because training includes the anomaly +# 3. With exclude_detection_period_from_training=True: anomaly is detected (test fails) because training excludes the anomaly +@pytest.mark.skip_targets(["clickhouse"]) +def test_dimension_exclude_detection_from_training( + test_id: str, dbt_project: DbtProject +): + """ + Test the exclude_detection_period_from_training flag functionality for dimension anomalies. + + Scenario: + - 30 days of normal data with variance (45/50/55 Superman, 55/50/45 Spiderman pattern) + - 7 days of anomalous data (72 Superman, 28 Spiderman per day) in detection period + - Without exclusion: anomaly gets included in training baseline, test passes (misses anomaly) + - With exclusion: anomaly excluded from training, test fails (detects anomaly) + """ + utc_now = datetime.utcnow() + + # Generate 30 days of normal data with variance (45/50/55 pattern for Superman) + normal_pattern = [45, 50, 55] + normal_data = [] + for i in range(30): + date = utc_now - timedelta(days=37 - i) + superman_count = normal_pattern[i % 3] + spiderman_count = 100 - superman_count + normal_data.extend( + [ + {TIMESTAMP_COLUMN: date.strftime(DATE_FORMAT), "superhero": "Superman"} + for _ in range(superman_count) + ] + ) + normal_data.extend( + [ + { + TIMESTAMP_COLUMN: date.strftime(DATE_FORMAT), + "superhero": "Spiderman", + } + for _ in range(spiderman_count) + ] + ) + + # Generate 7 days of anomalous data (72 Superman, 28 Spiderman per day) - this will be in detection period + anomalous_data = [] + for i in range(7): + date = utc_now - timedelta(days=7 - i) + anomalous_data.extend( + [ + {TIMESTAMP_COLUMN: date.strftime(DATE_FORMAT), 
"superhero": "Superman"} + for _ in range(72) + ] + ) + anomalous_data.extend( + [ + { + TIMESTAMP_COLUMN: date.strftime(DATE_FORMAT), + "superhero": "Spiderman", + } + for _ in range(28) + ] + ) + + all_data = normal_data + anomalous_data + + # Test 1: WITHOUT exclusion (should pass - misses the anomaly because it's included in training) + test_args_without_exclusion = { + **DBT_TEST_ARGS, + "training_period": {"period": "day", "count": 30}, + "detection_period": {"period": "day", "count": 7}, + "time_bucket": {"period": "day", "count": 1}, + "sensitivity": 5, + # exclude_detection_period_from_training is not set (defaults to False/None) + } + + test_result_without_exclusion = dbt_project.test( + test_id + "_without_exclusion", + DBT_TEST_NAME, + test_args_without_exclusion, + data=all_data, + ) + + # This should PASS because the anomaly is included in training, making it part of the baseline + assert ( + test_result_without_exclusion["status"] == "pass" + ), "Test should pass when anomaly is included in training" + + # Test 2: WITH exclusion (should fail - detects the anomaly because it's excluded from training) + test_args_with_exclusion = { + **test_args_without_exclusion, + "exclude_detection_period_from_training": True, + } + + test_result_with_exclusion = dbt_project.test( + test_id + "_with_exclusion", + DBT_TEST_NAME, + test_args_with_exclusion, + data=all_data, + ) + + # This should FAIL because the anomaly is excluded from training, so it's detected as anomalous + assert ( + test_result_with_exclusion["status"] == "fail" + ), "Test should fail when anomaly is excluded from training" From b8b837acf30464ad43bf436c5281ce90d51c6357 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 20 Nov 2025 10:52:47 +0000 Subject: [PATCH 3/6] Fix test_dimension_exclude_detection_from_training: shorten test ID suffixes to avoid Postgres 63-char limit Co-Authored-By: Yosef Arbiv --- 
integration_tests/tests/test_dimension_anomalies.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integration_tests/tests/test_dimension_anomalies.py b/integration_tests/tests/test_dimension_anomalies.py index 7cd7c18fd..af586a947 100644 --- a/integration_tests/tests/test_dimension_anomalies.py +++ b/integration_tests/tests/test_dimension_anomalies.py @@ -296,7 +296,7 @@ def test_dimension_exclude_detection_from_training( } test_result_without_exclusion = dbt_project.test( - test_id + "_without_exclusion", + test_id + "_f", DBT_TEST_NAME, test_args_without_exclusion, data=all_data, @@ -314,7 +314,7 @@ def test_dimension_exclude_detection_from_training( } test_result_with_exclusion = dbt_project.test( - test_id + "_with_exclusion", + test_id + "_t", DBT_TEST_NAME, test_args_with_exclusion, data=all_data, From 370a4cb768b97e0c93eddf980fcac702da20955f Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 20 Nov 2025 13:04:19 +0000 Subject: [PATCH 4/6] Refactor test to use parametrization: rename to test_anomaly_in_detection_period - Rename test_dimension_exclude_detection_from_training to test_anomaly_in_detection_period - Add @pytest.mark.parametrize decorator with exclude_detection and expected_status parameters - Use descriptive IDs: include_detection_in_training and exclude_detection_from_training - Consolidate two test cases into one parametrized test for better maintainability - Addresses reviewer feedback on PR #890 Co-Authored-By: Yosef Arbiv --- .../tests/test_dimension_anomalies.py | 55 ++++++++----------- 1 file changed, 23 insertions(+), 32 deletions(-) diff --git a/integration_tests/tests/test_dimension_anomalies.py b/integration_tests/tests/test_dimension_anomalies.py index af586a947..f82298fde 100644 --- a/integration_tests/tests/test_dimension_anomalies.py +++ b/integration_tests/tests/test_dimension_anomalies.py @@ -223,11 +223,22 @@ def 
test_dimension_anomalies_with_timestamp_exclude_final_results( # Test for exclude_detection_period_from_training functionality # This test demonstrates the use case where: # 1. Detection period contains anomalous distribution data that would normally be included in training -# 2. With exclude_detection_period_from_training=False: anomaly is missed (test passes) because training includes the anomaly -# 3. With exclude_detection_period_from_training=True: anomaly is detected (test fails) because training excludes the anomaly +# 2. With exclude_detection=False: anomaly is missed (test passes) because training includes the anomaly +# 3. With exclude_detection=True: anomaly is detected (test fails) because training excludes the anomaly @pytest.mark.skip_targets(["clickhouse"]) -def test_dimension_exclude_detection_from_training( - test_id: str, dbt_project: DbtProject +@pytest.mark.parametrize( + "exclude_detection,expected_status", + [ + (False, "pass"), # include detection in training → anomaly absorbed + (True, "fail"), # exclude detection from training → anomaly detected + ], + ids=["include_detection_in_training", "exclude_detection_from_training"], +) +def test_anomaly_in_detection_period( + test_id: str, + dbt_project: DbtProject, + exclude_detection: bool, + expected_status: str, ): """ Test the exclude_detection_period_from_training flag functionality for dimension anomalies. 
@@ -285,42 +296,22 @@ def test_dimension_exclude_detection_from_training( all_data = normal_data + anomalous_data - # Test 1: WITHOUT exclusion (should pass - misses the anomaly because it's included in training) - test_args_without_exclusion = { + test_args = { **DBT_TEST_ARGS, "training_period": {"period": "day", "count": 30}, "detection_period": {"period": "day", "count": 7}, "time_bucket": {"period": "day", "count": 1}, "sensitivity": 5, - # exclude_detection_period_from_training is not set (defaults to False/None) - } - - test_result_without_exclusion = dbt_project.test( - test_id + "_f", - DBT_TEST_NAME, - test_args_without_exclusion, - data=all_data, - ) - - # This should PASS because the anomaly is included in training, making it part of the baseline - assert ( - test_result_without_exclusion["status"] == "pass" - ), "Test should pass when anomaly is included in training" - - # Test 2: WITH exclusion (should fail - detects the anomaly because it's excluded from training) - test_args_with_exclusion = { - **test_args_without_exclusion, - "exclude_detection_period_from_training": True, } + if exclude_detection: + test_args["exclude_detection_period_from_training"] = True - test_result_with_exclusion = dbt_project.test( - test_id + "_t", + suffix = "_excl" if exclude_detection else "_incl" + test_result = dbt_project.test( + test_id + suffix, DBT_TEST_NAME, - test_args_with_exclusion, + test_args, data=all_data, ) - # This should FAIL because the anomaly is excluded from training, so it's detected as anomalous - assert ( - test_result_with_exclusion["status"] == "fail" - ), "Test should fail when anomaly is excluded from training" + assert test_result["status"] == expected_status From fd75902f9438a0c2a12e4ed335c2212821406f19 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 20 Nov 2025 13:24:37 +0000 Subject: [PATCH 5/6] Fix Postgres 63-char identifier limit: shorten parametrize IDs and remove 
redundant suffix - Change parametrize IDs from 'include_detection_in_training'/'exclude_detection_from_training' to 'exclude_false'/'exclude_true' - Remove redundant suffix (_incl/_excl) since pytest parametrize IDs already differentiate test cases - New table names: test_anomaly_in_detection_period_exclude_false (46 chars) and test_anomaly_in_detection_period_exclude_true (45 chars) - Both are well under Postgres 63-character limit - Fixes CI failures on Postgres (latest_official and latest_pre) Co-Authored-By: Yosef Arbiv --- integration_tests/tests/test_dimension_anomalies.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/integration_tests/tests/test_dimension_anomalies.py b/integration_tests/tests/test_dimension_anomalies.py index f82298fde..57f3ccf2c 100644 --- a/integration_tests/tests/test_dimension_anomalies.py +++ b/integration_tests/tests/test_dimension_anomalies.py @@ -232,7 +232,10 @@ def test_dimension_anomalies_with_timestamp_exclude_final_results( (False, "pass"), # include detection in training → anomaly absorbed (True, "fail"), # exclude detection from training → anomaly detected ], - ids=["include_detection_in_training", "exclude_detection_from_training"], + ids=[ + "exclude_false", + "exclude_true", + ], # Shortened to stay under Postgres 63-char limit ) def test_anomaly_in_detection_period( test_id: str, @@ -248,6 +251,8 @@ def test_anomaly_in_detection_period( - 7 days of anomalous data (72 Superman, 28 Spiderman per day) in detection period - Without exclusion: anomaly gets included in training baseline, test passes (misses anomaly) - With exclusion: anomaly excluded from training, test fails (detects anomaly) + + Note: Parametrize IDs are shortened to avoid Postgres 63-character identifier limit. 
""" utc_now = datetime.utcnow() @@ -306,9 +311,8 @@ def test_anomaly_in_detection_period( if exclude_detection: test_args["exclude_detection_period_from_training"] = True - suffix = "_excl" if exclude_detection else "_incl" test_result = dbt_project.test( - test_id + suffix, + test_id, DBT_TEST_NAME, test_args, data=all_data, From 311d543ad8e02e55699c89cb57a3cb9595b51656 Mon Sep 17 00:00:00 2001 From: arbiv Date: Mon, 24 Nov 2025 14:06:58 +0200 Subject: [PATCH 6/6] Fix test_anomaly_in_detection_period to use date object instead of datetime Change utc_now from datetime.utcnow() to datetime.utcnow().date() to match the pattern used in other tests. Date arithmetic already works correctly with date objects. --- integration_tests/tests/test_dimension_anomalies.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration_tests/tests/test_dimension_anomalies.py b/integration_tests/tests/test_dimension_anomalies.py index 57f3ccf2c..d55354a56 100644 --- a/integration_tests/tests/test_dimension_anomalies.py +++ b/integration_tests/tests/test_dimension_anomalies.py @@ -254,7 +254,7 @@ def test_anomaly_in_detection_period( Note: Parametrize IDs are shortened to avoid Postgres 63-character identifier limit. """ - utc_now = datetime.utcnow() + utc_now = datetime.utcnow().date() # Generate 30 days of normal data with variance (45/50/55 pattern for Superman) normal_pattern = [45, 50, 55]