diff --git a/integration_tests/tests/test_dimension_anomalies.py b/integration_tests/tests/test_dimension_anomalies.py
index 69e9e8637..d55354a56 100644
--- a/integration_tests/tests/test_dimension_anomalies.py
+++ b/integration_tests/tests/test_dimension_anomalies.py
@@ -218,3 +218,104 @@ def test_dimension_anomalies_with_timestamp_exclude_final_results(
     test_result = dbt_project.test(test_id, DBT_TEST_NAME, test_args, data=data)
     assert test_result["status"] == "fail"
     assert test_result["failures"] == 1
+
+
+# Test for the exclude_detection_period_from_training functionality
+# This test demonstrates the use case where:
+# 1. The detection period contains anomalous distribution data that would normally be included in training
+# 2. With exclude_detection=False: the anomaly is missed (test passes) because training includes the anomaly
+# 3. With exclude_detection=True: the anomaly is detected (test fails) because training excludes the anomaly
+@pytest.mark.skip_targets(["clickhouse"])
+@pytest.mark.parametrize(
+    "exclude_detection,expected_status",
+    [
+        (False, "pass"),  # include detection period in training → anomaly absorbed
+        (True, "fail"),  # exclude detection period from training → anomaly detected
+    ],
+    ids=[
+        "exclude_false",
+        "exclude_true",
+    ],  # Shortened to stay under the Postgres 63-char limit
+)
+def test_anomaly_in_detection_period(
+    test_id: str,
+    dbt_project: DbtProject,
+    exclude_detection: bool,
+    expected_status: str,
+):
+    """
+    Test the exclude_detection_period_from_training flag for dimension anomalies.
+
+    Scenario:
+    - 30 days of normal data with variance (45/50/55 Superman, 55/50/45 Spiderman pattern)
+    - 7 days of anomalous data (72 Superman, 28 Spiderman per day) in the detection period
+    - Without exclusion: the anomaly is included in the training baseline, so the test passes (anomaly missed)
+    - With exclusion: the anomaly is excluded from training, so the test fails (anomaly detected)
+
+    Note: Parametrize IDs are shortened to avoid the Postgres 63-character identifier limit.
+ """ + utc_now = datetime.utcnow().date() + + # Generate 30 days of normal data with variance (45/50/55 pattern for Superman) + normal_pattern = [45, 50, 55] + normal_data = [] + for i in range(30): + date = utc_now - timedelta(days=37 - i) + superman_count = normal_pattern[i % 3] + spiderman_count = 100 - superman_count + normal_data.extend( + [ + {TIMESTAMP_COLUMN: date.strftime(DATE_FORMAT), "superhero": "Superman"} + for _ in range(superman_count) + ] + ) + normal_data.extend( + [ + { + TIMESTAMP_COLUMN: date.strftime(DATE_FORMAT), + "superhero": "Spiderman", + } + for _ in range(spiderman_count) + ] + ) + + # Generate 7 days of anomalous data (72 Superman, 28 Spiderman per day) - this will be in detection period + anomalous_data = [] + for i in range(7): + date = utc_now - timedelta(days=7 - i) + anomalous_data.extend( + [ + {TIMESTAMP_COLUMN: date.strftime(DATE_FORMAT), "superhero": "Superman"} + for _ in range(72) + ] + ) + anomalous_data.extend( + [ + { + TIMESTAMP_COLUMN: date.strftime(DATE_FORMAT), + "superhero": "Spiderman", + } + for _ in range(28) + ] + ) + + all_data = normal_data + anomalous_data + + test_args = { + **DBT_TEST_ARGS, + "training_period": {"period": "day", "count": 30}, + "detection_period": {"period": "day", "count": 7}, + "time_bucket": {"period": "day", "count": 1}, + "sensitivity": 5, + } + if exclude_detection: + test_args["exclude_detection_period_from_training"] = True + + test_result = dbt_project.test( + test_id, + DBT_TEST_NAME, + test_args, + data=all_data, + ) + + assert test_result["status"] == expected_status diff --git a/macros/edr/tests/test_dimension_anomalies.sql b/macros/edr/tests/test_dimension_anomalies.sql index 6412973a2..350e6e925 100644 --- a/macros/edr/tests/test_dimension_anomalies.sql +++ b/macros/edr/tests/test_dimension_anomalies.sql @@ -1,4 +1,4 @@ -{% test dimension_anomalies(model, dimensions, timestamp_column, where_expression, anomaly_sensitivity, anomaly_direction, min_training_set_size, time_bucket, days_back, backfill_days, seasonality, sensitivity,ignore_small_changes, fail_on_zero, detection_delay, anomaly_exclude_metrics, detection_period, training_period, exclude_final_results) %} +{% test dimension_anomalies(model, dimensions, timestamp_column, where_expression, anomaly_sensitivity, anomaly_direction, min_training_set_size, time_bucket, days_back, backfill_days, seasonality, sensitivity,ignore_small_changes, fail_on_zero, detection_delay, anomaly_exclude_metrics, detection_period, training_period, exclude_final_results, exclude_detection_period_from_training=false) %} {{ config(tags = ['elementary-tests']) }} {%- if execute and elementary.is_test_command() and elementary.is_elementary_enabled() %} {% set model_relation = elementary.get_model_relation_for_test(model, elementary.get_test_model()) %} @@ -39,7 +39,8 @@ anomaly_exclude_metrics=anomaly_exclude_metrics, detection_period=detection_period, training_period=training_period, - exclude_final_results=exclude_final_results) %} + exclude_final_results=exclude_final_results, + exclude_detection_period_from_training=exclude_detection_period_from_training) %} {%- if not test_configuration %} {{ exceptions.raise_compiler_error("Failed to create test configuration dict for test `{}`".format(test_table_name)) }}