From f349462786220e17b21ec7c9cdd0c19f8e27fb5b Mon Sep 17 00:00:00 2001 From: Francisco Videira Date: Tue, 10 Feb 2026 00:38:40 +0000 Subject: [PATCH 1/6] Add alarms --- .../terraform/components/api/README.md | 6 ++ .../terraform/components/api/alarms.tf | 70 +++++++++++++++ .../terraform/modules/alarms-apigw/README.md | 34 ++++++++ .../terraform/modules/alarms-apigw/main.tf | 87 +++++++++++++++++++ .../modules/alarms-apigw/variables.tf | 56 ++++++++++++ .../modules/alarms-apigw/versions.tf | 9 ++ .../terraform/modules/alarms-ddb/README.md | 29 +++++++ .../terraform/modules/alarms-ddb/main.tf | 45 ++++++++++ .../terraform/modules/alarms-ddb/variables.tf | 32 +++++++ .../terraform/modules/alarms-ddb/versions.tf | 9 ++ .../terraform/modules/alarms-lambda/README.md | 36 ++++++++ .../terraform/modules/alarms-lambda/main.tf | 80 +++++++++++++++++ .../modules/alarms-lambda/variables.tf | 67 ++++++++++++++ .../modules/alarms-lambda/versions.tf | 9 ++ .../terraform/modules/alarms-sqs/README.md | 31 +++++++ .../terraform/modules/alarms-sqs/main.tf | 62 +++++++++++++ .../terraform/modules/alarms-sqs/variables.tf | 42 +++++++++ .../terraform/modules/alarms-sqs/versions.tf | 9 ++ 18 files changed, 713 insertions(+) create mode 100644 infrastructure/terraform/components/api/alarms.tf create mode 100644 infrastructure/terraform/modules/alarms-apigw/README.md create mode 100644 infrastructure/terraform/modules/alarms-apigw/main.tf create mode 100644 infrastructure/terraform/modules/alarms-apigw/variables.tf create mode 100644 infrastructure/terraform/modules/alarms-apigw/versions.tf create mode 100644 infrastructure/terraform/modules/alarms-ddb/README.md create mode 100644 infrastructure/terraform/modules/alarms-ddb/main.tf create mode 100644 infrastructure/terraform/modules/alarms-ddb/variables.tf create mode 100644 infrastructure/terraform/modules/alarms-ddb/versions.tf create mode 100644 infrastructure/terraform/modules/alarms-lambda/README.md create mode 100644 infrastructure/terraform/modules/alarms-lambda/main.tf create mode 100644 infrastructure/terraform/modules/alarms-lambda/variables.tf create mode 100644 infrastructure/terraform/modules/alarms-lambda/versions.tf create mode 100644 infrastructure/terraform/modules/alarms-sqs/README.md create mode 100644 infrastructure/terraform/modules/alarms-sqs/main.tf create mode 100644 infrastructure/terraform/modules/alarms-sqs/variables.tf create mode 100644 infrastructure/terraform/modules/alarms-sqs/versions.tf diff --git a/infrastructure/terraform/components/api/README.md b/infrastructure/terraform/components/api/README.md index 01fe4c4be..74b46a79a 100644 --- a/infrastructure/terraform/components/api/README.md +++ b/infrastructure/terraform/components/api/README.md @@ -42,7 +42,11 @@ No requirements. | Name | Source | Version | |------|--------|---------| +| [apigw\_alarms](#module\_apigw\_alarms) | ../../modules/alarms-apigw | n/a | | [authorizer\_lambda](#module\_authorizer\_lambda) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a | +| [ddb\_alarms\_letters](#module\_ddb\_alarms\_letters) | ../../modules/alarms-ddb | n/a | +| [ddb\_alarms\_mi](#module\_ddb\_alarms\_mi) | ../../modules/alarms-ddb | n/a | +| [ddb\_alarms\_suppliers](#module\_ddb\_alarms\_suppliers) | ../../modules/alarms-ddb | n/a | | [domain\_truststore](#module\_domain\_truststore) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.26/terraform-s3bucket.zip | n/a | | [eventpub](#module\_eventpub) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.26/terraform-eventpub.zip | n/a | | [eventsub](#module\_eventsub) | ../../modules/eventsub | n/a | @@ -51,6 +55,7 @@ No requirements. | [get\_letters](#module\_get\_letters) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a | | [get\_status](#module\_get\_status) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a | | [kms](#module\_kms) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.26/terraform-kms.zip | n/a | +| [lambda\_alarms](#module\_lambda\_alarms) | ../../modules/alarms-lambda | n/a | | [letter\_status\_update](#module\_letter\_status\_update) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a | | [letter\_status\_updates\_queue](#module\_letter\_status\_updates\_queue) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.24/terraform-sqs.zip | n/a | | [letter\_updates\_transformer](#module\_letter\_updates\_transformer) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a | @@ -60,6 +65,7 @@ No requirements. | [post\_letters](#module\_post\_letters) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a | | [post\_mi](#module\_post\_mi) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a | | [s3bucket\_test\_letters](#module\_s3bucket\_test\_letters) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.26/terraform-s3bucket.zip | n/a | +| [sqs\_alarms](#module\_sqs\_alarms) | ../../modules/alarms-sqs | n/a | | [sqs\_letter\_updates](#module\_sqs\_letter\_updates) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.26/terraform-sqs.zip | n/a | | [supplier\_ssl](#module\_supplier\_ssl) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.26/terraform-ssl.zip | n/a | | [upsert\_letter](#module\_upsert\_letter) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a | diff --git a/infrastructure/terraform/components/api/alarms.tf b/infrastructure/terraform/components/api/alarms.tf new file mode 100644 index 000000000..207cf20bb --- /dev/null +++ b/infrastructure/terraform/components/api/alarms.tf @@ -0,0 +1,70 @@ +locals { + lambda_alarm_targets = { + authorizer_lambda = module.authorizer_lambda.function_name + get_letter = module.get_letter.function_name + get_letters = module.get_letters.function_name + get_letter_data = module.get_letter_data.function_name + get_status = module.get_status.function_name + patch_letter = module.patch_letter.function_name + post_letters = module.post_letters.function_name + post_mi = module.post_mi.function_name + upsert_letter = module.upsert_letter.function_name + letter_status_update = module.letter_status_update.function_name + letter_updates_transformer = module.letter_updates_transformer.function_name + mi_updates_transformer = module.mi_updates_transformer.function_name + } + + sqs_queue_names = { + letter_updates = module.sqs_letter_updates.sqs_queue_name + letter_status_updates = module.letter_status_updates_queue.sqs_queue_name + } +} + +module "lambda_alarms" { + for_each = local.lambda_alarm_targets + source = "../../modules/alarms-lambda" + + alarm_prefix = local.csi + function_name = each.value + log_group_name = "/aws/lambda/${each.value}" + tags = local.default_tags +} + +module "ddb_alarms_letters" { + source = "../../modules/alarms-ddb" + alarm_prefix = local.csi + table_name = aws_dynamodb_table.letters.name + tags = local.default_tags +} + +module "ddb_alarms_mi" { + source = "../../modules/alarms-ddb" + alarm_prefix = local.csi + table_name = aws_dynamodb_table.mi.name + tags = local.default_tags +} + +module "ddb_alarms_suppliers" { + source = "../../modules/alarms-ddb" + alarm_prefix = local.csi + table_name = aws_dynamodb_table.suppliers.name + tags = local.default_tags +} + +module "sqs_alarms" { + for_each = local.sqs_queue_names + source = "../../modules/alarms-sqs" + + alarm_prefix = local.csi + queue_name = each.value + dlq_queue_name = replace(each.value, "-queue", "-dlq") + tags = local.default_tags +} + +module "apigw_alarms" { + source = "../../modules/alarms-apigw" + alarm_prefix = local.csi + api_name = aws_api_gateway_rest_api.main.name + stage_name = aws_api_gateway_stage.main.stage_name + tags = local.default_tags +} diff --git a/infrastructure/terraform/modules/alarms-apigw/README.md b/infrastructure/terraform/modules/alarms-apigw/README.md new file mode 100644 index 000000000..d1de73b57 --- /dev/null +++ b/infrastructure/terraform/modules/alarms-apigw/README.md @@ -0,0 +1,34 @@ + + + + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.9.0 | +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [alarm\_prefix](#input\_alarm\_prefix) | n/a | `string` | n/a | yes | +| [api\_name](#input\_api\_name) | n/a | `string` | n/a | yes | +| [error\_5xx\_evaluation\_periods](#input\_error\_5xx\_evaluation\_periods) | n/a | `number` | `1` | no | +| [error\_5xx\_period\_seconds](#input\_error\_5xx\_period\_seconds) | n/a | `number` | `60` | no | +| [error\_5xx\_threshold](#input\_error\_5xx\_threshold) | n/a | `number` | `0` | no | +| [latency\_anomaly\_sensitivity](#input\_latency\_anomaly\_sensitivity) | n/a | `number` | `2` | no | +| [latency\_datapoints\_to\_alarm](#input\_latency\_datapoints\_to\_alarm) | n/a | `number` | `3` | no | +| [latency\_evaluation\_periods](#input\_latency\_evaluation\_periods) | n/a | `number` | `5` | no | +| [latency\_period\_seconds](#input\_latency\_period\_seconds) | n/a | `number` | `60` | no | +| [latency\_threshold\_ms](#input\_latency\_threshold\_ms) | n/a | `number` | `29000` | no | +| [stage\_name](#input\_stage\_name) | n/a | `string` | n/a | yes | +| [tags](#input\_tags) | n/a | `map(string)` | `{}` | no | +## Modules + +No modules. +## Outputs + +No outputs. + + + diff --git a/infrastructure/terraform/modules/alarms-apigw/main.tf b/infrastructure/terraform/modules/alarms-apigw/main.tf new file mode 100644 index 000000000..3ae9c092c --- /dev/null +++ b/infrastructure/terraform/modules/alarms-apigw/main.tf @@ -0,0 +1,87 @@ +locals { + api_dimensions = { + ApiName = var.api_name + Stage = var.stage_name + } +} + +resource "aws_cloudwatch_metric_alarm" "five_xx" { + alarm_name = "${var.alarm_prefix}-apigw-5xx" + alarm_description = "RELIABILITY: API Gateway 5xx responses" + + namespace = "AWS/ApiGateway" + metric_name = "5XXError" + statistic = "Sum" + period = var.error_5xx_period_seconds + + evaluation_periods = var.error_5xx_evaluation_periods + threshold = var.error_5xx_threshold + comparison_operator = "GreaterThanThreshold" + treat_missing_data = "notBreaching" + + dimensions = local.api_dimensions + + actions_enabled = false + alarm_actions = [] + ok_actions = [] + insufficient_data_actions = [] + tags = var.tags +} + +resource "aws_cloudwatch_metric_alarm" "latency_threshold" { + alarm_name = "${var.alarm_prefix}-apigw-latency-threshold" + alarm_description = "RELIABILITY: API Gateway latency above threshold" + + namespace = "AWS/ApiGateway" + metric_name = "Latency" + statistic = "Average" + period = var.latency_period_seconds + + evaluation_periods = var.latency_evaluation_periods + threshold = var.latency_threshold_ms + comparison_operator = "GreaterThanThreshold" + treat_missing_data = "notBreaching" + + dimensions = local.api_dimensions + + actions_enabled = false + alarm_actions = [] + ok_actions = [] + insufficient_data_actions = [] + tags = var.tags +} + +resource "aws_cloudwatch_metric_alarm" "latency_anomaly" { + alarm_name = "${var.alarm_prefix}-apigw-latency-anomaly" + alarm_description = "RELIABILITY: API Gateway latency anomaly" + comparison_operator = "GreaterThanUpperThreshold" + evaluation_periods = var.latency_evaluation_periods + datapoints_to_alarm = var.latency_datapoints_to_alarm + threshold_metric_id = "ad1" + treat_missing_data = "notBreaching" + + actions_enabled = false + alarm_actions = [] + ok_actions = [] + insufficient_data_actions = [] + tags = var.tags + + metric_query { + id = "m1" + metric { + metric_name = "Latency" + namespace = "AWS/ApiGateway" + stat = "Average" + period = var.latency_period_seconds + dimensions = local.api_dimensions + } + return_data = true + } + + metric_query { + id = "ad1" + expression = "ANOMALY_DETECTION_BAND(m1, ${var.latency_anomaly_sensitivity})" + label = "Latency (expected)" + return_data = false + } +} diff --git a/infrastructure/terraform/modules/alarms-apigw/variables.tf b/infrastructure/terraform/modules/alarms-apigw/variables.tf new file mode 100644 index 000000000..70909ad7f --- /dev/null +++ b/infrastructure/terraform/modules/alarms-apigw/variables.tf @@ -0,0 +1,56 @@ +variable "alarm_prefix" { + type = string +} + +variable "api_name" { + type = string +} + +variable "stage_name" { + type = string +} + +variable "tags" { + type = map(string) + default = {} +} + +variable "error_5xx_threshold" { + type = number + default = 0 +} + +variable "error_5xx_period_seconds" { + type = number + default = 60 +} + +variable "error_5xx_evaluation_periods" { + type = number + default = 1 +} + +variable "latency_threshold_ms" { + type = number + default = 29000 +} + +variable "latency_period_seconds" { + type = number + default = 60 +} + +variable "latency_evaluation_periods" { + type = number + default = 5 +} + +variable "latency_datapoints_to_alarm" { + type = number + default = 3 +} + +variable "latency_anomaly_sensitivity" { + type = number + default = 2 +} diff --git a/infrastructure/terraform/modules/alarms-apigw/versions.tf b/infrastructure/terraform/modules/alarms-apigw/versions.tf new file mode 100644 index 000000000..f8dc86e97 --- /dev/null +++ b/infrastructure/terraform/modules/alarms-apigw/versions.tf @@ -0,0 +1,9 @@ + +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + } + } + required_version = ">= 1.9.0" +} diff --git a/infrastructure/terraform/modules/alarms-ddb/README.md b/infrastructure/terraform/modules/alarms-ddb/README.md new file mode 100644 index 000000000..b9c3b0c12 --- /dev/null +++ b/infrastructure/terraform/modules/alarms-ddb/README.md @@ -0,0 +1,29 @@ + + + + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.9.0 | +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [alarm\_prefix](#input\_alarm\_prefix) | n/a | `string` | n/a | yes | +| [evaluation\_periods](#input\_evaluation\_periods) | n/a | `number` | `1` | no | +| [period\_seconds](#input\_period\_seconds) | n/a | `number` | `60` | no | +| [read\_throttle\_threshold](#input\_read\_throttle\_threshold) | n/a | `number` | `0` | no | +| [table\_name](#input\_table\_name) | n/a | `string` | n/a | yes | +| [tags](#input\_tags) | n/a | `map(string)` | `{}` | no | +| [write\_throttle\_threshold](#input\_write\_throttle\_threshold) | n/a | `number` | `0` | no | +## Modules + +No modules. +## Outputs + +No outputs. + + + diff --git a/infrastructure/terraform/modules/alarms-ddb/main.tf b/infrastructure/terraform/modules/alarms-ddb/main.tf new file mode 100644 index 000000000..a7a046aa0 --- /dev/null +++ b/infrastructure/terraform/modules/alarms-ddb/main.tf @@ -0,0 +1,45 @@ +resource "aws_cloudwatch_metric_alarm" "read_throttle" { + alarm_name = "${var.alarm_prefix}-ddb-${var.table_name}-read-throttle" + alarm_description = "RELIABILITY: DynamoDB read throttling" + + namespace = "AWS/DynamoDB" + metric_name = "ReadThrottleEvents" + statistic = "Sum" + period = var.period_seconds + + evaluation_periods = var.evaluation_periods + threshold = var.read_throttle_threshold + comparison_operator = "GreaterThanThreshold" + treat_missing_data = "notBreaching" + + dimensions = { TableName = var.table_name } + + actions_enabled = false + alarm_actions = [] + ok_actions = [] + insufficient_data_actions = [] + tags = var.tags +} + +resource "aws_cloudwatch_metric_alarm" "write_throttle" { + alarm_name = "${var.alarm_prefix}-ddb-${var.table_name}-write-throttle" + alarm_description = "RELIABILITY: DynamoDB write throttling" + + namespace = "AWS/DynamoDB" + metric_name = "WriteThrottleEvents" + statistic = "Sum" + period = var.period_seconds + + evaluation_periods = var.evaluation_periods + threshold = var.write_throttle_threshold + comparison_operator = "GreaterThanThreshold" + treat_missing_data = "notBreaching" + + dimensions = { TableName = var.table_name } + + actions_enabled = false + alarm_actions = [] + ok_actions = [] + insufficient_data_actions = [] + tags = var.tags +} diff --git a/infrastructure/terraform/modules/alarms-ddb/variables.tf b/infrastructure/terraform/modules/alarms-ddb/variables.tf new file mode 100644 index 000000000..3895d21eb --- /dev/null +++ b/infrastructure/terraform/modules/alarms-ddb/variables.tf @@ -0,0 +1,32 @@ +variable "alarm_prefix" { + type = string +} + +variable "table_name" { + type = string +} + +variable "tags" { + type = map(string) + default = {} +} + +variable "period_seconds" { + type = number + default = 60 +} + +variable "evaluation_periods" { + type = number + default = 1 +} + +variable "read_throttle_threshold" { + type = number + default = 0 +} + +variable "write_throttle_threshold" { + type = number + default = 0 +} diff --git a/infrastructure/terraform/modules/alarms-ddb/versions.tf b/infrastructure/terraform/modules/alarms-ddb/versions.tf new file mode 100644 index 000000000..f8dc86e97 --- /dev/null +++ b/infrastructure/terraform/modules/alarms-ddb/versions.tf @@ -0,0 +1,9 @@ + +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + } + } + required_version = ">= 1.9.0" +} diff --git a/infrastructure/terraform/modules/alarms-lambda/README.md b/infrastructure/terraform/modules/alarms-lambda/README.md new file mode 100644 index 000000000..a865cb79d --- /dev/null +++ b/infrastructure/terraform/modules/alarms-lambda/README.md @@ -0,0 +1,36 @@ + + + + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.9.0 | +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [alarm\_prefix](#input\_alarm\_prefix) | n/a | `string` | n/a | yes | +| [enable\_error\_log\_metric](#input\_enable\_error\_log\_metric) | n/a | `bool` | `true` | no | +| [error\_log\_evaluation\_periods](#input\_error\_log\_evaluation\_periods) | n/a | `number` | `1` | no | +| [error\_log\_metric\_filter\_pattern](#input\_error\_log\_metric\_filter\_pattern) | n/a | `string` | `"?ERROR ?Error ?Exception"` | no | +| [error\_log\_metric\_name\_prefix](#input\_error\_log\_metric\_name\_prefix) | n/a | `string` | `"LambdaErrorLogs-"` | no | +| [error\_log\_metric\_namespace](#input\_error\_log\_metric\_namespace) | n/a | `string` | `"Custom/LambdaErrorLogs"` | no | +| [error\_log\_threshold](#input\_error\_log\_threshold) | n/a | `number` | `0` | no | +| [errors\_threshold](#input\_errors\_threshold) | n/a | `number` | `0` | no | +| [evaluation\_periods](#input\_evaluation\_periods) | n/a | `number` | `1` | no | +| [function\_name](#input\_function\_name) | n/a | `string` | n/a | yes | +| [log\_group\_name](#input\_log\_group\_name) | n/a | `string` | `""` | no | +| [period\_seconds](#input\_period\_seconds) | n/a | `number` | `300` | no | +| [tags](#input\_tags) | n/a | `map(string)` | `{}` | no | +| [throttles\_threshold](#input\_throttles\_threshold) | n/a | `number` | `0` | no | +## Modules + +No modules. +## Outputs + +No outputs. + + + diff --git a/infrastructure/terraform/modules/alarms-lambda/main.tf b/infrastructure/terraform/modules/alarms-lambda/main.tf new file mode 100644 index 000000000..a2b973329 --- /dev/null +++ b/infrastructure/terraform/modules/alarms-lambda/main.tf @@ -0,0 +1,80 @@ +resource "aws_cloudwatch_metric_alarm" "errors" { + alarm_name = "${var.alarm_prefix}-lambda-${var.function_name}-errors" + alarm_description = "RELIABILITY: Lambda errors" + + namespace = "AWS/Lambda" + metric_name = "Errors" + statistic = "Sum" + period = var.period_seconds + + evaluation_periods = var.evaluation_periods + threshold = var.errors_threshold + comparison_operator = "GreaterThanThreshold" + treat_missing_data = "notBreaching" + + dimensions = { FunctionName = var.function_name } + + actions_enabled = false + alarm_actions = [] + ok_actions = [] + insufficient_data_actions = [] + tags = var.tags +} + +resource "aws_cloudwatch_metric_alarm" "throttles" { + alarm_name = "${var.alarm_prefix}-lambda-${var.function_name}-throttles" + alarm_description = "RELIABILITY: Lambda throttles" + + namespace = "AWS/Lambda" + metric_name = "Throttles" + statistic = "Sum" + period = var.period_seconds + + evaluation_periods = var.evaluation_periods + threshold = var.throttles_threshold + comparison_operator = "GreaterThanThreshold" + treat_missing_data = "notBreaching" + + dimensions = { FunctionName = var.function_name } + + actions_enabled = false + alarm_actions = [] + ok_actions = [] + insufficient_data_actions = [] + tags = var.tags +} + +resource "aws_cloudwatch_log_metric_filter" "error_logs" { + count = var.enable_error_log_metric ? 1 : 0 + name = "${var.alarm_prefix}-lambda-${var.function_name}-error-logs" + log_group_name = var.log_group_name + pattern = var.error_log_metric_filter_pattern + + metric_transformation { + name = "${var.error_log_metric_name_prefix}${var.function_name}" + namespace = var.error_log_metric_namespace + value = "1" + } +} + +resource "aws_cloudwatch_metric_alarm" "error_logs" { + count = var.enable_error_log_metric ? 1 : 0 + alarm_name = "${var.alarm_prefix}-lambda-${var.function_name}-error-logs" + alarm_description = "RELIABILITY: Lambda error logs detected" + + namespace = var.error_log_metric_namespace + metric_name = "${var.error_log_metric_name_prefix}${var.function_name}" + statistic = "Sum" + period = var.period_seconds + + evaluation_periods = var.error_log_evaluation_periods + threshold = var.error_log_threshold + comparison_operator = "GreaterThanThreshold" + treat_missing_data = "notBreaching" + + actions_enabled = false + alarm_actions = [] + ok_actions = [] + insufficient_data_actions = [] + tags = var.tags +} diff --git a/infrastructure/terraform/modules/alarms-lambda/variables.tf b/infrastructure/terraform/modules/alarms-lambda/variables.tf new file mode 100644 index 000000000..4fe998606 --- /dev/null +++ b/infrastructure/terraform/modules/alarms-lambda/variables.tf @@ -0,0 +1,67 @@ +variable "alarm_prefix" { + type = string +} + +variable "function_name" { + type = string +} + +variable "log_group_name" { + type = string + default = "" +} + +variable "tags" { + type = map(string) + default = {} +} + +variable "period_seconds" { + type = number + default = 300 +} + +variable "evaluation_periods" { + type = number + default = 1 +} + +variable "errors_threshold" { + type = number + default = 0 +} + +variable "throttles_threshold" { + type = number + default = 0 +} + +variable "enable_error_log_metric" { + type = bool + default = true +} + +variable "error_log_metric_namespace" { + type = string + default = "Custom/LambdaErrorLogs" +} + +variable "error_log_metric_name_prefix" { + type = string + default = "LambdaErrorLogs-" +} + +variable "error_log_metric_filter_pattern" { + type = string + default = "?ERROR ?Error ?Exception" +} + +variable "error_log_threshold" { + type = number + default = 0 +} + +variable "error_log_evaluation_periods" { + type = number + default = 1 +} diff --git a/infrastructure/terraform/modules/alarms-lambda/versions.tf b/infrastructure/terraform/modules/alarms-lambda/versions.tf new file mode 100644 index 000000000..f8dc86e97 --- /dev/null +++ b/infrastructure/terraform/modules/alarms-lambda/versions.tf @@ -0,0 +1,9 @@ + +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + } + } + required_version = ">= 1.9.0" +} diff --git a/infrastructure/terraform/modules/alarms-sqs/README.md b/infrastructure/terraform/modules/alarms-sqs/README.md new file mode 100644 index 000000000..b02b320d3 --- /dev/null +++ b/infrastructure/terraform/modules/alarms-sqs/README.md @@ -0,0 +1,31 @@ + + + + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.9.0 | +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [age\_anomaly\_datapoints\_to\_alarm](#input\_age\_anomaly\_datapoints\_to\_alarm) | n/a | `number` | `3` | no | +| [age\_anomaly\_evaluation\_periods](#input\_age\_anomaly\_evaluation\_periods) | n/a | `number` | `5` | no | +| [age\_anomaly\_sensitivity](#input\_age\_anomaly\_sensitivity) | n/a | `number` | `2` | no | +| [age\_period\_seconds](#input\_age\_period\_seconds) | n/a | `number` | `60` | no | +| [alarm\_prefix](#input\_alarm\_prefix) | n/a | `string` | n/a | yes | +| [dlq\_queue\_name](#input\_dlq\_queue\_name) | n/a | `string` | `null` | no | +| [dlq\_visible\_threshold](#input\_dlq\_visible\_threshold) | n/a | `number` | `0` | no | +| [queue\_name](#input\_queue\_name) | n/a | `string` | n/a | yes | +| [tags](#input\_tags) | n/a | `map(string)` | `{}` | no | +## Modules + +No modules. +## Outputs + +No outputs. + + + diff --git a/infrastructure/terraform/modules/alarms-sqs/main.tf b/infrastructure/terraform/modules/alarms-sqs/main.tf new file mode 100644 index 000000000..e90237f23 --- /dev/null +++ b/infrastructure/terraform/modules/alarms-sqs/main.tf @@ -0,0 +1,62 @@ +locals { + queue_dimensions = { QueueName = var.queue_name } +} + +resource "aws_cloudwatch_metric_alarm" "age_anomaly" { + alarm_name = "${var.alarm_prefix}-sqs-${var.queue_name}-age-anomaly" + alarm_description = "RELIABILITY: SQS oldest message age anomaly" + comparison_operator = "GreaterThanUpperThreshold" + evaluation_periods = var.age_anomaly_evaluation_periods + datapoints_to_alarm = var.age_anomaly_datapoints_to_alarm + threshold_metric_id = "ad1" + treat_missing_data = "notBreaching" + + actions_enabled = false + alarm_actions = [] + ok_actions = [] + insufficient_data_actions = [] + tags = var.tags + + metric_query { + id = "m1" + metric { + metric_name = "ApproximateAgeOfOldestMessage" + namespace = "AWS/SQS" + stat = "Maximum" + period = var.age_period_seconds + dimensions = local.queue_dimensions + } + return_data = true + } + + metric_query { + id = "ad1" + expression = "ANOMALY_DETECTION_BAND(m1, ${var.age_anomaly_sensitivity})" + label = "AgeOfOldestMessage (expected)" + return_data = false + } +} + +resource "aws_cloudwatch_metric_alarm" "dlq_depth" { + count = var.dlq_queue_name == null ? 0 : 1 + alarm_name = "${var.alarm_prefix}-sqs-${var.dlq_queue_name}-dlq-depth" + alarm_description = "RELIABILITY: SQS DLQ has messages" + + namespace = "AWS/SQS" + metric_name = "ApproximateNumberOfMessagesVisible" + statistic = "Sum" + period = 60 + + evaluation_periods = 1 + threshold = var.dlq_visible_threshold + comparison_operator = "GreaterThanThreshold" + treat_missing_data = "notBreaching" + + dimensions = { QueueName = var.dlq_queue_name } + + actions_enabled = false + alarm_actions = [] + ok_actions = [] + insufficient_data_actions = [] + tags = var.tags +} diff --git a/infrastructure/terraform/modules/alarms-sqs/variables.tf b/infrastructure/terraform/modules/alarms-sqs/variables.tf new file mode 100644 index 000000000..b75f2fd45 --- /dev/null +++ b/infrastructure/terraform/modules/alarms-sqs/variables.tf @@ -0,0 +1,42 @@ +variable "alarm_prefix" { + type = string +} + +variable "queue_name" { + type = string +} + +variable "dlq_queue_name" { + type = string + default = null +} + +variable "tags" { + type = map(string) + default = {} +} + +variable "age_period_seconds" { + type = number + default = 60 +} + +variable "age_anomaly_sensitivity" { + type = number + default = 2 +} + +variable "age_anomaly_evaluation_periods" { + type = number + default = 5 +} + +variable "age_anomaly_datapoints_to_alarm" { + type = number + default = 3 +} + +variable "dlq_visible_threshold" { + type = number + default = 0 +} diff --git a/infrastructure/terraform/modules/alarms-sqs/versions.tf b/infrastructure/terraform/modules/alarms-sqs/versions.tf new file mode 100644 index 000000000..f8dc86e97 --- /dev/null +++ b/infrastructure/terraform/modules/alarms-sqs/versions.tf @@ -0,0 +1,9 @@ + +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + } + } + required_version = ">= 1.9.0" +} From bc7febe02710362dc3714732b5dad4f2b9938340 Mon Sep 17 00:00:00 2001 From: Francisco Videira Date: Tue, 10 Feb 2026 14:06:15 +0000 Subject: [PATCH 2/6] Add alarms dir and fix anomaly metric --- .../terraform/components/api/README.md | 12 ++++----- .../terraform/components/api/alarms.tf | 27 ++++++++++++------- .../{ => alarms}/alarms-apigw/README.md | 0 .../modules/{ => alarms}/alarms-apigw/main.tf | 3 +-- .../{ => alarms}/alarms-apigw/variables.tf | 0 .../{ => alarms}/alarms-apigw/versions.tf | 0 .../modules/{ => alarms}/alarms-ddb/README.md | 0 .../modules/{ => alarms}/alarms-ddb/main.tf | 0 .../{ => alarms}/alarms-ddb/variables.tf | 0 .../{ => alarms}/alarms-ddb/versions.tf | 0 .../{ => alarms}/alarms-lambda/README.md | 0 .../{ => alarms}/alarms-lambda/main.tf | 0 .../{ => alarms}/alarms-lambda/variables.tf | 0 .../{ => alarms}/alarms-lambda/versions.tf | 0 .../modules/{ => alarms}/alarms-sqs/README.md | 0 .../modules/{ => alarms}/alarms-sqs/main.tf | 3 +-- .../{ => alarms}/alarms-sqs/variables.tf | 0 .../{ => alarms}/alarms-sqs/versions.tf | 0 18 files changed, 25 insertions(+), 20 deletions(-) rename infrastructure/terraform/modules/{ => alarms}/alarms-apigw/README.md (100%) rename infrastructure/terraform/modules/{ => alarms}/alarms-apigw/main.tf (99%) rename infrastructure/terraform/modules/{ => alarms}/alarms-apigw/variables.tf (100%) rename infrastructure/terraform/modules/{ => alarms}/alarms-apigw/versions.tf (100%) rename infrastructure/terraform/modules/{ => alarms}/alarms-ddb/README.md (100%) rename infrastructure/terraform/modules/{ => alarms}/alarms-ddb/main.tf (100%) rename infrastructure/terraform/modules/{ => alarms}/alarms-ddb/variables.tf (100%) rename infrastructure/terraform/modules/{ => alarms}/alarms-ddb/versions.tf (100%) rename infrastructure/terraform/modules/{ => alarms}/alarms-lambda/README.md (100%) rename infrastructure/terraform/modules/{ => alarms}/alarms-lambda/main.tf (100%) rename infrastructure/terraform/modules/{ => alarms}/alarms-lambda/variables.tf (100%) rename infrastructure/terraform/modules/{ => alarms}/alarms-lambda/versions.tf (100%) rename infrastructure/terraform/modules/{ => alarms}/alarms-sqs/README.md (100%) rename infrastructure/terraform/modules/{ => alarms}/alarms-sqs/main.tf (98%) rename infrastructure/terraform/modules/{ => alarms}/alarms-sqs/variables.tf (100%) rename infrastructure/terraform/modules/{ => alarms}/alarms-sqs/versions.tf (100%) diff --git a/infrastructure/terraform/components/api/README.md b/infrastructure/terraform/components/api/README.md index 74b46a79a..b3e8e2517 100644 --- a/infrastructure/terraform/components/api/README.md +++ b/infrastructure/terraform/components/api/README.md @@ -42,11 +42,11 @@ No requirements. | Name | Source | Version | |------|--------|---------| -| [apigw\_alarms](#module\_apigw\_alarms) | ../../modules/alarms-apigw | n/a | +| [apigw\_alarms](#module\_apigw\_alarms) | ../../modules/alarms/alarms-apigw | n/a | | [authorizer\_lambda](#module\_authorizer\_lambda) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a | -| [ddb\_alarms\_letters](#module\_ddb\_alarms\_letters) | ../../modules/alarms-ddb | n/a | -| [ddb\_alarms\_mi](#module\_ddb\_alarms\_mi) | ../../modules/alarms-ddb | n/a | -| [ddb\_alarms\_suppliers](#module\_ddb\_alarms\_suppliers) | ../../modules/alarms-ddb | n/a | +| [ddb\_alarms\_letters](#module\_ddb\_alarms\_letters) | ../../modules/alarms/alarms-ddb | n/a | +| [ddb\_alarms\_mi](#module\_ddb\_alarms\_mi) | ../../modules/alarms/alarms-ddb | n/a | +| [ddb\_alarms\_suppliers](#module\_ddb\_alarms\_suppliers) | ../../modules/alarms/alarms-ddb | n/a | | [domain\_truststore](#module\_domain\_truststore) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.26/terraform-s3bucket.zip | n/a | | [eventpub](#module\_eventpub) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.26/terraform-eventpub.zip | n/a | | [eventsub](#module\_eventsub) | ../../modules/eventsub | n/a | @@ -55,7 +55,7 @@ No requirements. | [get\_letters](#module\_get\_letters) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a | | [get\_status](#module\_get\_status) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a | | [kms](#module\_kms) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.26/terraform-kms.zip | n/a | -| [lambda\_alarms](#module\_lambda\_alarms) | ../../modules/alarms-lambda | n/a | +| [lambda\_alarms](#module\_lambda\_alarms) | ../../modules/alarms/alarms-lambda | n/a | | [letter\_status\_update](#module\_letter\_status\_update) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a | | [letter\_status\_updates\_queue](#module\_letter\_status\_updates\_queue) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.24/terraform-sqs.zip | n/a | | [letter\_updates\_transformer](#module\_letter\_updates\_transformer) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a | @@ -65,7 +65,7 @@ No requirements. | [post\_letters](#module\_post\_letters) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a | | [post\_mi](#module\_post\_mi) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a | | [s3bucket\_test\_letters](#module\_s3bucket\_test\_letters) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.26/terraform-s3bucket.zip | n/a | -| [sqs\_alarms](#module\_sqs\_alarms) | ../../modules/alarms-sqs | n/a | +| [sqs\_alarms](#module\_sqs\_alarms) | ../../modules/alarms/alarms-sqs | n/a | | [sqs\_letter\_updates](#module\_sqs\_letter\_updates) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.26/terraform-sqs.zip | n/a | | [supplier\_ssl](#module\_supplier\_ssl) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.26/terraform-ssl.zip | n/a | | [upsert\_letter](#module\_upsert\_letter) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a | diff --git a/infrastructure/terraform/components/api/alarms.tf b/infrastructure/terraform/components/api/alarms.tf index 207cf20bb..28358b082 100644 --- a/infrastructure/terraform/components/api/alarms.tf +++ b/infrastructure/terraform/components/api/alarms.tf @@ -15,14 +15,20 @@ locals { } sqs_queue_names = { - letter_updates = module.sqs_letter_updates.sqs_queue_name - letter_status_updates = module.letter_status_updates_queue.sqs_queue_name + letter_updates = { + name = module.sqs_letter_updates.sqs_queue_name + age_period_seconds = 900 + } + letter_status_updates = { + name = module.letter_status_updates_queue.sqs_queue_name + age_period_seconds = 300 + } } } module "lambda_alarms" { for_each = local.lambda_alarm_targets - source = "../../modules/alarms-lambda" + source = "../../modules/alarms/alarms-lambda" alarm_prefix = local.csi function_name = each.value @@ -31,21 +37,21 @@ module "lambda_alarms" { } module "ddb_alarms_letters" { - source = "../../modules/alarms-ddb" + source = "../../modules/alarms/alarms-ddb" alarm_prefix = local.csi table_name = aws_dynamodb_table.letters.name tags = local.default_tags } module "ddb_alarms_mi" { - source = "../../modules/alarms-ddb" + source = "../../modules/alarms/alarms-ddb" alarm_prefix = local.csi table_name = aws_dynamodb_table.mi.name tags = local.default_tags } module "ddb_alarms_suppliers" { - source = "../../modules/alarms-ddb" + source = "../../modules/alarms/alarms-ddb" alarm_prefix = local.csi table_name = aws_dynamodb_table.suppliers.name tags = local.default_tags @@ -53,16 +59,17 @@ module "ddb_alarms_suppliers" { module "sqs_alarms" { for_each = local.sqs_queue_names - source = "../../modules/alarms-sqs" + source = "../../modules/alarms/alarms-sqs" alarm_prefix = local.csi - queue_name = each.value - dlq_queue_name = replace(each.value, "-queue", "-dlq") + queue_name = each.value.name + dlq_queue_name = replace(each.value.name, "-queue", "-dlq") + age_period_seconds = each.value.age_period_seconds tags = local.default_tags } module "apigw_alarms" { - source = "../../modules/alarms-apigw" + source = "../../modules/alarms/alarms-apigw" alarm_prefix = local.csi api_name = aws_api_gateway_rest_api.main.name stage_name = aws_api_gateway_stage.main.stage_name diff --git a/infrastructure/terraform/modules/alarms-apigw/README.md b/infrastructure/terraform/modules/alarms/alarms-apigw/README.md similarity index 100% rename from infrastructure/terraform/modules/alarms-apigw/README.md rename to infrastructure/terraform/modules/alarms/alarms-apigw/README.md diff --git a/infrastructure/terraform/modules/alarms-apigw/main.tf b/infrastructure/terraform/modules/alarms/alarms-apigw/main.tf similarity index 99% rename from infrastructure/terraform/modules/alarms-apigw/main.tf rename to infrastructure/terraform/modules/alarms/alarms-apigw/main.tf index 3ae9c092c..4c8bb1fda 100644 --- a/infrastructure/terraform/modules/alarms-apigw/main.tf +++ b/infrastructure/terraform/modules/alarms/alarms-apigw/main.tf @@ -75,13 +75,12 @@ resource "aws_cloudwatch_metric_alarm" "latency_anomaly" { period = var.latency_period_seconds dimensions = local.api_dimensions } - return_data = true } metric_query { id = "ad1" expression = "ANOMALY_DETECTION_BAND(m1, ${var.latency_anomaly_sensitivity})" label = "Latency (expected)" - return_data = false + return_data = true } } diff --git a/infrastructure/terraform/modules/alarms-apigw/variables.tf b/infrastructure/terraform/modules/alarms/alarms-apigw/variables.tf similarity index 100% rename from infrastructure/terraform/modules/alarms-apigw/variables.tf rename to infrastructure/terraform/modules/alarms/alarms-apigw/variables.tf diff --git a/infrastructure/terraform/modules/alarms-apigw/versions.tf b/infrastructure/terraform/modules/alarms/alarms-apigw/versions.tf similarity index 100% rename from infrastructure/terraform/modules/alarms-apigw/versions.tf rename to infrastructure/terraform/modules/alarms/alarms-apigw/versions.tf diff --git a/infrastructure/terraform/modules/alarms-ddb/README.md b/infrastructure/terraform/modules/alarms/alarms-ddb/README.md similarity index 100% rename from infrastructure/terraform/modules/alarms-ddb/README.md rename to infrastructure/terraform/modules/alarms/alarms-ddb/README.md diff --git a/infrastructure/terraform/modules/alarms-ddb/main.tf b/infrastructure/terraform/modules/alarms/alarms-ddb/main.tf similarity index 100% rename from infrastructure/terraform/modules/alarms-ddb/main.tf rename to infrastructure/terraform/modules/alarms/alarms-ddb/main.tf diff --git a/infrastructure/terraform/modules/alarms-ddb/variables.tf b/infrastructure/terraform/modules/alarms/alarms-ddb/variables.tf similarity index 100% rename from infrastructure/terraform/modules/alarms-ddb/variables.tf rename to infrastructure/terraform/modules/alarms/alarms-ddb/variables.tf diff --git a/infrastructure/terraform/modules/alarms-ddb/versions.tf b/infrastructure/terraform/modules/alarms/alarms-ddb/versions.tf similarity index 100% rename from infrastructure/terraform/modules/alarms-ddb/versions.tf rename to infrastructure/terraform/modules/alarms/alarms-ddb/versions.tf diff --git a/infrastructure/terraform/modules/alarms-lambda/README.md b/infrastructure/terraform/modules/alarms/alarms-lambda/README.md similarity index 100% rename from infrastructure/terraform/modules/alarms-lambda/README.md rename to infrastructure/terraform/modules/alarms/alarms-lambda/README.md diff --git a/infrastructure/terraform/modules/alarms-lambda/main.tf b/infrastructure/terraform/modules/alarms/alarms-lambda/main.tf similarity index 100% rename from infrastructure/terraform/modules/alarms-lambda/main.tf rename to infrastructure/terraform/modules/alarms/alarms-lambda/main.tf diff --git a/infrastructure/terraform/modules/alarms-lambda/variables.tf b/infrastructure/terraform/modules/alarms/alarms-lambda/variables.tf similarity index 100% rename from infrastructure/terraform/modules/alarms-lambda/variables.tf rename to infrastructure/terraform/modules/alarms/alarms-lambda/variables.tf diff --git a/infrastructure/terraform/modules/alarms-lambda/versions.tf b/infrastructure/terraform/modules/alarms/alarms-lambda/versions.tf similarity index 100% rename from infrastructure/terraform/modules/alarms-lambda/versions.tf rename to infrastructure/terraform/modules/alarms/alarms-lambda/versions.tf diff --git a/infrastructure/terraform/modules/alarms-sqs/README.md b/infrastructure/terraform/modules/alarms/alarms-sqs/README.md similarity index 100% rename from infrastructure/terraform/modules/alarms-sqs/README.md rename to infrastructure/terraform/modules/alarms/alarms-sqs/README.md diff --git a/infrastructure/terraform/modules/alarms-sqs/main.tf b/infrastructure/terraform/modules/alarms/alarms-sqs/main.tf similarity index 98% rename from infrastructure/terraform/modules/alarms-sqs/main.tf rename to infrastructure/terraform/modules/alarms/alarms-sqs/main.tf index e90237f23..6a9647c29 100644 --- a/infrastructure/terraform/modules/alarms-sqs/main.tf +++ b/infrastructure/terraform/modules/alarms/alarms-sqs/main.tf @@ -26,14 +26,13 @@ resource "aws_cloudwatch_metric_alarm" "age_anomaly" { period = var.age_period_seconds dimensions = local.queue_dimensions } - return_data = true } metric_query { id = "ad1" expression = "ANOMALY_DETECTION_BAND(m1, ${var.age_anomaly_sensitivity})" label = "AgeOfOldestMessage (expected)" - return_data = false + return_data = true } } diff --git a/infrastructure/terraform/modules/alarms-sqs/variables.tf b/infrastructure/terraform/modules/alarms/alarms-sqs/variables.tf similarity index 100% rename from infrastructure/terraform/modules/alarms-sqs/variables.tf rename to infrastructure/terraform/modules/alarms/alarms-sqs/variables.tf diff --git a/infrastructure/terraform/modules/alarms-sqs/versions.tf b/infrastructure/terraform/modules/alarms/alarms-sqs/versions.tf similarity index 100% rename from infrastructure/terraform/modules/alarms-sqs/versions.tf rename to infrastructure/terraform/modules/alarms/alarms-sqs/versions.tf From fb61f97935b95d04b518223dd9a1b8e83b6ba1e5 Mon Sep 17 00:00:00 2001 From: Francisco Videira Date: Tue, 10 Feb 2026 14:08:45 +0000 Subject: [PATCH 3/6] add alarms readme --- .../terraform/modules/alarms/README.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 infrastructure/terraform/modules/alarms/README.md diff --git a/infrastructure/terraform/modules/alarms/README.md b/infrastructure/terraform/modules/alarms/README.md new file mode 100644 index 000000000..df8c1f5c0 --- /dev/null +++ b/infrastructure/terraform/modules/alarms/README.md @@ -0,0 +1,19 @@ + + + + +## Requirements + +No requirements. +## Inputs + +No inputs. +## Modules + +No modules. +## Outputs + +No outputs. + + + From e6cd9b51339775adb465759c54dd9ff38dfadab8 Mon Sep 17 00:00:00 2001 From: Francisco Videira Date: Tue, 10 Feb 2026 14:57:28 +0000 Subject: [PATCH 4/6] fix return data --- infrastructure/terraform/modules/alarms/alarms-apigw/main.tf | 1 + infrastructure/terraform/modules/alarms/alarms-sqs/main.tf | 1 + 2 files changed, 2 insertions(+) diff --git a/infrastructure/terraform/modules/alarms/alarms-apigw/main.tf b/infrastructure/terraform/modules/alarms/alarms-apigw/main.tf index 4c8bb1fda..4c376944b 100644 --- a/infrastructure/terraform/modules/alarms/alarms-apigw/main.tf +++ b/infrastructure/terraform/modules/alarms/alarms-apigw/main.tf @@ -75,6 +75,7 @@ resource "aws_cloudwatch_metric_alarm" "latency_anomaly" { period = var.latency_period_seconds dimensions = local.api_dimensions } + return_data = true } metric_query { diff --git a/infrastructure/terraform/modules/alarms/alarms-sqs/main.tf b/infrastructure/terraform/modules/alarms/alarms-sqs/main.tf index 6a9647c29..32acc0fc3 100644 --- a/infrastructure/terraform/modules/alarms/alarms-sqs/main.tf +++ b/infrastructure/terraform/modules/alarms/alarms-sqs/main.tf @@ -26,6 +26,7 @@ resource "aws_cloudwatch_metric_alarm" "age_anomaly" { period = var.age_period_seconds dimensions = local.queue_dimensions } + return_data = true } metric_query { From 7f459814729c67c47885dcfea77a7fb67160f65c Mon Sep 17 00:00:00 2001 From: Francisco Videira Date: Wed, 11 Feb 2026 17:42:33 +0000 Subject: [PATCH 5/6] Fix some values and increase cert expiry from 14 to 30 --- .../terraform/components/api/module_authorizer_lambda.tf | 2 +- infrastructure/terraform/modules/alarms/alarms-lambda/main.tf | 4 ++-- .../terraform/modules/alarms/alarms-lambda/variables.tf | 2 +- .../terraform/modules/alarms/alarms-sqs/variables.tf | 2 +- lambdas/authorizer/src/__tests__/index.test.ts | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/infrastructure/terraform/components/api/module_authorizer_lambda.tf b/infrastructure/terraform/components/api/module_authorizer_lambda.tf index 7e3c94b8b..c90a7d303 100644 --- a/infrastructure/terraform/components/api/module_authorizer_lambda.tf +++ b/infrastructure/terraform/components/api/module_authorizer_lambda.tf @@ -36,7 +36,7 @@ module "authorizer_lambda" { lambda_env_vars = { CLOUDWATCH_NAMESPACE = "/aws/api-gateway/supplier/alarms", - CLIENT_CERTIFICATE_EXPIRATION_ALERT_DAYS = 14, + CLIENT_CERTIFICATE_EXPIRATION_ALERT_DAYS = 30, APIM_SUPPLIER_ID_HEADER = "NHSD-Supplier-ID", SUPPLIERS_TABLE_NAME = aws_dynamodb_table.suppliers.name } diff --git a/infrastructure/terraform/modules/alarms/alarms-lambda/main.tf b/infrastructure/terraform/modules/alarms/alarms-lambda/main.tf index a2b973329..e17ec4ca7 100644 --- a/infrastructure/terraform/modules/alarms/alarms-lambda/main.tf +++ b/infrastructure/terraform/modules/alarms/alarms-lambda/main.tf @@ -1,6 +1,6 @@ resource "aws_cloudwatch_metric_alarm" "errors" { alarm_name = "${var.alarm_prefix}-lambda-${var.function_name}-errors" - alarm_description = "RELIABILITY: Lambda errors" + alarm_description = "ERROR: Lambda errors" namespace = "AWS/Lambda" metric_name = "Errors" @@ -60,7 +60,7 @@ resource "aws_cloudwatch_log_metric_filter" "error_logs" { resource "aws_cloudwatch_metric_alarm" "error_logs" { count = var.enable_error_log_metric ? 1 : 0 alarm_name = "${var.alarm_prefix}-lambda-${var.function_name}-error-logs" - alarm_description = "RELIABILITY: Lambda error logs detected" + alarm_description = "ERROR: Lambda error logs detected" namespace = var.error_log_metric_namespace metric_name = "${var.error_log_metric_name_prefix}${var.function_name}" diff --git a/infrastructure/terraform/modules/alarms/alarms-lambda/variables.tf b/infrastructure/terraform/modules/alarms/alarms-lambda/variables.tf index 4fe998606..5da36fd53 100644 --- a/infrastructure/terraform/modules/alarms/alarms-lambda/variables.tf +++ b/infrastructure/terraform/modules/alarms/alarms-lambda/variables.tf @@ -53,7 +53,7 @@ variable "error_log_metric_name_prefix" { variable "error_log_metric_filter_pattern" { type = string - default = "?ERROR ?Error ?Exception" + default = "{ ($.level = \"50\" || $.level = \"error\") && $.environment = * }" } variable "error_log_threshold" { diff --git a/infrastructure/terraform/modules/alarms/alarms-sqs/variables.tf b/infrastructure/terraform/modules/alarms/alarms-sqs/variables.tf index b75f2fd45..762c15eac 100644 --- a/infrastructure/terraform/modules/alarms/alarms-sqs/variables.tf +++ b/infrastructure/terraform/modules/alarms/alarms-sqs/variables.tf @@ -23,7 +23,7 @@ variable "age_period_seconds" { variable "age_anomaly_sensitivity" { type = number - default = 2 + default = 3 } variable "age_anomaly_evaluation_periods" { diff --git a/lambdas/authorizer/src/__tests__/index.test.ts b/lambdas/authorizer/src/__tests__/index.test.ts index e7567c70e..5b00bf69d 100644 --- a/lambdas/authorizer/src/__tests__/index.test.ts +++ b/lambdas/authorizer/src/__tests__/index.test.ts @@ -13,7 +13,7 @@ const mockedDeps: jest.Mocked = { logger: { info: jest.fn(), error: jest.fn() } as unknown as pino.Logger, env: { CLOUDWATCH_NAMESPACE: "cloudwatch-namespace", - CLIENT_CERTIFICATE_EXPIRATION_ALERT_DAYS: 14, + CLIENT_CERTIFICATE_EXPIRATION_ALERT_DAYS: 30, APIM_SUPPLIER_ID_HEADER: "NHSD-Supplier-ID", } as unknown as EnvVars, supplierRepo: { From a5ffa22a272119e80cf27dc0073e07b00a610f19 Mon Sep 17 00:00:00 2001 From: Francisco Videira Date: Thu, 12 Feb 2026 14:33:36 +0000 Subject: [PATCH 6/6] Make sqs msg age alarms same period; some tf lint --- .../terraform/components/api/alarms.tf | 22 +++++++++---------- .../modules/alarms/alarms-lambda/main.tf | 6 ++--- .../modules/alarms/alarms-sqs/main.tf | 6 ++--- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/infrastructure/terraform/components/api/alarms.tf b/infrastructure/terraform/components/api/alarms.tf index 28358b082..5d4ade605 100644 --- a/infrastructure/terraform/components/api/alarms.tf +++ b/infrastructure/terraform/components/api/alarms.tf @@ -16,19 +16,19 @@ locals { sqs_queue_names = { letter_updates = { - name = module.sqs_letter_updates.sqs_queue_name + name = module.sqs_letter_updates.sqs_queue_name age_period_seconds = 900 } letter_status_updates = { - name = module.letter_status_updates_queue.sqs_queue_name - age_period_seconds = 300 + name = module.letter_status_updates_queue.sqs_queue_name + age_period_seconds = 900 } } } module "lambda_alarms" { - for_each = local.lambda_alarm_targets - source = "../../modules/alarms/alarms-lambda" + for_each = local.lambda_alarm_targets + source = "../../modules/alarms/alarms-lambda" alarm_prefix = local.csi function_name = each.value @@ -58,14 +58,14 @@ module "ddb_alarms_suppliers" { } module "sqs_alarms" { - for_each = local.sqs_queue_names - source = "../../modules/alarms/alarms-sqs" + for_each = local.sqs_queue_names + source = "../../modules/alarms/alarms-sqs" - alarm_prefix = local.csi - queue_name = each.value.name - dlq_queue_name = replace(each.value.name, "-queue", "-dlq") + alarm_prefix = local.csi + queue_name = each.value.name + dlq_queue_name = replace(each.value.name, "-queue", "-dlq") age_period_seconds = each.value.age_period_seconds - tags = local.default_tags + tags = local.default_tags } module "apigw_alarms" { diff --git a/infrastructure/terraform/modules/alarms/alarms-lambda/main.tf b/infrastructure/terraform/modules/alarms/alarms-lambda/main.tf index e17ec4ca7..033d17988 100644 --- a/infrastructure/terraform/modules/alarms/alarms-lambda/main.tf +++ b/infrastructure/terraform/modules/alarms/alarms-lambda/main.tf @@ -58,9 +58,9 @@ resource "aws_cloudwatch_log_metric_filter" "error_logs" { } resource "aws_cloudwatch_metric_alarm" "error_logs" { - count = var.enable_error_log_metric ? 1 : 0 - alarm_name = "${var.alarm_prefix}-lambda-${var.function_name}-error-logs" - alarm_description = "ERROR: Lambda error logs detected" + count = var.enable_error_log_metric ? 1 : 0 + alarm_name = "${var.alarm_prefix}-lambda-${var.function_name}-error-logs" + alarm_description = "ERROR: Lambda error logs detected" namespace = var.error_log_metric_namespace metric_name = "${var.error_log_metric_name_prefix}${var.function_name}" diff --git a/infrastructure/terraform/modules/alarms/alarms-sqs/main.tf b/infrastructure/terraform/modules/alarms/alarms-sqs/main.tf index 32acc0fc3..1a706973a 100644 --- a/infrastructure/terraform/modules/alarms/alarms-sqs/main.tf +++ b/infrastructure/terraform/modules/alarms/alarms-sqs/main.tf @@ -38,9 +38,9 @@ resource "aws_cloudwatch_metric_alarm" "age_anomaly" { } resource "aws_cloudwatch_metric_alarm" "dlq_depth" { - count = var.dlq_queue_name == null ? 0 : 1 - alarm_name = "${var.alarm_prefix}-sqs-${var.dlq_queue_name}-dlq-depth" - alarm_description = "RELIABILITY: SQS DLQ has messages" + count = var.dlq_queue_name == null ? 0 : 1 + alarm_name = "${var.alarm_prefix}-sqs-${var.dlq_queue_name}-dlq-depth" + alarm_description = "RELIABILITY: SQS DLQ has messages" namespace = "AWS/SQS" metric_name = "ApproximateNumberOfMessagesVisible"