diff --git a/infrastructure/terraform/components/api/README.md b/infrastructure/terraform/components/api/README.md index 75faa887..fdfbaf53 100644 --- a/infrastructure/terraform/components/api/README.md +++ b/infrastructure/terraform/components/api/README.md @@ -43,7 +43,11 @@ No requirements. | Name | Source | Version | |------|--------|---------| +| [apigw\_alarms](#module\_apigw\_alarms) | ../../modules/alarms/alarms-apigw | n/a | | [authorizer\_lambda](#module\_authorizer\_lambda) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a | +| [ddb\_alarms\_letters](#module\_ddb\_alarms\_letters) | ../../modules/alarms/alarms-ddb | n/a | +| [ddb\_alarms\_mi](#module\_ddb\_alarms\_mi) | ../../modules/alarms/alarms-ddb | n/a | +| [ddb\_alarms\_suppliers](#module\_ddb\_alarms\_suppliers) | ../../modules/alarms/alarms-ddb | n/a | | [domain\_truststore](#module\_domain\_truststore) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.26/terraform-s3bucket.zip | n/a | | [eventpub](#module\_eventpub) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.31/terraform-eventpub.zip | n/a | | [eventsub](#module\_eventsub) | ../../modules/eventsub | n/a | @@ -52,6 +56,7 @@ No requirements. 
| [get\_letters](#module\_get\_letters) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a | | [get\_status](#module\_get\_status) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a | | [kms](#module\_kms) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.26/terraform-kms.zip | n/a | +| [lambda\_alarms](#module\_lambda\_alarms) | ../../modules/alarms/alarms-lambda | n/a | | [letter\_status\_update](#module\_letter\_status\_update) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a | | [letter\_status\_updates\_queue](#module\_letter\_status\_updates\_queue) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.24/terraform-sqs.zip | n/a | | [letter\_updates\_transformer](#module\_letter\_updates\_transformer) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a | @@ -61,6 +66,7 @@ No requirements. 
| [post\_letters](#module\_post\_letters) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a | | [post\_mi](#module\_post\_mi) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a | | [s3bucket\_test\_letters](#module\_s3bucket\_test\_letters) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.26/terraform-s3bucket.zip | n/a | +| [sqs\_alarms](#module\_sqs\_alarms) | ../../modules/alarms/alarms-sqs | n/a | | [sqs\_letter\_updates](#module\_sqs\_letter\_updates) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.26/terraform-sqs.zip | n/a | | [supplier\_ssl](#module\_supplier\_ssl) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.26/terraform-ssl.zip | n/a | | [upsert\_letter](#module\_upsert\_letter) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a | diff --git a/infrastructure/terraform/components/api/alarms.tf b/infrastructure/terraform/components/api/alarms.tf new file mode 100644 index 00000000..5d4ade60 --- /dev/null +++ b/infrastructure/terraform/components/api/alarms.tf @@ -0,0 +1,77 @@ +locals { + lambda_alarm_targets = { + authorizer_lambda = module.authorizer_lambda.function_name + get_letter = module.get_letter.function_name + get_letters = module.get_letters.function_name + get_letter_data = module.get_letter_data.function_name + get_status = module.get_status.function_name + patch_letter = module.patch_letter.function_name + post_letters = module.post_letters.function_name + post_mi = module.post_mi.function_name + upsert_letter = module.upsert_letter.function_name + letter_status_update = module.letter_status_update.function_name + letter_updates_transformer = module.letter_updates_transformer.function_name + mi_updates_transformer = module.mi_updates_transformer.function_name + } + + 
sqs_queue_names = { + letter_updates = { + name = module.sqs_letter_updates.sqs_queue_name + age_period_seconds = 900 + } + letter_status_updates = { + name = module.letter_status_updates_queue.sqs_queue_name + age_period_seconds = 900 + } + } +} + +module "lambda_alarms" { + for_each = local.lambda_alarm_targets + source = "../../modules/alarms/alarms-lambda" + + alarm_prefix = local.csi + function_name = each.value + log_group_name = "/aws/lambda/${each.value}" + tags = local.default_tags +} + +module "ddb_alarms_letters" { + source = "../../modules/alarms/alarms-ddb" + alarm_prefix = local.csi + table_name = aws_dynamodb_table.letters.name + tags = local.default_tags +} + +module "ddb_alarms_mi" { + source = "../../modules/alarms/alarms-ddb" + alarm_prefix = local.csi + table_name = aws_dynamodb_table.mi.name + tags = local.default_tags +} + +module "ddb_alarms_suppliers" { + source = "../../modules/alarms/alarms-ddb" + alarm_prefix = local.csi + table_name = aws_dynamodb_table.suppliers.name + tags = local.default_tags +} + +module "sqs_alarms" { + for_each = local.sqs_queue_names + source = "../../modules/alarms/alarms-sqs" + + alarm_prefix = local.csi + queue_name = each.value.name + dlq_queue_name = replace(each.value.name, "-queue", "-dlq") + age_period_seconds = each.value.age_period_seconds + tags = local.default_tags +} + +module "apigw_alarms" { + source = "../../modules/alarms/alarms-apigw" + alarm_prefix = local.csi + api_name = aws_api_gateway_rest_api.main.name + stage_name = aws_api_gateway_stage.main.stage_name + tags = local.default_tags +} diff --git a/infrastructure/terraform/components/api/module_authorizer_lambda.tf b/infrastructure/terraform/components/api/module_authorizer_lambda.tf index 7e3c94b8..c90a7d30 100644 --- a/infrastructure/terraform/components/api/module_authorizer_lambda.tf +++ b/infrastructure/terraform/components/api/module_authorizer_lambda.tf @@ -36,7 +36,7 @@ module "authorizer_lambda" { lambda_env_vars = { 
CLOUDWATCH_NAMESPACE = "/aws/api-gateway/supplier/alarms", - CLIENT_CERTIFICATE_EXPIRATION_ALERT_DAYS = 14, + CLIENT_CERTIFICATE_EXPIRATION_ALERT_DAYS = 30, APIM_SUPPLIER_ID_HEADER = "NHSD-Supplier-ID", SUPPLIERS_TABLE_NAME = aws_dynamodb_table.suppliers.name } diff --git a/infrastructure/terraform/modules/alarms/README.md b/infrastructure/terraform/modules/alarms/README.md new file mode 100644 index 00000000..df8c1f5c --- /dev/null +++ b/infrastructure/terraform/modules/alarms/README.md @@ -0,0 +1,19 @@ + + + + +## Requirements + +No requirements. +## Inputs + +No inputs. +## Modules + +No modules. +## Outputs + +No outputs. + + + diff --git a/infrastructure/terraform/modules/alarms/alarms-apigw/README.md b/infrastructure/terraform/modules/alarms/alarms-apigw/README.md new file mode 100644 index 00000000..d1de73b5 --- /dev/null +++ b/infrastructure/terraform/modules/alarms/alarms-apigw/README.md @@ -0,0 +1,34 @@ + + + + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.9.0 | +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [alarm\_prefix](#input\_alarm\_prefix) | n/a | `string` | n/a | yes | +| [api\_name](#input\_api\_name) | n/a | `string` | n/a | yes | +| [error\_5xx\_evaluation\_periods](#input\_error\_5xx\_evaluation\_periods) | n/a | `number` | `1` | no | +| [error\_5xx\_period\_seconds](#input\_error\_5xx\_period\_seconds) | n/a | `number` | `60` | no | +| [error\_5xx\_threshold](#input\_error\_5xx\_threshold) | n/a | `number` | `0` | no | +| [latency\_anomaly\_sensitivity](#input\_latency\_anomaly\_sensitivity) | n/a | `number` | `2` | no | +| [latency\_datapoints\_to\_alarm](#input\_latency\_datapoints\_to\_alarm) | n/a | `number` | `3` | no | +| [latency\_evaluation\_periods](#input\_latency\_evaluation\_periods) | n/a | `number` | `5` | no | +| [latency\_period\_seconds](#input\_latency\_period\_seconds) | n/a | `number` | `60` 
| no | +| [latency\_threshold\_ms](#input\_latency\_threshold\_ms) | n/a | `number` | `29000` | no | +| [stage\_name](#input\_stage\_name) | n/a | `string` | n/a | yes | +| [tags](#input\_tags) | n/a | `map(string)` | `{}` | no | +## Modules + +No modules. +## Outputs + +No outputs. + + + diff --git a/infrastructure/terraform/modules/alarms/alarms-apigw/main.tf b/infrastructure/terraform/modules/alarms/alarms-apigw/main.tf new file mode 100644 index 00000000..4c376944 --- /dev/null +++ b/infrastructure/terraform/modules/alarms/alarms-apigw/main.tf @@ -0,0 +1,87 @@ +locals { + api_dimensions = { + ApiName = var.api_name + Stage = var.stage_name + } +} + +resource "aws_cloudwatch_metric_alarm" "five_xx" { + alarm_name = "${var.alarm_prefix}-apigw-5xx" + alarm_description = "RELIABILITY: API Gateway 5xx responses" + + namespace = "AWS/ApiGateway" + metric_name = "5XXError" + statistic = "Sum" + period = var.error_5xx_period_seconds + + evaluation_periods = var.error_5xx_evaluation_periods + threshold = var.error_5xx_threshold + comparison_operator = "GreaterThanThreshold" + treat_missing_data = "notBreaching" + + dimensions = local.api_dimensions + + actions_enabled = false + alarm_actions = [] + ok_actions = [] + insufficient_data_actions = [] + tags = var.tags +} + +resource "aws_cloudwatch_metric_alarm" "latency_threshold" { + alarm_name = "${var.alarm_prefix}-apigw-latency-threshold" + alarm_description = "RELIABILITY: API Gateway latency above threshold" + + namespace = "AWS/ApiGateway" + metric_name = "Latency" + statistic = "Average" + period = var.latency_period_seconds + + evaluation_periods = var.latency_evaluation_periods + threshold = var.latency_threshold_ms + comparison_operator = "GreaterThanThreshold" + treat_missing_data = "notBreaching" + + dimensions = local.api_dimensions + + actions_enabled = false + alarm_actions = [] + ok_actions = [] + insufficient_data_actions = [] + tags = var.tags +} + +resource "aws_cloudwatch_metric_alarm" 
"latency_anomaly" { + alarm_name = "${var.alarm_prefix}-apigw-latency-anomaly" + alarm_description = "RELIABILITY: API Gateway latency anomaly" + comparison_operator = "GreaterThanUpperThreshold" + evaluation_periods = var.latency_evaluation_periods + datapoints_to_alarm = var.latency_datapoints_to_alarm + threshold_metric_id = "ad1" + treat_missing_data = "notBreaching" + + actions_enabled = false + alarm_actions = [] + ok_actions = [] + insufficient_data_actions = [] + tags = var.tags + + metric_query { + id = "m1" + metric { + metric_name = "Latency" + namespace = "AWS/ApiGateway" + stat = "Average" + period = var.latency_period_seconds + dimensions = local.api_dimensions + } + return_data = true + } + + metric_query { + id = "ad1" + expression = "ANOMALY_DETECTION_BAND(m1, ${var.latency_anomaly_sensitivity})" + label = "Latency (expected)" + return_data = true + } +} diff --git a/infrastructure/terraform/modules/alarms/alarms-apigw/variables.tf b/infrastructure/terraform/modules/alarms/alarms-apigw/variables.tf new file mode 100644 index 00000000..70909ad7 --- /dev/null +++ b/infrastructure/terraform/modules/alarms/alarms-apigw/variables.tf @@ -0,0 +1,56 @@ +variable "alarm_prefix" { + type = string +} + +variable "api_name" { + type = string +} + +variable "stage_name" { + type = string +} + +variable "tags" { + type = map(string) + default = {} +} + +variable "error_5xx_threshold" { + type = number + default = 0 +} + +variable "error_5xx_period_seconds" { + type = number + default = 60 +} + +variable "error_5xx_evaluation_periods" { + type = number + default = 1 +} + +variable "latency_threshold_ms" { + type = number + default = 29000 +} + +variable "latency_period_seconds" { + type = number + default = 60 +} + +variable "latency_evaluation_periods" { + type = number + default = 5 +} + +variable "latency_datapoints_to_alarm" { + type = number + default = 3 +} + +variable "latency_anomaly_sensitivity" { + type = number + default = 2 +} diff --git 
a/infrastructure/terraform/modules/alarms/alarms-apigw/versions.tf b/infrastructure/terraform/modules/alarms/alarms-apigw/versions.tf new file mode 100644 index 00000000..f8dc86e9 --- /dev/null +++ b/infrastructure/terraform/modules/alarms/alarms-apigw/versions.tf @@ -0,0 +1,9 @@ + +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + } + } + required_version = ">= 1.9.0" +} diff --git a/infrastructure/terraform/modules/alarms/alarms-ddb/README.md b/infrastructure/terraform/modules/alarms/alarms-ddb/README.md new file mode 100644 index 00000000..b9c3b0c1 --- /dev/null +++ b/infrastructure/terraform/modules/alarms/alarms-ddb/README.md @@ -0,0 +1,29 @@ + + + + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.9.0 | +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [alarm\_prefix](#input\_alarm\_prefix) | n/a | `string` | n/a | yes | +| [evaluation\_periods](#input\_evaluation\_periods) | n/a | `number` | `1` | no | +| [period\_seconds](#input\_period\_seconds) | n/a | `number` | `60` | no | +| [read\_throttle\_threshold](#input\_read\_throttle\_threshold) | n/a | `number` | `0` | no | +| [table\_name](#input\_table\_name) | n/a | `string` | n/a | yes | +| [tags](#input\_tags) | n/a | `map(string)` | `{}` | no | +| [write\_throttle\_threshold](#input\_write\_throttle\_threshold) | n/a | `number` | `0` | no | +## Modules + +No modules. +## Outputs + +No outputs. 
+ + + diff --git a/infrastructure/terraform/modules/alarms/alarms-ddb/main.tf b/infrastructure/terraform/modules/alarms/alarms-ddb/main.tf new file mode 100644 index 00000000..a7a046aa --- /dev/null +++ b/infrastructure/terraform/modules/alarms/alarms-ddb/main.tf @@ -0,0 +1,45 @@ +resource "aws_cloudwatch_metric_alarm" "read_throttle" { + alarm_name = "${var.alarm_prefix}-ddb-${var.table_name}-read-throttle" + alarm_description = "RELIABILITY: DynamoDB read throttling" + + namespace = "AWS/DynamoDB" + metric_name = "ReadThrottleEvents" + statistic = "Sum" + period = var.period_seconds + + evaluation_periods = var.evaluation_periods + threshold = var.read_throttle_threshold + comparison_operator = "GreaterThanThreshold" + treat_missing_data = "notBreaching" + + dimensions = { TableName = var.table_name } + + actions_enabled = false + alarm_actions = [] + ok_actions = [] + insufficient_data_actions = [] + tags = var.tags +} + +resource "aws_cloudwatch_metric_alarm" "write_throttle" { + alarm_name = "${var.alarm_prefix}-ddb-${var.table_name}-write-throttle" + alarm_description = "RELIABILITY: DynamoDB write throttling" + + namespace = "AWS/DynamoDB" + metric_name = "WriteThrottleEvents" + statistic = "Sum" + period = var.period_seconds + + evaluation_periods = var.evaluation_periods + threshold = var.write_throttle_threshold + comparison_operator = "GreaterThanThreshold" + treat_missing_data = "notBreaching" + + dimensions = { TableName = var.table_name } + + actions_enabled = false + alarm_actions = [] + ok_actions = [] + insufficient_data_actions = [] + tags = var.tags +} diff --git a/infrastructure/terraform/modules/alarms/alarms-ddb/variables.tf b/infrastructure/terraform/modules/alarms/alarms-ddb/variables.tf new file mode 100644 index 00000000..3895d21e --- /dev/null +++ b/infrastructure/terraform/modules/alarms/alarms-ddb/variables.tf @@ -0,0 +1,32 @@ +variable "alarm_prefix" { + type = string +} + +variable "table_name" { + type = string +} + +variable "tags" 
{ + type = map(string) + default = {} +} + +variable "period_seconds" { + type = number + default = 60 +} + +variable "evaluation_periods" { + type = number + default = 1 +} + +variable "read_throttle_threshold" { + type = number + default = 0 +} + +variable "write_throttle_threshold" { + type = number + default = 0 +} diff --git a/infrastructure/terraform/modules/alarms/alarms-ddb/versions.tf b/infrastructure/terraform/modules/alarms/alarms-ddb/versions.tf new file mode 100644 index 00000000..f8dc86e9 --- /dev/null +++ b/infrastructure/terraform/modules/alarms/alarms-ddb/versions.tf @@ -0,0 +1,9 @@ + +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + } + } + required_version = ">= 1.9.0" +} diff --git a/infrastructure/terraform/modules/alarms/alarms-lambda/README.md b/infrastructure/terraform/modules/alarms/alarms-lambda/README.md new file mode 100644 index 00000000..a865cb79 --- /dev/null +++ b/infrastructure/terraform/modules/alarms/alarms-lambda/README.md @@ -0,0 +1,36 @@ + + + + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.9.0 | +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [alarm\_prefix](#input\_alarm\_prefix) | n/a | `string` | n/a | yes | +| [enable\_error\_log\_metric](#input\_enable\_error\_log\_metric) | n/a | `bool` | `true` | no | +| [error\_log\_evaluation\_periods](#input\_error\_log\_evaluation\_periods) | n/a | `number` | `1` | no | +| [error\_log\_metric\_filter\_pattern](#input\_error\_log\_metric\_filter\_pattern) | n/a | `string` | `"{ ($.level = \"50\" \|\| $.level = \"error\") && $.environment = * }"` | no | +| [error\_log\_metric\_name\_prefix](#input\_error\_log\_metric\_name\_prefix) | n/a | `string` | `"LambdaErrorLogs-"` | no | +| [error\_log\_metric\_namespace](#input\_error\_log\_metric\_namespace) | n/a | `string` | `"Custom/LambdaErrorLogs"` | no | +| [error\_log\_threshold](#input\_error\_log\_threshold) | n/a | 
`number` | `0` | no | +| [errors\_threshold](#input\_errors\_threshold) | n/a | `number` | `0` | no | +| [evaluation\_periods](#input\_evaluation\_periods) | n/a | `number` | `1` | no | +| [function\_name](#input\_function\_name) | n/a | `string` | n/a | yes | +| [log\_group\_name](#input\_log\_group\_name) | n/a | `string` | `""` | no | +| [period\_seconds](#input\_period\_seconds) | n/a | `number` | `300` | no | +| [tags](#input\_tags) | n/a | `map(string)` | `{}` | no | +| [throttles\_threshold](#input\_throttles\_threshold) | n/a | `number` | `0` | no | +## Modules + +No modules. +## Outputs + +No outputs. + + + diff --git a/infrastructure/terraform/modules/alarms/alarms-lambda/main.tf b/infrastructure/terraform/modules/alarms/alarms-lambda/main.tf new file mode 100644 index 00000000..033d1798 --- /dev/null +++ b/infrastructure/terraform/modules/alarms/alarms-lambda/main.tf @@ -0,0 +1,80 @@ +resource "aws_cloudwatch_metric_alarm" "errors" { + alarm_name = "${var.alarm_prefix}-lambda-${var.function_name}-errors" + alarm_description = "ERROR: Lambda errors" + + namespace = "AWS/Lambda" + metric_name = "Errors" + statistic = "Sum" + period = var.period_seconds + + evaluation_periods = var.evaluation_periods + threshold = var.errors_threshold + comparison_operator = "GreaterThanThreshold" + treat_missing_data = "notBreaching" + + dimensions = { FunctionName = var.function_name } + + actions_enabled = false + alarm_actions = [] + ok_actions = [] + insufficient_data_actions = [] + tags = var.tags +} + +resource "aws_cloudwatch_metric_alarm" "throttles" { + alarm_name = "${var.alarm_prefix}-lambda-${var.function_name}-throttles" + alarm_description = "RELIABILITY: Lambda throttles" + + namespace = "AWS/Lambda" + metric_name = "Throttles" + statistic = "Sum" + period = var.period_seconds + + evaluation_periods = var.evaluation_periods + threshold = var.throttles_threshold + comparison_operator = "GreaterThanThreshold" + treat_missing_data = "notBreaching" + + 
dimensions = { FunctionName = var.function_name } + + actions_enabled = false + alarm_actions = [] + ok_actions = [] + insufficient_data_actions = [] + tags = var.tags +} + +resource "aws_cloudwatch_log_metric_filter" "error_logs" { + count = var.enable_error_log_metric ? 1 : 0 + name = "${var.alarm_prefix}-lambda-${var.function_name}-error-logs" + log_group_name = var.log_group_name != "" ? var.log_group_name : "/aws/lambda/${var.function_name}" + pattern = var.error_log_metric_filter_pattern + + metric_transformation { + name = "${var.error_log_metric_name_prefix}${var.function_name}" + namespace = var.error_log_metric_namespace + value = "1" + } +} + +resource "aws_cloudwatch_metric_alarm" "error_logs" { + count = var.enable_error_log_metric ? 1 : 0 + alarm_name = "${var.alarm_prefix}-lambda-${var.function_name}-error-logs" + alarm_description = "ERROR: Lambda error logs detected" + + namespace = var.error_log_metric_namespace + metric_name = "${var.error_log_metric_name_prefix}${var.function_name}" + statistic = "Sum" + period = var.period_seconds + + evaluation_periods = var.error_log_evaluation_periods + threshold = var.error_log_threshold + comparison_operator = "GreaterThanThreshold" + treat_missing_data = "notBreaching" + + actions_enabled = false + alarm_actions = [] + ok_actions = [] + insufficient_data_actions = [] + tags = var.tags +} diff --git a/infrastructure/terraform/modules/alarms/alarms-lambda/variables.tf b/infrastructure/terraform/modules/alarms/alarms-lambda/variables.tf new file mode 100644 index 00000000..5da36fd5 --- /dev/null +++ b/infrastructure/terraform/modules/alarms/alarms-lambda/variables.tf @@ -0,0 +1,67 @@ +variable "alarm_prefix" { + type = string +} + +variable "function_name" { + type = string +} + +variable "log_group_name" { + type = string + default = "" +} + +variable "tags" { + type = map(string) + default = {} +} + +variable "period_seconds" { + type = number + default = 300 +} + +variable "evaluation_periods" { + type = number + default = 1 +} + +variable "errors_threshold" { + type 
= number + default = 0 +} + +variable "throttles_threshold" { + type = number + default = 0 +} + +variable "enable_error_log_metric" { + type = bool + default = true +} + +variable "error_log_metric_namespace" { + type = string + default = "Custom/LambdaErrorLogs" +} + +variable "error_log_metric_name_prefix" { + type = string + default = "LambdaErrorLogs-" +} + +variable "error_log_metric_filter_pattern" { + type = string + default = "{ ($.level = \"50\" || $.level = \"error\") && $.environment = * }" +} + +variable "error_log_threshold" { + type = number + default = 0 +} + +variable "error_log_evaluation_periods" { + type = number + default = 1 +} diff --git a/infrastructure/terraform/modules/alarms/alarms-lambda/versions.tf b/infrastructure/terraform/modules/alarms/alarms-lambda/versions.tf new file mode 100644 index 00000000..f8dc86e9 --- /dev/null +++ b/infrastructure/terraform/modules/alarms/alarms-lambda/versions.tf @@ -0,0 +1,9 @@ + +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + } + } + required_version = ">= 1.9.0" +} diff --git a/infrastructure/terraform/modules/alarms/alarms-sqs/README.md b/infrastructure/terraform/modules/alarms/alarms-sqs/README.md new file mode 100644 index 00000000..b02b320d --- /dev/null +++ b/infrastructure/terraform/modules/alarms/alarms-sqs/README.md @@ -0,0 +1,31 @@ + + + + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.9.0 | +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [age\_anomaly\_datapoints\_to\_alarm](#input\_age\_anomaly\_datapoints\_to\_alarm) | n/a | `number` | `3` | no | +| [age\_anomaly\_evaluation\_periods](#input\_age\_anomaly\_evaluation\_periods) | n/a | `number` | `5` | no | +| [age\_anomaly\_sensitivity](#input\_age\_anomaly\_sensitivity) | n/a | `number` | `3` | no | +| [age\_period\_seconds](#input\_age\_period\_seconds) | n/a | `number` | `60` | no | 
+| [alarm\_prefix](#input\_alarm\_prefix) | n/a | `string` | n/a | yes | +| [dlq\_queue\_name](#input\_dlq\_queue\_name) | n/a | `string` | `null` | no | +| [dlq\_visible\_threshold](#input\_dlq\_visible\_threshold) | n/a | `number` | `0` | no | +| [queue\_name](#input\_queue\_name) | n/a | `string` | n/a | yes | +| [tags](#input\_tags) | n/a | `map(string)` | `{}` | no | +## Modules + +No modules. +## Outputs + +No outputs. + + + diff --git a/infrastructure/terraform/modules/alarms/alarms-sqs/main.tf b/infrastructure/terraform/modules/alarms/alarms-sqs/main.tf new file mode 100644 index 00000000..1a706973 --- /dev/null +++ b/infrastructure/terraform/modules/alarms/alarms-sqs/main.tf @@ -0,0 +1,62 @@ +locals { + queue_dimensions = { QueueName = var.queue_name } +} + +resource "aws_cloudwatch_metric_alarm" "age_anomaly" { + alarm_name = "${var.alarm_prefix}-sqs-${var.queue_name}-age-anomaly" + alarm_description = "RELIABILITY: SQS oldest message age anomaly" + comparison_operator = "GreaterThanUpperThreshold" + evaluation_periods = var.age_anomaly_evaluation_periods + datapoints_to_alarm = var.age_anomaly_datapoints_to_alarm + threshold_metric_id = "ad1" + treat_missing_data = "notBreaching" + + actions_enabled = false + alarm_actions = [] + ok_actions = [] + insufficient_data_actions = [] + tags = var.tags + + metric_query { + id = "m1" + metric { + metric_name = "ApproximateAgeOfOldestMessage" + namespace = "AWS/SQS" + stat = "Maximum" + period = var.age_period_seconds + dimensions = local.queue_dimensions + } + return_data = true + } + + metric_query { + id = "ad1" + expression = "ANOMALY_DETECTION_BAND(m1, ${var.age_anomaly_sensitivity})" + label = "AgeOfOldestMessage (expected)" + return_data = true + } +} + +resource "aws_cloudwatch_metric_alarm" "dlq_depth" { + count = var.dlq_queue_name == null ? 
0 : 1 + alarm_name = "${var.alarm_prefix}-sqs-${var.dlq_queue_name}-dlq-depth" + alarm_description = "RELIABILITY: SQS DLQ has messages" + + namespace = "AWS/SQS" + metric_name = "ApproximateNumberOfMessagesVisible" + statistic = "Sum" + period = 60 + + evaluation_periods = 1 + threshold = var.dlq_visible_threshold + comparison_operator = "GreaterThanThreshold" + treat_missing_data = "notBreaching" + + dimensions = { QueueName = var.dlq_queue_name } + + actions_enabled = false + alarm_actions = [] + ok_actions = [] + insufficient_data_actions = [] + tags = var.tags +} diff --git a/infrastructure/terraform/modules/alarms/alarms-sqs/variables.tf b/infrastructure/terraform/modules/alarms/alarms-sqs/variables.tf new file mode 100644 index 00000000..762c15ea --- /dev/null +++ b/infrastructure/terraform/modules/alarms/alarms-sqs/variables.tf @@ -0,0 +1,42 @@ +variable "alarm_prefix" { + type = string +} + +variable "queue_name" { + type = string +} + +variable "dlq_queue_name" { + type = string + default = null +} + +variable "tags" { + type = map(string) + default = {} +} + +variable "age_period_seconds" { + type = number + default = 60 +} + +variable "age_anomaly_sensitivity" { + type = number + default = 3 +} + +variable "age_anomaly_evaluation_periods" { + type = number + default = 5 +} + +variable "age_anomaly_datapoints_to_alarm" { + type = number + default = 3 +} + +variable "dlq_visible_threshold" { + type = number + default = 0 +} diff --git a/infrastructure/terraform/modules/alarms/alarms-sqs/versions.tf b/infrastructure/terraform/modules/alarms/alarms-sqs/versions.tf new file mode 100644 index 00000000..f8dc86e9 --- /dev/null +++ b/infrastructure/terraform/modules/alarms/alarms-sqs/versions.tf @@ -0,0 +1,9 @@ + +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + } + } + required_version = ">= 1.9.0" +} diff --git a/lambdas/authorizer/src/__tests__/index.test.ts b/lambdas/authorizer/src/__tests__/index.test.ts index 4020b55e..a3f2fc9f 
100644 --- a/lambdas/authorizer/src/__tests__/index.test.ts +++ b/lambdas/authorizer/src/__tests__/index.test.ts @@ -17,7 +17,7 @@ const mockedDeps: jest.Mocked = { } as unknown as pino.Logger, env: { CLOUDWATCH_NAMESPACE: "cloudwatch-namespace", - CLIENT_CERTIFICATE_EXPIRATION_ALERT_DAYS: 14, + CLIENT_CERTIFICATE_EXPIRATION_ALERT_DAYS: 30, APIM_SUPPLIER_ID_HEADER: "NHSD-Supplier-ID", } as unknown as EnvVars, supplierRepo: { @@ -56,10 +56,11 @@ describe("Authorizer Lambda Function", () => { }); describe("Certificate expiry check", () => { + const currentDate = new Date("2025-11-01T14:19:00Z"); beforeEach(() => { jest .useFakeTimers({ doNotFake: ["nextTick"] }) - .setSystemTime(new Date("2025-11-03T14:19:00Z")); + .setSystemTime(currentDate); }); afterEach(() => { @@ -81,7 +82,7 @@ describe("Authorizer Lambda Function", () => { it("Should log CloudWatch metric when the certificate expiry threshold is reached", async () => { mockEvent.requestContext.identity.clientCert = buildCertWithExpiry( - "2025-11-17T14:19:00Z", + "2025-12-01T14:19:00Z", ); const handler = createAuthorizerHandler(mockedDeps); @@ -92,7 +93,7 @@ describe("Authorizer Lambda Function", () => { expect(mockedInfo.mock.calls.map((call) => call[0])).toContain( JSON.stringify({ _aws: { - Timestamp: 1_762_179_540_000, + Timestamp: currentDate.getTime(), CloudWatchMetrics: [ { Namespace: "cloudwatch-namespace", @@ -108,7 +109,7 @@ describe("Authorizer Lambda Function", () => { ], }, SUBJECT_DN: "CN=test-subject", - NOT_AFTER: "2025-11-17T14:19:00Z", + NOT_AFTER: "2025-12-01T14:19:00Z", "apim-client-certificate-near-expiry": 1, }), ); @@ -116,7 +117,7 @@ describe("Authorizer Lambda Function", () => { it("Should not log CloudWatch metric when the certificate expiry threshold is not yet reached", async () => { mockEvent.requestContext.identity.clientCert = buildCertWithExpiry( - "2025-11-18T14:19:00Z", + "2026-01-01T14:19:00Z", ); const handler = createAuthorizerHandler(mockedDeps);