diff --git a/terraform/account-wide-infrastructure/README.md b/terraform/account-wide-infrastructure/README.md index 7e8881a6d..96ed581d4 100644 --- a/terraform/account-wide-infrastructure/README.md +++ b/terraform/account-wide-infrastructure/README.md @@ -134,9 +134,11 @@ If deploying the EC2 set up to a new environment, these steps need to be followe aws ssm start-session --target --document-name AWS-StartPortForwardingSession --parameters "localPortNumber=13389,portNumber=3389" ``` -2. Install Athena ODBC driver and Power BI personal on premises gateway -3. Configure ODBC driver to connect to relevant Athena instance and log in to the gateway using NHS email -4. Log into power bi and test the refresh on the relevant data sources +2. Install Athena ODBC driver and Power BI standard on premises gateway +3. Configure ODBC driver to connect to relevant Athena instance +4. Log in to the gateway using NHS email, name the cluster nhsd-nrlf-{env}--reporting-gw +5. Log in to Power BI, navigate to Manage Connections and Gateways in settings and set up Athena connector with authentication method: Anonymous and privacy level: Private +6. Set dataset to point to this gateway, define schedule as needed ## Tear down account wide resources diff --git a/terraform/account-wide-infrastructure/dev/athena.tf b/terraform/account-wide-infrastructure/dev/athena.tf index cb7e78a97..77cc82148 100644 --- a/terraform/account-wide-infrastructure/dev/athena.tf +++ b/terraform/account-wide-infrastructure/dev/athena.tf @@ -1,5 +1,7 @@ module "dev-athena" { + count = var.enable_reporting ? 
1 : 0 source = "../modules/athena" name_prefix = "nhsd-nrlf--dev" target_bucket_name = module.dev-glue.target_bucket_name + glue_database = module.dev-glue.glue_database } diff --git a/terraform/account-wide-infrastructure/dev/ec2.tf b/terraform/account-wide-infrastructure/dev/ec2.tf index 2ed6a2246..bfe9ede22 100644 --- a/terraform/account-wide-infrastructure/dev/ec2.tf +++ b/terraform/account-wide-infrastructure/dev/ec2.tf @@ -1,4 +1,5 @@ module "vpc" { + count = var.enable_reporting && var.enable_powerbi_auto_push ? 1 : 0 source = "../modules/vpc" vpc_cidr_block = var.vpc_cidr_block enable_dns_hostnames = var.enable_dns_hostnames @@ -8,16 +9,17 @@ module "vpc" { name_prefix = "nhsd-nrlf--dev" } -module "powerbi_gw_instance_v2" { - source = "../modules/ec2" - use_custom_ami = true - instance_type = var.instance_type - name_prefix = "nhsd-nrlf--dev-powerbi-gw-v2" +module "powerbi_gw_instance" { + count = var.enable_reporting && var.enable_powerbi_auto_push ? 1 : 0 + source = "../modules/powerbi-gw-ec2" + use_custom_ami = var.use_powerbi_gw_custom_ami + instance_type = var.powerbi_gw_instance_type + name_prefix = "nhsd-nrlf--dev-powerbi-gw" target_bucket_arn = module.dev-glue.target_bucket_arn glue_kms_key_arn = module.dev-glue.aws_kms_key_arn - athena_kms_key_arn = module.dev-athena.kms_key_arn - athena_bucket_arn = module.dev-athena.bucket_arn + athena_kms_key_arn = module.dev-athena[0].kms_key_arn + athena_bucket_arn = module.dev-athena[0].bucket_arn - subnet_id = module.vpc.private_subnet_id - security_groups = [module.vpc.powerbi_gw_security_group_id] + subnet_id = module.vpc[0].private_subnet_id + security_groups = [module.vpc[0].powerbi_gw_security_group_id] } diff --git a/terraform/account-wide-infrastructure/dev/glue.tf b/terraform/account-wide-infrastructure/dev/glue.tf index e8fbd713a..9f52c1f9a 100644 --- a/terraform/account-wide-infrastructure/dev/glue.tf +++ b/terraform/account-wide-infrastructure/dev/glue.tf @@ -1,4 +1,5 @@ module "dev-glue" { + 
is_enabled = var.enable_reporting source = "../modules/glue" name_prefix = "nhsd-nrlf--dev" python_version = 3 diff --git a/terraform/account-wide-infrastructure/dev/vars.tf b/terraform/account-wide-infrastructure/dev/vars.tf index dcbbfd78b..adb15d24a 100644 --- a/terraform/account-wide-infrastructure/dev/vars.tf +++ b/terraform/account-wide-infrastructure/dev/vars.tf @@ -14,6 +14,12 @@ variable "devsandbox_api_domain_name" { default = "dev-sandbox.api.record-locator.dev.national.nhs.uk" } +variable "enable_reporting" { + type = bool + description = "Enable account-wide reporting processes in the dev account" + default = true +} + variable "aws_azs" { type = string description = "AWS Availability Zones" @@ -44,14 +50,20 @@ variable "vpc_private_subnets_cidr_block" { default = "10.0.1.0/24" } -variable "instance_type" { +variable "enable_powerbi_auto_push" { + type = bool + description = "Enable automatic pushing of info into PowerBI" + default = true +} + +variable "powerbi_gw_instance_type" { type = string - description = "Type for EC2 Instance" + description = "Type for PowerBI GW EC2 Instance" default = "t2.micro" } -variable "use_custom_ami" { +variable "use_powerbi_gw_custom_ami" { type = bool - description = "Use custom image" - default = false + description = "Use custom image for PowerBI GW instance" + default = true } diff --git a/terraform/account-wide-infrastructure/modules/athena/athena.tf b/terraform/account-wide-infrastructure/modules/athena/athena.tf index d111611e5..b907541be 100644 --- a/terraform/account-wide-infrastructure/modules/athena/athena.tf +++ b/terraform/account-wide-infrastructure/modules/athena/athena.tf @@ -16,3 +16,17 @@ resource "aws_athena_workgroup" "athena" { } } + +resource "aws_athena_named_query" "rep_consumer" { + name = "rep_consumer" + workgroup = aws_athena_workgroup.athena.id + database = var.glue_database + query = file("${path.module}/sql/rep_consumer.sql") +} + +resource "aws_athena_named_query" "rep_producer" { + 
name = "rep_producer" + workgroup = aws_athena_workgroup.athena.id + database = var.glue_database + query = file("${path.module}/sql/rep_producer.sql") +} diff --git a/terraform/account-wide-infrastructure/modules/athena/sql/rep_consumer.sql b/terraform/account-wide-infrastructure/modules/athena/sql/rep_consumer.sql new file mode 100644 index 000000000..7fa4095d0 --- /dev/null +++ b/terraform/account-wide-infrastructure/modules/athena/sql/rep_consumer.sql @@ -0,0 +1,118 @@ +CREATE OR REPLACE VIEW "rep_consumer" AS +WITH + cc AS ( + SELECT + time + , event_timestamp + , date + , host + , event_log_reference + , event_level + , event_location + , event_message + , event_service + , event_function_request_id + , event_correlation_id + , event_xray_trace_id + , event_pointer_types + , COALESCE("event_headers_nhsd-end-user-organisation-ods", event_metadata_ods_code) user_ods + FROM + consumer_countdocumentreference +) +, cr AS ( + SELECT + time + , event_timestamp + , date + , host + , event_log_reference + , event_level + , event_location + , event_message + , event_service + , event_function_request_id + , event_correlation_id + , event_xray_trace_id + , event_pointer_types + , COALESCE("event_headers_nhsd-end-user-organisation-ods", event_metadata_ods_code) user_ods + FROM + consumer_readdocumentreference +) +, cs AS ( + SELECT + time + , event_timestamp + , date + , host + , event_log_reference + , event_level + , event_location + , event_message + , event_service + , event_function_request_id + , event_correlation_id + , event_xray_trace_id + , event_pointer_types + , COALESCE("event_headers_nhsd-end-user-organisation-ods", event_metadata_ods_code) user_ods + FROM + consumer_searchdocumentreference +) +, csp AS ( + SELECT + time + , event_timestamp + , date + , host + , event_log_reference + , event_level + , event_location + , event_message + , event_service + , event_function_request_id + , event_correlation_id + , event_xray_trace_id + , event_pointer_types + , 
COALESCE("event_headers_nhsd-end-user-organisation-ods", event_metadata_ods_code) user_ods + FROM + consumer_searchpostdocumentreference +) +, base AS ( + SELECT * + FROM + cc +UNION SELECT * + FROM + cr +UNION SELECT * + FROM + cs +UNION SELECT * + FROM + csp +) +, ods_codes AS ( + SELECT DISTINCT + user_ods + , event_xray_trace_id + FROM + base + WHERE (user_ods IS NOT NULL) +) +SELECT + time +, event_timestamp +, date +, host +, event_log_reference +, event_level +, event_location +, event_message +, event_service +, event_function_request_id +, b.event_correlation_id +, b.event_xray_trace_id +, event_pointer_types +, oc.user_ods +FROM + (base b +LEFT JOIN ods_codes oc ON (b.event_xray_trace_id = oc.event_xray_trace_id)) diff --git a/terraform/account-wide-infrastructure/modules/athena/sql/rep_producer.sql b/terraform/account-wide-infrastructure/modules/athena/sql/rep_producer.sql new file mode 100644 index 000000000..9c294b257 --- /dev/null +++ b/terraform/account-wide-infrastructure/modules/athena/sql/rep_producer.sql @@ -0,0 +1,184 @@ +CREATE OR REPLACE VIEW "rep_producer" AS +WITH + pc AS ( + SELECT + time + , event_timestamp + , date + , host + , event_log_reference + , event_level + , event_location + , event_message + , event_service + , event_function_request_id + , event_correlation_id + , event_xray_trace_id + , event_pointer_types + , COALESCE("event_headers_nhsd-end-user-organisation-ods", event_metadata_ods_code) user_ods + FROM + producer_createdocumentreference +) +, pd AS ( + SELECT + time + , event_timestamp + , date + , host + , event_log_reference + , event_level + , event_location + , event_message + , event_service + , event_function_request_id + , event_correlation_id + , event_xray_trace_id + , event_pointer_types + , COALESCE("event_headers_nhsd-end-user-organisation-ods", event_metadata_ods_code) user_ods + FROM + producer_deletedocumentreference +) +, pr AS ( + SELECT + time + , event_timestamp + , date + , host + , event_log_reference 
+ , event_level + , event_location + , event_message + , event_service + , event_function_request_id + , event_correlation_id + , event_xray_trace_id + , event_pointer_types + , COALESCE("event_headers_nhsd-end-user-organisation-ods", event_metadata_ods_code) user_ods + FROM + producer_readdocumentreference +) +, ps AS ( + SELECT + time + , event_timestamp + , date + , host + , event_log_reference + , event_level + , event_location + , event_message + , event_service + , event_function_request_id + , event_correlation_id + , event_xray_trace_id + , event_pointer_types + , COALESCE("event_headers_nhsd-end-user-organisation-ods", event_metadata_ods_code) user_ods + FROM + producer_searchdocumentreference +) +, psp AS ( + SELECT + time + , event_timestamp + , date + , host + , event_log_reference + , event_level + , event_location + , event_message + , event_service + , event_function_request_id + , event_correlation_id + , event_xray_trace_id + , event_pointer_types + , COALESCE("event_headers_nhsd-end-user-organisation-ods", event_metadata_ods_code) user_ods + FROM + producer_searchpostdocumentreference +) +, pu AS ( + SELECT + time + , event_timestamp + , date + , host + , event_log_reference + , event_level + , event_location + , event_message + , event_service + , event_function_request_id + , event_correlation_id + , event_xray_trace_id + , event_pointer_types + , COALESCE("event_headers_nhsd-end-user-organisation-ods", event_metadata_ods_code) user_ods + FROM + producer_updatedocumentreference +) +, pus AS ( + SELECT + time + , event_timestamp + , date + , host + , event_log_reference + , event_level + , event_location + , event_message + , event_service + , event_function_request_id + , event_correlation_id + , event_xray_trace_id + , event_pointer_types + , COALESCE("event_headers_nhsd-end-user-organisation-ods", event_metadata_ods_code) user_ods + FROM + producer_upsertdocumentreference +) +, base AS ( + SELECT * + FROM + pc +UNION SELECT * + FROM + pd 
+UNION SELECT * + FROM + pr +UNION SELECT * + FROM + ps +UNION SELECT * + FROM + psp +UNION SELECT * + FROM + pu +UNION SELECT * + FROM + pus +) +, ods_codes AS ( + SELECT DISTINCT + user_ods + , event_xray_trace_id + FROM + base + WHERE (user_ods IS NOT NULL) +) +SELECT + time +, event_timestamp +, date +, host +, event_log_reference +, event_level +, event_location +, event_message +, event_service +, event_function_request_id +, b.event_correlation_id +, b.event_xray_trace_id +, event_pointer_types +, oc.user_ods +FROM + (base b +LEFT JOIN ods_codes oc ON (b.event_xray_trace_id = oc.event_xray_trace_id)) diff --git a/terraform/account-wide-infrastructure/modules/athena/vars.tf b/terraform/account-wide-infrastructure/modules/athena/vars.tf index d09d6f65c..a1d447972 100644 --- a/terraform/account-wide-infrastructure/modules/athena/vars.tf +++ b/terraform/account-wide-infrastructure/modules/athena/vars.tf @@ -1,8 +1,3 @@ -variable "database" { - description = "What the db will be called" - default = "nrl_reporting" -} - variable "name_prefix" { type = string description = "The prefix to apply to all resources in the module." 
@@ -11,3 +6,8 @@ variable "name_prefix" { variable "target_bucket_name" { type = string } + +variable "glue_database" { + type = string + description = "The Glue database in use" +} diff --git a/terraform/account-wide-infrastructure/modules/backup-source/sns.tf b/terraform/account-wide-infrastructure/modules/backup-source/sns.tf index f91b26b96..c0bb6827c 100644 --- a/terraform/account-wide-infrastructure/modules/backup-source/sns.tf +++ b/terraform/account-wide-infrastructure/modules/backup-source/sns.tf @@ -26,9 +26,9 @@ data "aws_iam_policy_document" "allow_backup_to_sns" { } resource "aws_sns_topic_subscription" "aws_backup_notifications_email_target" { - for_each = var.notification_target_email_addresses + count = length(var.notification_target_email_addresses) topic_arn = aws_sns_topic.backup.arn protocol = "email" - endpoint = each.value + endpoint = var.notification_target_email_addresses[count.index] filter_policy = jsonencode({ "State" : [{ "anything-but" : "COMPLETED" }] }) } diff --git a/terraform/account-wide-infrastructure/modules/backup-source/variables.tf b/terraform/account-wide-infrastructure/modules/backup-source/variables.tf index 72cc612f6..e2c5985d3 100644 --- a/terraform/account-wide-infrastructure/modules/backup-source/variables.tf +++ b/terraform/account-wide-infrastructure/modules/backup-source/variables.tf @@ -10,7 +10,8 @@ variable "environment_name" { variable "notification_target_email_addresses" { description = "The email addresses to which backup notifications will be sent via SNS." 
- type = set(string) + type = list(string) + sensitive = true default = [] } diff --git a/terraform/account-wide-infrastructure/modules/ec2/outputs.tf b/terraform/account-wide-infrastructure/modules/ec2/outputs.tf deleted file mode 100644 index 10e5a82d1..000000000 --- a/terraform/account-wide-infrastructure/modules/ec2/outputs.tf +++ /dev/null @@ -1,7 +0,0 @@ -output "instance_id" { - value = aws_instance.web.id -} - -output "public_ip" { - value = aws_instance.web.public_ip -} diff --git a/terraform/account-wide-infrastructure/modules/glue/glue.tf b/terraform/account-wide-infrastructure/modules/glue/glue.tf index e36433b5c..64a3c5d99 100644 --- a/terraform/account-wide-infrastructure/modules/glue/glue.tf +++ b/terraform/account-wide-infrastructure/modules/glue/glue.tf @@ -1,46 +1,50 @@ # Create Glue Data Catalog Database resource "aws_glue_catalog_database" "log_database" { + count = var.is_enabled ? 1 : 0 + name = "${var.name_prefix}-reporting" location_uri = "${aws_s3_bucket.target-data-bucket.id}/" } # Create Glue Crawler resource "aws_glue_crawler" "log_crawler" { + count = var.is_enabled ? 
1 : 0 + name = "${var.name_prefix}-log-crawler" - database_name = aws_glue_catalog_database.log_database.name + database_name = aws_glue_catalog_database.log_database[0].name role = aws_iam_role.glue_service_role.name s3_target { - path = "${aws_s3_bucket.target-data-bucket.id}/consumer_countDocumentReference/" + path = "s3://${aws_s3_bucket.target-data-bucket.id}/consumer_countDocumentReference/" } s3_target { - path = "${aws_s3_bucket.target-data-bucket.id}/consumer_readDocumentReference/" + path = "s3://${aws_s3_bucket.target-data-bucket.id}/consumer_readDocumentReference/" } s3_target { - path = "${aws_s3_bucket.target-data-bucket.id}/consumer_searchDocumentReference/" + path = "s3://${aws_s3_bucket.target-data-bucket.id}/consumer_searchDocumentReference/" } s3_target { - path = "${aws_s3_bucket.target-data-bucket.id}/consumer_searchPostDocumentReference/" + path = "s3://${aws_s3_bucket.target-data-bucket.id}/consumer_searchPostDocumentReference/" } s3_target { - path = "${aws_s3_bucket.target-data-bucket.id}/producer_createDocumentReference/" + path = "s3://${aws_s3_bucket.target-data-bucket.id}/producer_createDocumentReference/" } s3_target { - path = "${aws_s3_bucket.target-data-bucket.id}/producer_deleteDocumentReference/" + path = "s3://${aws_s3_bucket.target-data-bucket.id}/producer_deleteDocumentReference/" } s3_target { - path = "${aws_s3_bucket.target-data-bucket.id}/producer_readDocumentReference/" + path = "s3://${aws_s3_bucket.target-data-bucket.id}/producer_readDocumentReference/" } s3_target { - path = "${aws_s3_bucket.target-data-bucket.id}/producer_searchDocumentReference/" + path = "s3://${aws_s3_bucket.target-data-bucket.id}/producer_searchDocumentReference/" } s3_target { - path = "${aws_s3_bucket.target-data-bucket.id}/producer_searchPostDocumentReference/" + path = "s3://${aws_s3_bucket.target-data-bucket.id}/producer_searchPostDocumentReference/" } s3_target { - path = 
"${aws_s3_bucket.target-data-bucket.id}/producer_updateDocumentReference/" + path = "s3://${aws_s3_bucket.target-data-bucket.id}/producer_updateDocumentReference/" } s3_target { - path = "${aws_s3_bucket.target-data-bucket.id}/producer_upsertDocumentReference/" + path = "s3://${aws_s3_bucket.target-data-bucket.id}/producer_upsertDocumentReference/" } schema_change_policy { delete_behavior = "LOG" @@ -53,14 +57,18 @@ resource "aws_glue_crawler" "log_crawler" { }) } resource "aws_glue_trigger" "log_trigger" { + count = var.is_enabled ? 1 : 0 + name = "${var.name_prefix}-org-report-trigger" type = "ON_DEMAND" actions { - crawler_name = aws_glue_crawler.log_crawler.name + crawler_name = aws_glue_crawler.log_crawler[0].name } } resource "aws_glue_job" "glue_job" { + count = var.is_enabled ? 1 : 0 + name = "${var.name_prefix}-glue-job" role_arn = aws_iam_role.glue_service_role.arn description = "Transfer logs from source to bucket" diff --git a/terraform/account-wide-infrastructure/modules/glue/iam.tf b/terraform/account-wide-infrastructure/modules/glue/iam.tf index 097fe0386..e24851ef6 100644 --- a/terraform/account-wide-infrastructure/modules/glue/iam.tf +++ b/terraform/account-wide-infrastructure/modules/glue/iam.tf @@ -98,7 +98,8 @@ data "aws_iam_policy_document" "glue_service" { ] effect = "Allow" resources = [ - "arn:aws:iam::*:role/AWSGlueServiceRole*" + "arn:aws:iam::*:role/AWSGlueServiceRole*", + aws_iam_role.glue_service_role.arn, ] } } diff --git a/terraform/account-wide-infrastructure/modules/glue/outputs.tf b/terraform/account-wide-infrastructure/modules/glue/outputs.tf index dfc12029b..0c5547fa0 100644 --- a/terraform/account-wide-infrastructure/modules/glue/outputs.tf +++ b/terraform/account-wide-infrastructure/modules/glue/outputs.tf @@ -21,3 +21,7 @@ output "aws_kms_key_arn" { output "glue_crawler_name" { value = "s3//${aws_s3_bucket.source-data-bucket.id}/" } + +output "glue_database" { + value = var.is_enabled ? 
aws_glue_catalog_database.log_database[0].name : "" +} diff --git a/terraform/account-wide-infrastructure/modules/glue/s3.tf b/terraform/account-wide-infrastructure/modules/glue/s3.tf index 14f7b9824..cff5d1274 100644 --- a/terraform/account-wide-infrastructure/modules/glue/s3.tf +++ b/terraform/account-wide-infrastructure/modules/glue/s3.tf @@ -56,7 +56,7 @@ resource "aws_s3_bucket_lifecycle_configuration" "source-data-bucket-lifecycle" rule { - id = "bucket-versioning-rule" + id = "object-auto-delete-rule" status = "Enabled" expiration { @@ -68,7 +68,7 @@ resource "aws_s3_bucket_lifecycle_configuration" "source-data-bucket-lifecycle" resource "aws_s3_bucket_versioning" "source-data-bucket-versioning" { bucket = aws_s3_bucket.source-data-bucket.id versioning_configuration { - status = "Enabled" + status = "Suspended" } } @@ -180,10 +180,10 @@ resource "aws_s3_bucket_public_access_block" "code-bucket-public-access-block" { } resource "aws_s3_object" "script" { - bucket = aws_s3_bucket.code-bucket.bucket - key = "main.py" - source = "${path.module}/src/main.py" - etag = filemd5("${path.module}/src/main.py") + bucket = aws_s3_bucket.code-bucket.bucket + key = "main.py" + source = "${path.module}/src/main.py" + source_hash = filemd5("${path.module}/src/main.py") } data "archive_file" "python" { @@ -194,8 +194,8 @@ data "archive_file" "python" { } resource "aws_s3_object" "zip" { - bucket = aws_s3_bucket.code-bucket.bucket - key = "src.zip" - source = data.archive_file.python.output_path - etag = filemd5(data.archive_file.python.output_path) + bucket = aws_s3_bucket.code-bucket.bucket + key = "src.zip" + source = data.archive_file.python.output_path + source_hash = filemd5(data.archive_file.python.output_path) } diff --git a/terraform/account-wide-infrastructure/modules/glue/src/instances.py b/terraform/account-wide-infrastructure/modules/glue/src/instances.py deleted file mode 100644 index a94bcc459..000000000 --- 
a/terraform/account-wide-infrastructure/modules/glue/src/instances.py +++ /dev/null @@ -1,32 +0,0 @@ -import logging - -from awsglue.context import GlueContext -from pyspark.sql import SparkSession - - -class GlueContextSingleton: - """Singleton for GlueContext and SparkSession""" - - _instance = None - - def __new__(cls, spark_context): - if not cls._instance: - cls._instance = super().__new__(cls) - cls._instance.spark = SparkSession.builder.config( - "spark.sql.caseSensitive", "true" - ).getOrCreate() - cls._instance.context = GlueContext(spark_context) - return cls._instance - - -class LoggerSingleton: - """Singleton for logger""" - - _instance = None - - def __new__(cls): - if not cls._instance: - cls._instance = super().__new__(cls) - cls._instance.logger = logging.getLogger("ETLLogger") - cls._instance.logger.setLevel(logging.INFO) - return cls._instance diff --git a/terraform/account-wide-infrastructure/modules/glue/src/main.py b/terraform/account-wide-infrastructure/modules/glue/src/main.py index 64f616b59..712be8877 100644 --- a/terraform/account-wide-infrastructure/modules/glue/src/main.py +++ b/terraform/account-wide-infrastructure/modules/glue/src/main.py @@ -1,18 +1,28 @@ +import logging import sys +from awsglue.context import GlueContext from awsglue.utils import getResolvedOptions from pipeline import LogPipeline -from pyspark.context import SparkContext +from pyspark.sql import SparkSession from transformations import dtype_conversion, rename_cols, resolve_dupes +# Spark and Glue Context initialization +spark = SparkSession.builder.config("spark.sql.caseSensitive", "true").getOrCreate() +glue_context = GlueContext(spark.sparkContext) + +# Logger setup +MSG_FORMAT = "%(asctime)s %(levelname)s %(name)s: %(message)s" +DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S" +logging.basicConfig(format=MSG_FORMAT, datefmt=DATETIME_FORMAT) +logger = logging.getLogger("ETLLogger") +logger.setLevel(logging.INFO) + # Get arguments from AWS Glue job args = getResolvedOptions( 
sys.argv, ["job_name", "source_path", "target_path", "partition_cols"] ) -# Start Glue context -sc = SparkContext() - partition_cols = args["partition_cols"].split(",") if "partition_cols" in args else [] host_prefixes = [ @@ -31,7 +41,9 @@ # Initialize ETL process etl_job = LogPipeline( - spark_context=sc, + glue_context=glue_context, + spark=spark, + logger=logger, source_path=args["source_path"], target_path=args["target_path"], host_prefixes=host_prefixes, diff --git a/terraform/account-wide-infrastructure/modules/glue/src/pipeline.py b/terraform/account-wide-infrastructure/modules/glue/src/pipeline.py index f018911ad..7c1b1597c 100644 --- a/terraform/account-wide-infrastructure/modules/glue/src/pipeline.py +++ b/terraform/account-wide-infrastructure/modules/glue/src/pipeline.py @@ -1,13 +1,14 @@ import time import boto3 -from instances import GlueContextSingleton, LoggerSingleton class LogPipeline: def __init__( self, - spark_context, + glue_context, + spark, + logger, source_path, target_path, host_prefixes, @@ -16,9 +17,9 @@ def __init__( transformations=[], ): """Initialize Glue context, Spark session, logger, and paths""" - self.glue_context = GlueContextSingleton(spark_context).context - self.spark = GlueContextSingleton(spark_context).spark - self.logger = LoggerSingleton().logger + self.glue_context = glue_context + self.spark = spark + self.logger = logger self.source_path = source_path self.target_path = target_path self.host_prefixes = host_prefixes @@ -72,7 +73,12 @@ def extract_dynamic(self): if last_runtime: data[name] = self.glue_context.create_dynamic_frame.from_options( connection_type="s3", - connection_options={"paths": [self.source_path], "recurse": True}, + connection_options={ + "paths": [self.source_path], + "recurse": True, + "groupFiles": "inPartition", + "groupSize": "134217728", + }, format="json", ).filter( f=lambda x, n=name: (x["host"].endswith(n)) @@ -82,7 +88,12 @@ def extract_dynamic(self): else: data[name] = 
self.glue_context.create_dynamic_frame.from_options( connection_type="s3", - connection_options={"paths": [self.source_path], "recurse": True}, + connection_options={ + "paths": [self.source_path], + "recurse": True, + "groupFiles": "inPartition", + "groupSize": "134217728", + }, format="json", ).filter(f=lambda x, n=name: x["host"].endswith(n)) @@ -96,7 +107,7 @@ def transform(self, dataframe, name): ) for transformation in self.transformations: self.logger.info(f"Applying transformation: {transformation.__name__}") - dataframe = transformation(dataframe) + dataframe = transformation(dataframe, self.logger) return dataframe def load(self, data): @@ -105,7 +116,10 @@ def load(self, data): for name, dataframe in data.items(): name = name.replace("--", "_") try: - dataframe.coalesce(1).write.mode("append").partitionBy( + self.logger.info( + f"Attempting to load dataframe {name} into {self.target_path}{name}" + ) + dataframe.write.mode("append").partitionBy( *self.partition_cols ).parquet(f"{self.target_path}{name}") except: diff --git a/terraform/account-wide-infrastructure/modules/glue/src/transformations.py b/terraform/account-wide-infrastructure/modules/glue/src/transformations.py index 3b23d6515..425e5a061 100644 --- a/terraform/account-wide-infrastructure/modules/glue/src/transformations.py +++ b/terraform/account-wide-infrastructure/modules/glue/src/transformations.py @@ -1,3 +1,6 @@ +from collections import defaultdict +from functools import reduce + from pyspark.sql.functions import ( coalesce, col, @@ -12,34 +15,41 @@ from pyspark.sql.types import NullType -def resolve_dupes(df): - drop = [] - for i in range(len(df.columns)): - for j in range(i + 1, len(df.columns)): - if df.columns[i].lower() == df.columns[j].lower(): - df = df.withColumn( - df.columns[i].lower() + "_", - when( - col(df.columns[i]).isNull() | col(df.columns[j]).isNull(), - coalesce(col(df.columns[i]), col(df.columns[j])), - ).otherwise( - concat(col(df.columns[i]), lit(","), 
col(df.columns[j])) - ), - ) - drop.extend([df.columns[i], df.columns[j]]) - df = df.drop(*drop) +def resolve_dupes(df, logger): + column_groups = defaultdict(list) + for column_name in df.columns: + normalised_name = column_name.lower().rstrip("_") + column_groups[normalised_name].append(column_name) - return df + final_select_exprs = [] + for lower_name, original_names in column_groups.items(): + + if len(original_names) == 1: + final_select_exprs.append(col(original_names[0]).alias(lower_name)) + else: + logger.info(f"Resolving duplicate group '{lower_name}': {original_names}") + + merge_logic = lambda col1, col2: when( + col1.isNull() | col2.isNull(), coalesce(col1, col2) + ).otherwise(concat(col1, lit(", "), col2)) + + merged_column_expr = reduce(merge_logic, [col(c) for c in original_names]) + + final_select_exprs.append(merged_column_expr.alias(lower_name)) + + return df.select(*final_select_exprs) -def rename_cols(df): +def rename_cols(df, logger): + logger.info("Replacing '.' with '_'") for col_name in df.columns: df = df.withColumnRenamed(col_name, col_name.replace(".", "_")) return df -def dtype_conversion(df): +def dtype_conversion(df, logger): try: + logger.info("Formatting event_timestamp") df = ( df.withColumn( "event_timestamp_cleaned", @@ -56,13 +66,15 @@ def dtype_conversion(df): ) df = df.drop("event_timestamp_cleaned") - except: - ... 
+ except Exception as e: + logger.info(f"Failed formatting of timestamp column with error: {e}") + logger.info("Handling Null Type columns") select_exprs = [] for column_name in df.columns: column_type = df.schema[column_name].dataType if isinstance(column_type, NullType): + logger.info(f"Converting {column_name} to string") select_exprs.append(col(column_name).cast("string").alias(column_name)) else: select_exprs.append(col(column_name)) diff --git a/terraform/account-wide-infrastructure/modules/glue/vars.tf b/terraform/account-wide-infrastructure/modules/glue/vars.tf index cb03095bf..ae3281303 100644 --- a/terraform/account-wide-infrastructure/modules/glue/vars.tf +++ b/terraform/account-wide-infrastructure/modules/glue/vars.tf @@ -22,3 +22,9 @@ variable "code_bucket" { description = "S3 bucket for Glue job scripts" default = "code-bucket" } + +variable "is_enabled" { + type = bool + description = "Flag to enable or disable the Glue module" + default = true +} diff --git a/terraform/account-wide-infrastructure/modules/ec2/data.tf b/terraform/account-wide-infrastructure/modules/powerbi-gw-ec2/data.tf similarity index 100% rename from terraform/account-wide-infrastructure/modules/ec2/data.tf rename to terraform/account-wide-infrastructure/modules/powerbi-gw-ec2/data.tf diff --git a/terraform/account-wide-infrastructure/modules/ec2/ec2.tf b/terraform/account-wide-infrastructure/modules/powerbi-gw-ec2/ec2.tf similarity index 84% rename from terraform/account-wide-infrastructure/modules/ec2/ec2.tf rename to terraform/account-wide-infrastructure/modules/powerbi-gw-ec2/ec2.tf index e5016f13f..eb106e8fc 100644 --- a/terraform/account-wide-infrastructure/modules/ec2/ec2.tf +++ b/terraform/account-wide-infrastructure/modules/powerbi-gw-ec2/ec2.tf @@ -1,11 +1,16 @@ -resource "aws_instance" "web" { +resource "aws_instance" "powerbi_gw" { associate_public_ip_address = false iam_instance_profile = aws_iam_instance_profile.powerbi_profile.name ami = local.selected_ami_id 
instance_type = var.instance_type key_name = aws_key_pair.ec2_key_pair.key_name subnet_id = var.subnet_id - security_groups = var.security_groups + vpc_security_group_ids = var.security_groups + + root_block_device { + volume_size = 40 + volume_type = "gp2" + } user_data = file("${path.module}/scripts/user_data.tpl") diff --git a/terraform/account-wide-infrastructure/modules/ec2/iam.tf b/terraform/account-wide-infrastructure/modules/powerbi-gw-ec2/iam.tf similarity index 100% rename from terraform/account-wide-infrastructure/modules/ec2/iam.tf rename to terraform/account-wide-infrastructure/modules/powerbi-gw-ec2/iam.tf diff --git a/terraform/account-wide-infrastructure/modules/ec2/locals.tf b/terraform/account-wide-infrastructure/modules/powerbi-gw-ec2/locals.tf similarity index 100% rename from terraform/account-wide-infrastructure/modules/ec2/locals.tf rename to terraform/account-wide-infrastructure/modules/powerbi-gw-ec2/locals.tf diff --git a/terraform/account-wide-infrastructure/modules/ec2/scripts/user_data.tpl b/terraform/account-wide-infrastructure/modules/powerbi-gw-ec2/scripts/user_data.tpl similarity index 100% rename from terraform/account-wide-infrastructure/modules/ec2/scripts/user_data.tpl rename to terraform/account-wide-infrastructure/modules/powerbi-gw-ec2/scripts/user_data.tpl diff --git a/terraform/account-wide-infrastructure/modules/ec2/vars.tf b/terraform/account-wide-infrastructure/modules/powerbi-gw-ec2/vars.tf similarity index 100% rename from terraform/account-wide-infrastructure/modules/ec2/vars.tf rename to terraform/account-wide-infrastructure/modules/powerbi-gw-ec2/vars.tf diff --git a/terraform/account-wide-infrastructure/modules/vpc/vpc.tf b/terraform/account-wide-infrastructure/modules/vpc/vpc.tf index b1fa293ea..b2678bc6c 100644 --- a/terraform/account-wide-infrastructure/modules/vpc/vpc.tf +++ b/terraform/account-wide-infrastructure/modules/vpc/vpc.tf @@ -55,7 +55,6 @@ resource "aws_route_table" "private_rt" { } resource 
"aws_eip" "natgw-ip" { - domain = "vpc" } resource "aws_nat_gateway" "nat" { diff --git a/terraform/account-wide-infrastructure/prod/athena.tf b/terraform/account-wide-infrastructure/prod/athena.tf new file mode 100644 index 000000000..9242ddd5e --- /dev/null +++ b/terraform/account-wide-infrastructure/prod/athena.tf @@ -0,0 +1,7 @@ +module "prod-athena" { + count = var.enable_reporting ? 1 : 0 + source = "../modules/athena" + name_prefix = "nhsd-nrlf--prod" + target_bucket_name = module.prod-glue.target_bucket_name + glue_database = module.prod-glue.glue_database +} diff --git a/terraform/account-wide-infrastructure/prod/ec2.tf b/terraform/account-wide-infrastructure/prod/ec2.tf new file mode 100644 index 000000000..669b2d5be --- /dev/null +++ b/terraform/account-wide-infrastructure/prod/ec2.tf @@ -0,0 +1,25 @@ +module "vpc" { + count = var.enable_reporting && var.enable_powerbi_auto_push ? 1 : 0 + source = "../modules/vpc" + vpc_cidr_block = var.vpc_cidr_block + enable_dns_hostnames = var.enable_dns_hostnames + vpc_public_subnets_cidr_block = var.vpc_public_subnets_cidr_block + vpc_private_subnets_cidr_block = var.vpc_private_subnets_cidr_block + aws_azs = var.aws_azs + name_prefix = "nhsd-nrlf--prod" +} + +module "powerbi_gw_instance" { + count = var.enable_reporting && var.enable_powerbi_auto_push ? 
1 : 0 + source = "../modules/powerbi-gw-ec2" + use_custom_ami = false + instance_type = var.powerbi_gw_instance_type + name_prefix = "nhsd-nrlf--prod-powerbi-gw" + target_bucket_arn = module.prod-glue.target_bucket_arn + glue_kms_key_arn = module.prod-glue.aws_kms_key_arn + athena_kms_key_arn = module.prod-athena[0].kms_key_arn + athena_bucket_arn = module.prod-athena[0].bucket_arn + + subnet_id = module.vpc[0].private_subnet_id + security_groups = [module.vpc[0].powerbi_gw_security_group_id] +} diff --git a/terraform/account-wide-infrastructure/prod/glue.tf b/terraform/account-wide-infrastructure/prod/glue.tf new file mode 100644 index 000000000..34c7f540d --- /dev/null +++ b/terraform/account-wide-infrastructure/prod/glue.tf @@ -0,0 +1,6 @@ +module "prod-glue" { + is_enabled = var.enable_reporting + source = "../modules/glue" + name_prefix = "nhsd-nrlf--prod" + python_version = 3 +} diff --git a/terraform/account-wide-infrastructure/prod/vars.tf b/terraform/account-wide-infrastructure/prod/vars.tf index 3a6abb601..d999261f5 100644 --- a/terraform/account-wide-infrastructure/prod/vars.tf +++ b/terraform/account-wide-infrastructure/prod/vars.tf @@ -8,3 +8,63 @@ variable "prod_api_domain_name" { description = "The internal DNS name of the API Gateway for the prod environment" default = "prod.api.record-locator.national.nhs.uk" } + +variable "aws_azs" { + type = string + description = "AWS Availability Zones" + default = "eu-west-2a" } + +variable "enable_dns_hostnames" { + type = bool + description = "Enable DNS hostnames in VPC" + default = true +} + +variable "enable_reporting" { + type = bool + description = "Enable account-wide reporting processes in the prod account" + default = false +} + +variable "vpc_cidr_block" { + type = string + description = "Base CIDR Block for VPC" + default = "10.0.0.0/16" +} + +variable "vpc_public_subnets_cidr_block" { + type = string + description = "CIDR Block for Public Subnets in VPC" + default = "10.0.0.0/24" +} + +variable 
"vpc_private_subnets_cidr_block" { + type = string + description = "CIDR Block for Private Subnets in VPC" + default = "10.0.1.0/24" +} + +variable "instance_type" { + type = string + description = "Type for EC2 Instance" + default = "t2.micro" +} + +variable "enable_powerbi_auto_push" { + type = bool + description = "Enable automatic pushing of info into PowerBI" + default = false +} + +variable "powerbi_gw_instance_type" { + type = string + description = "Type for PowerBI GW EC2 Instance" + default = "t2.micro" +} + +variable "use_powerbi_gw_custom_ami" { + type = bool + description = "Use custom image for PowerBI GW instance" + default = true +} diff --git a/terraform/account-wide-infrastructure/test/athena.tf b/terraform/account-wide-infrastructure/test/athena.tf index b64111d44..c31f4f5af 100644 --- a/terraform/account-wide-infrastructure/test/athena.tf +++ b/terraform/account-wide-infrastructure/test/athena.tf @@ -1,5 +1,7 @@ -module "int-athena" { +module "test-athena" { + count = var.enable_reporting ? 1 : 0 source = "../modules/athena" - name_prefix = "nhsd-nrlf--int" - target_bucket_name = module.int-glue.target_bucket_name + name_prefix = "nhsd-nrlf--test" + target_bucket_name = module.test-glue.target_bucket_name + glue_database = module.test-glue.glue_database } diff --git a/terraform/account-wide-infrastructure/test/ec2.tf b/terraform/account-wide-infrastructure/test/ec2.tf new file mode 100644 index 000000000..c7f1caf6e --- /dev/null +++ b/terraform/account-wide-infrastructure/test/ec2.tf @@ -0,0 +1,25 @@ +module "vpc" { + count = var.enable_reporting && var.enable_powerbi_auto_push ? 
1 : 0 + source = "../modules/vpc" + vpc_cidr_block = var.vpc_cidr_block + enable_dns_hostnames = var.enable_dns_hostnames + vpc_public_subnets_cidr_block = var.vpc_public_subnets_cidr_block + vpc_private_subnets_cidr_block = var.vpc_private_subnets_cidr_block + aws_azs = var.aws_azs + name_prefix = "nhsd-nrlf--test" +} + +module "powerbi_gw_instance" { + count = var.enable_reporting && var.enable_powerbi_auto_push ? 1 : 0 + source = "../modules/powerbi-gw-ec2" + use_custom_ami = var.use_powerbi_gw_custom_ami + instance_type = var.powerbi_gw_instance_type + name_prefix = "nhsd-nrlf--test-powerbi-gw" + target_bucket_arn = module.test-glue.target_bucket_arn + glue_kms_key_arn = module.test-glue.aws_kms_key_arn + athena_kms_key_arn = module.test-athena[0].kms_key_arn + athena_bucket_arn = module.test-athena[0].bucket_arn + + subnet_id = module.vpc[0].private_subnet_id + security_groups = [module.vpc[0].powerbi_gw_security_group_id] +} diff --git a/terraform/account-wide-infrastructure/test/glue.tf b/terraform/account-wide-infrastructure/test/glue.tf index 57e2e82e9..86e714de3 100644 --- a/terraform/account-wide-infrastructure/test/glue.tf +++ b/terraform/account-wide-infrastructure/test/glue.tf @@ -1,5 +1,6 @@ -module "int-glue" { +module "test-glue" { + is_enabled = var.enable_reporting source = "../modules/glue" - name_prefix = "nhsd-nrlf--int" + name_prefix = "nhsd-nrlf--test" python_version = 3 } diff --git a/terraform/account-wide-infrastructure/test/vars.tf b/terraform/account-wide-infrastructure/test/vars.tf index be722db5e..144929512 100644 --- a/terraform/account-wide-infrastructure/test/vars.tf +++ b/terraform/account-wide-infrastructure/test/vars.tf @@ -28,3 +28,57 @@ variable "ref_api_domain_name" { description = "The internal DNS name of the API Gateway for the ref environment" default = "ref.api.record-locator.ref.national.nhs.uk" } + +variable "enable_reporting" { + type = bool + description = "Enable account-wide reporting processes in the test account" 
+ default = false +} + +variable "aws_azs" { + type = string + description = "AWS Availability Zones" + default = "eu-west-2a" +} + +variable "enable_dns_hostnames" { + type = bool + description = "Enable DNS hostnames in VPC" + default = true +} + +variable "vpc_cidr_block" { + type = string + description = "Base CIDR Block for VPC" + default = "10.0.0.0/16" +} + +variable "vpc_public_subnets_cidr_block" { + type = string + description = "CIDR Block for Public Subnets in VPC" + default = "10.0.0.0/24" +} + +variable "vpc_private_subnets_cidr_block" { + type = string + description = "CIDR Block for Private Subnets in VPC" + default = "10.0.1.0/24" +} + +variable "enable_powerbi_auto_push" { + type = bool + description = "Enable automatic pushing of info into PowerBI" + default = false +} + +variable "powerbi_gw_instance_type" { + type = string + description = "Type for PowerBI GW EC2 Instance" + default = "t2.micro" +} + +variable "use_powerbi_gw_custom_ami" { + type = bool + description = "Use custom image for PowerBI GW instance" + default = true +} diff --git a/terraform/infrastructure/data.tf b/terraform/infrastructure/data.tf index e2d2d23d0..926bd13d2 100644 --- a/terraform/infrastructure/data.tf +++ b/terraform/infrastructure/data.tf @@ -43,11 +43,9 @@ data "external" "current-info" { } data "aws_s3_bucket" "source-data-bucket" { - count = local.is_dev_env && !local.is_sandbox_env ? 1 : 0 - bucket = "${local.shared_prefix}-source-data-bucket" + bucket = "${local.account_prefix}-source-data-bucket" } data "aws_kms_key" "glue" { - count = local.is_dev_env && !local.is_sandbox_env ? 
1 : 0 - key_id = "alias/${local.shared_prefix}-glue" + key_id = "alias/${local.account_prefix}-glue" } diff --git a/terraform/infrastructure/etc/dev.tfvars b/terraform/infrastructure/etc/dev.tfvars index 285015605..4ba4b986f 100644 --- a/terraform/infrastructure/etc/dev.tfvars +++ b/terraform/infrastructure/etc/dev.tfvars @@ -1,6 +1,8 @@ -account_name = "dev" +account_name = "dev" +aws_account_name = "dev" domain = "api.record-locator.dev.national.nhs.uk" public_domain = "internal-dev.api.service.nhs.uk" public_sandbox_domain = "internal-dev-sandbox.api.service.nhs.uk" log_retention_period = 90 +enable_reporting = true diff --git a/terraform/infrastructure/etc/int.tfvars b/terraform/infrastructure/etc/int.tfvars index 7c8b4b8ed..4baa2a936 100644 --- a/terraform/infrastructure/etc/int.tfvars +++ b/terraform/infrastructure/etc/int.tfvars @@ -1,8 +1,10 @@ -// TODO-NOW - Change this file name to int and update all references in codebase (and github repo config) -account_name = "int" +account_name = "int" +aws_account_name = "test" + domain = "api.record-locator.int.national.nhs.uk" deletion_protection = true public_domain = "int.api.service.nhs.uk" public_sandbox_domain = "sandbox.api.service.nhs.uk" log_retention_period = 90 +enable_reporting = false diff --git a/terraform/infrastructure/etc/prod.tfvars b/terraform/infrastructure/etc/prod.tfvars index 01ba96e83..4f9ca34e9 100644 --- a/terraform/infrastructure/etc/prod.tfvars +++ b/terraform/infrastructure/etc/prod.tfvars @@ -1,5 +1,8 @@ -account_name = "prod" +account_name = "prod" +aws_account_name = "prod" + domain = "api.record-locator.national.nhs.uk" public_domain = "api.service.nhs.uk" deletion_protection = true log_retention_period = 2192 +enable_reporting = false diff --git a/terraform/infrastructure/etc/qa.tfvars b/terraform/infrastructure/etc/qa.tfvars index 39e1ec44a..bfada691e 100644 --- a/terraform/infrastructure/etc/qa.tfvars +++ b/terraform/infrastructure/etc/qa.tfvars @@ -1,6 +1,8 @@ -account_name = 
"qa" +account_name = "qa" +aws_account_name = "test" domain = "qa.record-locator.national.nhs.uk" public_domain = "internal-qa.api.service.nhs.uk" public_sandbox_domain = "internal-qa-sandbox.api.service.nhs.uk" log_retention_period = 90 +enable_reporting = false diff --git a/terraform/infrastructure/etc/ref.tfvars b/terraform/infrastructure/etc/ref.tfvars index ec7a38035..9647baa00 100644 --- a/terraform/infrastructure/etc/ref.tfvars +++ b/terraform/infrastructure/etc/ref.tfvars @@ -1,4 +1,7 @@ -account_name = "ref" +account_name = "ref" +aws_account_name = "test" + domain = "api.record-locator.ref.national.nhs.uk" public_domain = "ref.api.service.nhs.uk" log_retention_period = 30 +enable_reporting = false diff --git a/terraform/infrastructure/firehose.tf b/terraform/infrastructure/firehose.tf index db063e6f4..b7a9dd370 100644 --- a/terraform/infrastructure/firehose.tf +++ b/terraform/infrastructure/firehose.tf @@ -1,14 +1,14 @@ module "firehose__processor" { - source = "./modules/firehose" - assume_account = local.aws_account_id - prefix = local.prefix - region = local.region - environment = local.environment - cloudwatch_kms_arn = module.kms__cloudwatch.kms_arn - splunk_environment = local.splunk_environment - splunk_index = local.splunk_index - destination = "splunk" - reporting_bucket_arn = local.reporting_bucket_arn - reporting_kms_arn = local.reporting_kms_arn - reporting_infra_toggle = local.is_dev_env && !local.is_sandbox_env + count = var.use_shared_resources ? 
1 : 0 + source = "./modules/firehose" + assume_account = local.aws_account_id + prefix = local.prefix + region = local.region + environment = local.environment + cloudwatch_kms_arn = module.kms__cloudwatch.kms_arn + splunk_environment = local.splunk_environment + splunk_index = local.splunk_index + destination = "splunk" + reporting_bucket_arn = local.reporting_bucket_arn + reporting_kms_arn = local.reporting_kms_arn } diff --git a/terraform/infrastructure/lambda.tf b/terraform/infrastructure/lambda.tf index 4658b4f78..c934452db 100644 --- a/terraform/infrastructure/lambda.tf +++ b/terraform/infrastructure/lambda.tf @@ -11,7 +11,7 @@ module "consumer__readDocumentReference" { PREFIX = "${local.prefix}--" ENVIRONMENT = local.environment POWERTOOLS_LOG_LEVEL = local.log_level - SPLUNK_INDEX = module.firehose__processor.splunk.index + SPLUNK_INDEX = local.splunk_index AUTH_STORE = local.auth_store_id TABLE_NAME = local.pointers_table_name } @@ -39,7 +39,7 @@ module "consumer__countDocumentReference" { ENVIRONMENT = local.environment AUTH_STORE = local.auth_store_id POWERTOOLS_LOG_LEVEL = local.log_level - SPLUNK_INDEX = module.firehose__processor.splunk.index + SPLUNK_INDEX = local.splunk_index TABLE_NAME = local.pointers_table_name } additional_policies = [ @@ -66,7 +66,7 @@ module "consumer__searchDocumentReference" { ENVIRONMENT = local.environment AUTH_STORE = local.auth_store_id POWERTOOLS_LOG_LEVEL = local.log_level - SPLUNK_INDEX = module.firehose__processor.splunk.index + SPLUNK_INDEX = local.splunk_index TABLE_NAME = local.pointers_table_name } additional_policies = [ @@ -93,7 +93,7 @@ module "consumer__searchPostDocumentReference" { ENVIRONMENT = local.environment AUTH_STORE = local.auth_store_id POWERTOOLS_LOG_LEVEL = local.log_level - SPLUNK_INDEX = module.firehose__processor.splunk.index + SPLUNK_INDEX = local.splunk_index TABLE_NAME = local.pointers_table_name } additional_policies = [ @@ -119,7 +119,7 @@ module "producer__createDocumentReference" { 
PREFIX = "${local.prefix}--" ENVIRONMENT = local.environment AUTH_STORE = local.auth_store_id - SPLUNK_INDEX = module.firehose__processor.splunk.index + SPLUNK_INDEX = local.splunk_index POWERTOOLS_LOG_LEVEL = local.log_level TABLE_NAME = local.pointers_table_name } @@ -148,7 +148,7 @@ module "producer__deleteDocumentReference" { ENVIRONMENT = local.environment AUTH_STORE = local.auth_store_id POWERTOOLS_LOG_LEVEL = local.log_level - SPLUNK_INDEX = module.firehose__processor.splunk.index + SPLUNK_INDEX = local.splunk_index TABLE_NAME = local.pointers_table_name } additional_policies = [ @@ -176,7 +176,7 @@ module "producer__readDocumentReference" { ENVIRONMENT = local.environment AUTH_STORE = local.auth_store_id POWERTOOLS_LOG_LEVEL = local.log_level - SPLUNK_INDEX = module.firehose__processor.splunk.index + SPLUNK_INDEX = local.splunk_index TABLE_NAME = local.pointers_table_name } additional_policies = [ @@ -203,7 +203,7 @@ module "producer__searchDocumentReference" { ENVIRONMENT = local.environment AUTH_STORE = local.auth_store_id POWERTOOLS_LOG_LEVEL = local.log_level - SPLUNK_INDEX = module.firehose__processor.splunk.index + SPLUNK_INDEX = local.splunk_index TABLE_NAME = local.pointers_table_name } additional_policies = [ @@ -230,7 +230,7 @@ module "producer__searchPostDocumentReference" { ENVIRONMENT = local.environment AUTH_STORE = local.auth_store_id POWERTOOLS_LOG_LEVEL = local.log_level - SPLUNK_INDEX = module.firehose__processor.splunk.index + SPLUNK_INDEX = local.splunk_index TABLE_NAME = local.pointers_table_name } additional_policies = [ @@ -257,7 +257,7 @@ module "producer__updateDocumentReference" { ENVIRONMENT = local.environment AUTH_STORE = local.auth_store_id POWERTOOLS_LOG_LEVEL = local.log_level - SPLUNK_INDEX = module.firehose__processor.splunk.index + SPLUNK_INDEX = local.splunk_index TABLE_NAME = local.pointers_table_name } additional_policies = [ @@ -285,7 +285,7 @@ module "producer__upsertDocumentReference" { ENVIRONMENT = 
local.environment AUTH_STORE = local.auth_store_id POWERTOOLS_LOG_LEVEL = local.log_level - SPLUNK_INDEX = module.firehose__processor.splunk.index + SPLUNK_INDEX = local.splunk_index TABLE_NAME = local.pointers_table_name } additional_policies = [ @@ -313,7 +313,7 @@ module "consumer__status" { ENVIRONMENT = local.environment AUTH_STORE = local.auth_store_id POWERTOOLS_LOG_LEVEL = local.log_level - SPLUNK_INDEX = module.firehose__processor.splunk.index + SPLUNK_INDEX = local.splunk_index DYNAMODB_TIMEOUT = local.dynamodb_timeout_seconds TABLE_NAME = local.pointers_table_name } @@ -342,7 +342,7 @@ module "producer__status" { ENVIRONMENT = local.environment AUTH_STORE = local.auth_store_id POWERTOOLS_LOG_LEVEL = local.log_level - SPLUNK_INDEX = module.firehose__processor.splunk.index + SPLUNK_INDEX = local.splunk_index DYNAMODB_TIMEOUT = local.dynamodb_timeout_seconds TABLE_NAME = local.pointers_table_name } diff --git a/terraform/infrastructure/locals.tf b/terraform/infrastructure/locals.tf index 74882da5a..957788553 100644 --- a/terraform/infrastructure/locals.tf +++ b/terraform/infrastructure/locals.tf @@ -4,6 +4,9 @@ locals { stack_name = terraform.workspace deletion_protection = var.deletion_protection prefix = "${local.project}--${local.stack_name}" + account_prefix = "${local.project}--${var.aws_account_name}" + + aws_account_id = data.aws_caller_identity.current.account_id kms = { deletion_window_in_days = 7 @@ -22,30 +25,22 @@ locals { dynamodb_timeout_seconds = "3" is_sandbox_env = length(regexall("-sandbox-", local.stack_name)) > 0 - is_dev_env = var.account_name == "dev" || var.account_name == "int" environment = local.is_sandbox_env ? "${var.account_name}-sandbox" : var.account_name shared_prefix = "${local.project}--${local.environment}" public_domain = local.is_sandbox_env ? var.public_sandbox_domain : var.public_domain - # Logic / vars for reporting - reporting_bucket_arn = local.is_dev_env && !local.is_sandbox_env ? 
data.aws_s3_bucket.source-data-bucket[0].arn : null - reporting_kms_arn = local.is_dev_env && !local.is_sandbox_env ? data.aws_kms_key.glue[0].arn : null - firehose_lambda_subscriptions = local.is_dev_env && !local.is_sandbox_env ? [ - module.firehose__processor.firehose_subscription, - module.firehose__processor.firehose_reporting_subscription - ] : [ - module.firehose__processor.firehose_subscription - ] - - # Logic / vars for splunk environment + reporting_bucket_arn = data.aws_s3_bucket.source-data-bucket.arn + reporting_kms_arn = data.aws_kms_key.glue.arn + firehose_lambda_subscriptions = var.use_shared_resources ? { + "splunk_subscription" : module.firehose__processor[0].firehose_subscription, + "reports_subscription" : module.firehose__processor[0].firehose_reporting_subscription + } : {} splunk_environment = local.is_sandbox_env ? "${var.account_name}sandbox" : var.account_name splunk_index = "aws_recordlocator_${local.splunk_environment}" log_level = var.account_name == "dev" || var.account_name == "qa" ? "DEBUG" : "INFO" - aws_account_id = data.aws_caller_identity.current.account_id - auth_store_id = var.use_shared_resources ? data.aws_s3_bucket.authorization-store[0].id : module.ephemeral-s3-permission-store[0].bucket_id auth_store_read_policy_arn = var.use_shared_resources ? 
data.aws_iam_policy.auth-store-read-policy[0].arn : module.ephemeral-s3-permission-store[0].bucket_read_policy_arn diff --git a/terraform/infrastructure/modules/api_gateway/api_gateway.tf b/terraform/infrastructure/modules/api_gateway/api_gateway.tf index b59636f69..40dca31cf 100644 --- a/terraform/infrastructure/modules/api_gateway/api_gateway.tf +++ b/terraform/infrastructure/modules/api_gateway/api_gateway.tf @@ -112,6 +112,7 @@ resource "aws_api_gateway_method_settings" "api_gateway_method_settings" { resource "aws_api_gateway_gateway_response" "api_access_denied" { rest_api_id = aws_api_gateway_rest_api.api_gateway_rest_api.id + status_code = "403" response_type = "ACCESS_DENIED" response_templates = { "application/json" = jsonencode({ diff --git a/terraform/infrastructure/modules/firehose/cloudwatch.tf b/terraform/infrastructure/modules/firehose/cloudwatch.tf index 86aff3fd2..5539dffa0 100644 --- a/terraform/infrastructure/modules/firehose/cloudwatch.tf +++ b/terraform/infrastructure/modules/firehose/cloudwatch.tf @@ -9,13 +9,11 @@ resource "aws_cloudwatch_log_stream" "firehose" { } resource "aws_cloudwatch_log_group" "firehose_reporting" { - count = var.reporting_infra_toggle ? 1 : 0 name = "/aws/kinesisfirehose/${var.prefix}-firehose-reporting" retention_in_days = local.cloudwatch.retention.days } resource "aws_cloudwatch_log_stream" "firehose_reporting" { - count = var.reporting_infra_toggle ? 
1 : 0 name = "${var.prefix}-firehose-reporting" - log_group_name = aws_cloudwatch_log_group.firehose_reporting[0].name + log_group_name = aws_cloudwatch_log_group.firehose_reporting.name } diff --git a/terraform/infrastructure/modules/firehose/kinesis.tf b/terraform/infrastructure/modules/firehose/kinesis.tf index 144b8fa29..530c9a636 100644 --- a/terraform/infrastructure/modules/firehose/kinesis.tf +++ b/terraform/infrastructure/modules/firehose/kinesis.tf @@ -58,7 +58,6 @@ resource "aws_kinesis_firehose_delivery_stream" "firehose" { } resource "aws_kinesis_firehose_delivery_stream" "reporting_stream" { - count = var.reporting_infra_toggle ? 1 : 0 name = "${var.prefix}--cloudwatch-reporting-delivery-stream" destination = "extended_s3" @@ -69,7 +68,7 @@ resource "aws_kinesis_firehose_delivery_stream" "reporting_stream" { buffering_interval = 600 processing_configuration { - enabled = "true" + enabled = true processors { type = "Decompression" @@ -92,8 +91,8 @@ resource "aws_kinesis_firehose_delivery_stream" "reporting_stream" { cloudwatch_logging_options { enabled = true - log_group_name = aws_cloudwatch_log_group.firehose_reporting[0].name - log_stream_name = aws_cloudwatch_log_stream.firehose_reporting[0].name + log_group_name = aws_cloudwatch_log_group.firehose_reporting.name + log_stream_name = aws_cloudwatch_log_stream.firehose_reporting.name } } } diff --git a/terraform/infrastructure/modules/firehose/locals.tf b/terraform/infrastructure/modules/firehose/locals.tf index 80a0f3367..92f9a796a 100644 --- a/terraform/infrastructure/modules/firehose/locals.tf +++ b/terraform/infrastructure/modules/firehose/locals.tf @@ -32,13 +32,13 @@ locals { } iam_firehose = { - cloudwatch_reporting_log_group_arn = var.reporting_infra_toggle ? aws_cloudwatch_log_group.firehose_reporting[0].arn : null - cloudwatch_reporting_log_stream_arn = var.reporting_infra_toggle ? aws_cloudwatch_log_stream.firehose_reporting[0].arn : null - reporting_s3_arn = var.reporting_infra_toggle ? 
"${var.reporting_bucket_arn}/*" : null + cloudwatch_reporting_log_group_arn = aws_cloudwatch_log_group.firehose_reporting.arn + cloudwatch_reporting_log_stream_arn = aws_cloudwatch_log_stream.firehose_reporting.arn + reporting_s3_arn = "${var.reporting_bucket_arn}/*" } iam_subscriptions = { - firehose_reporting_stream_arn = var.reporting_infra_toggle ? aws_kinesis_firehose_delivery_stream.reporting_stream[0].arn : null + firehose_reporting_stream_arn = aws_kinesis_firehose_delivery_stream.reporting_stream.arn } iam_kms_resources = compact([ diff --git a/terraform/infrastructure/modules/firehose/output.tf b/terraform/infrastructure/modules/firehose/output.tf index a0b594642..e48eba33d 100644 --- a/terraform/infrastructure/modules/firehose/output.tf +++ b/terraform/infrastructure/modules/firehose/output.tf @@ -11,12 +11,6 @@ output "delivery_stream" { } } -output "splunk" { - value = { - index = var.splunk_index - } -} - output "firehose_subscription" { value = { destination = { @@ -33,7 +27,7 @@ output "firehose_subscription" { } output "firehose_reporting_subscription" { - value = var.reporting_infra_toggle ? 
{ + value = { destination = { arn = local.iam_subscriptions.firehose_reporting_stream_arn } @@ -44,5 +38,5 @@ output "firehose_reporting_subscription" { # At least two items, and the first not any of INIT_START, START, END, REPORT pattern = "[first_item_on_this_log_line != \"INIT_START\" && first_item_on_this_log_line != \"START\" && first_item_on_this_log_line != \"END\" && first_item_on_this_log_line != \"REPORT\", everything_else_on_this_log_line]" } - } : null + } } diff --git a/terraform/infrastructure/modules/firehose/vars.tf b/terraform/infrastructure/modules/firehose/vars.tf index e98affd1d..4844674e2 100644 --- a/terraform/infrastructure/modules/firehose/vars.tf +++ b/terraform/infrastructure/modules/firehose/vars.tf @@ -35,6 +35,7 @@ variable "error_prefix" { default = "errors" } + variable "reporting_bucket_arn" { type = string default = null @@ -44,7 +45,3 @@ variable "reporting_kms_arn" { type = string default = null } - -variable "reporting_infra_toggle" { - type = bool -} diff --git a/terraform/infrastructure/modules/lambda/cloudwatch.tf b/terraform/infrastructure/modules/lambda/cloudwatch.tf index b95fc1bbf..f81f4e739 100644 --- a/terraform/infrastructure/modules/lambda/cloudwatch.tf +++ b/terraform/infrastructure/modules/lambda/cloudwatch.tf @@ -5,11 +5,11 @@ resource "aws_cloudwatch_log_group" "lambda_cloudwatch_log_group" { } resource "aws_cloudwatch_log_subscription_filter" "lambda_log_filter" { - name = "${aws_lambda_function.lambda_function.function_name}_filter" - log_group_name = aws_cloudwatch_log_group.lambda_cloudwatch_log_group.name + for_each = var.firehose_subscriptions - count = length(var.firehose_subscriptions) - role_arn = var.firehose_subscriptions[count.index].role.arn - destination_arn = var.firehose_subscriptions[count.index].destination.arn - filter_pattern = var.firehose_subscriptions[count.index].filter.pattern + name = "${aws_lambda_function.lambda_function.function_name}_${each.key}_filter" + log_group_name = 
aws_cloudwatch_log_group.lambda_cloudwatch_log_group.name + role_arn = each.value.role.arn + destination_arn = each.value.destination.arn + filter_pattern = each.value.filter.pattern } diff --git a/terraform/infrastructure/modules/lambda/vars.tf b/terraform/infrastructure/modules/lambda/vars.tf index 400cc1282..56f08599a 100644 --- a/terraform/infrastructure/modules/lambda/vars.tf +++ b/terraform/infrastructure/modules/lambda/vars.tf @@ -23,7 +23,9 @@ variable "additional_policies" { variable "handler" {} variable "firehose_subscriptions" { - default = [] + description = "The firehose subscriptions to attach to the lambda logs" + type = map(any) + default = {} } variable "vpc" { diff --git a/terraform/infrastructure/vars.tf b/terraform/infrastructure/vars.tf index 69ada2bb6..e6ca502fd 100644 --- a/terraform/infrastructure/vars.tf +++ b/terraform/infrastructure/vars.tf @@ -1,5 +1,12 @@ variable "account_name" { - type = string + type = string + description = "The name of the AWS environment in the account, e.g. dev, qa, int, prod" +} + +variable "aws_account_name" { + type = string + description = "The name of the AWS account, e.g. dev, test, prod" + default = "dev" } variable "assume_role_arn" { @@ -48,3 +55,9 @@ variable "log_retention_period" { default = 90 type = number } + +variable "enable_reporting" { + type = bool + description = "Enable reporting for this environment" + default = false +}