diff --git a/scripts/seed_nft_tables.py b/scripts/seed_nft_tables.py index 84c6e628f..e74b94d1b 100644 --- a/scripts/seed_nft_tables.py +++ b/scripts/seed_nft_tables.py @@ -1,12 +1,16 @@ +import csv from datetime import datetime, timedelta, timezone from itertools import cycle from math import gcd from random import shuffle -from typing import Any +from typing import Any, Iterator import boto3 import fire +# import json +import numpy as np + from nrlf.consumer.fhir.r4.model import DocumentReference from nrlf.core.constants import ( CATEGORY_ATTRIBUTES, @@ -145,7 +149,7 @@ def _populate_seed_table( px_with_pointers: int, pointers_per_px: float = 1.0, type_dists: dict[str, int] = DEFAULT_TYPE_DISTRIBUTIONS, - custodian_dists: dict[str, int] = DEFAULT_CUSTODIAN_DISTRIBUTIONS, + custodian_dists: dict[str, dict[str, int]] = DEFAULT_CUSTODIAN_DISTRIBUTIONS, ): """ Seeds a table with example data for non-functional testing. @@ -155,25 +159,41 @@ def _populate_seed_table( # set up iterations type_iter = _set_up_cyclical_iterator(type_dists) custodian_iters = _set_up_custodian_iterators(custodian_dists) - count_iter = _set_up_cyclical_iterator(DEFAULT_COUNT_DISTRIBUTIONS) + # count_iter = _set_up_cyclical_iterator(DEFAULT_COUNT_DISTRIBUTIONS) + count_iter = _get_pointer_count_poisson_distributions( + px_with_pointers, pointers_per_px + ) + # count_iter = _get_pointer_count_negbinom_distributions(px_with_pointers, pointers_per_px) testnum_cls = TestNhsNumbersIterator() testnum_iter = iter(testnum_cls) px_counter = 0 doc_ref_target = int(pointers_per_px * px_with_pointers) print( - f"Will upsert {doc_ref_target} test pointers for {px_with_pointers} patients." + f"Will upsert ~{doc_ref_target} test pointers for {px_with_pointers} patients." ) doc_ref_counter = 0 batch_counter = 0 + unprocessed_count = 0 + + pointer_data: list[list[str]] = [] start_time = datetime.now(tz=timezone.utc) - batch_upsert_items = [] - while px_counter <= px_with_pointers: + batch_upsert_items: list[dict[str, Any]] = [] + while px_counter < px_with_pointers: pointers_for_px = int(next(count_iter)) + if batch_counter + pointers_for_px > 25 or px_counter == px_with_pointers: - resource.batch_write_item(RequestItems={table_name: batch_upsert_items}) + response = resource.batch_write_item( + RequestItems={table_name: batch_upsert_items} + ) + + if response.get("UnprocessedItems"): + unprocessed_count += len( + response.get("UnprocessedItems").get(table_name, []) + ) + batch_upsert_items = [] batch_counter = 0 @@ -189,15 +209,36 @@ def _populate_seed_table( ) put_req = {"PutRequest": {"Item": pointer.model_dump()}} batch_upsert_items.append(put_req) + pointer_data.append( + [ + pointer.id, + pointer.type, + pointer.custodian, + pointer.nhs_number, + ] + ) px_counter += 1 + if px_counter % 1000 == 0: + print(".", end="", flush=True) + if px_counter % 100000 == 0: + print(f" {px_counter} patients processed ({doc_ref_counter} pointers).") + + print(" Done.") + end_time = datetime.now(tz=timezone.utc) print( - f"Created {doc_ref_counter} pointers in {timedelta.total_seconds(end_time - start_time)} seconds." + f"Created {doc_ref_counter} pointers in {timedelta.total_seconds(end_time - start_time)} seconds (unprocessed: {unprocessed_count})." ) + with open("./dist/seed-nft-pointers.csv", "w") as f: + writer = csv.writer(f) + writer.writerow(["pointer_id", "pointer_type", "custodian", "nhs_number"]) + writer.writerows(pointer_data) + print(f"Pointer data saved to ./dist/seed-nft-pointers.csv") # noqa + -def _set_up_cyclical_iterator(dists: dict[str, int]) -> iter: +def _set_up_cyclical_iterator(dists: dict[str, int]) -> Iterator[str]: """ Given a dict of values and their relative frequencies, returns an iterator that will cycle through a the reduced and shuffled set of values. @@ -205,17 +246,25 @@ def _set_up_cyclical_iterator(dists: dict[str, int]) -> iter: It also means each batch will contain a representative sample of the distribution. """ d = gcd(*dists.values()) - value_list = [] + value_list: list[str] = [] for entry in dists: value_list.extend([entry] * (dists[entry] // d)) shuffle(value_list) return cycle(value_list) +def _get_pointer_count_poisson_distributions( + num_of_patients: int, pointers_per_px: float +) -> Iterator[int]: + p_count_distr = np.random.poisson(lam=pointers_per_px - 1, size=num_of_patients) + 1 + p_count_distr = np.clip(p_count_distr, a_min=1, a_max=4) + return cycle(p_count_distr) + + def _set_up_custodian_iterators( - custodian_dists: dict[dict[str, int]] -) -> dict[str, iter]: - custodian_iters = {} + custodian_dists: dict[str, dict[str, int]] +) -> dict[str, Iterator[str]]: + custodian_iters: dict[str, Iterator[str]] = {} for pointer_type in custodian_dists: custodian_iters[pointer_type] = _set_up_cyclical_iterator( custodian_dists[pointer_type] @@ -223,21 +272,5 @@ def _set_up_custodian_iterators( return custodian_iters -def _set_up_count_iterator(pointers_per_px: float) -> iter: - """ - Given a target average number of pointers per patient, - generates a distribution of counts per individual patient. - """ - - extra_per_hundred = int( - (pointers_per_px - 1.0) * 100 - ) # no patients can have zero pointers - counts = {} - counts["3"] = extra_per_hundred // 10 - counts["2"] = extra_per_hundred - 2 * counts["3"] - counts["1"] = 100 - counts[2] - counts[3] - return _set_up_cyclical_iterator(counts) - - if __name__ == "__main__": fire.Fire(_populate_seed_table) diff --git a/terraform/account-wide-infrastructure/modules/metadata-bucket/output.tf b/terraform/account-wide-infrastructure/modules/metadata-bucket/output.tf new file mode 100644 index 000000000..77107c9ec --- /dev/null +++ b/terraform/account-wide-infrastructure/modules/metadata-bucket/output.tf @@ -0,0 +1,4 @@ +output "bucket_name" { + description = "Name of the metadata S3 bucket" + value = aws_s3_bucket.metadata_bucket.bucket +} diff --git a/terraform/account-wide-infrastructure/modules/metadata-bucket/s3.tf b/terraform/account-wide-infrastructure/modules/metadata-bucket/s3.tf new file mode 100644 index 000000000..a9f50439c --- /dev/null +++ b/terraform/account-wide-infrastructure/modules/metadata-bucket/s3.tf @@ -0,0 +1,56 @@ +resource "aws_s3_bucket" "metadata_bucket" { + bucket = "${var.name_prefix}-metadata" + force_destroy = false +} + +resource "aws_s3_bucket_policy" "metadata_bucket_policy" { + bucket = aws_s3_bucket.metadata_bucket.id + + policy = jsonencode({ + Version = "2012-10-17" + Id = "metadata_bucket_policy" + Statement = [ + { + Sid = "HTTPSOnly" + Effect = "Deny" + Principal = "*" + Action = "s3:*" + Resource = [ + aws_s3_bucket.metadata_bucket.arn, + "${aws_s3_bucket.metadata_bucket.arn}/*", + ] + Condition = { + Bool = { + "aws:SecureTransport" = "false" + } + } + }, + ] + }) +} + +resource "aws_s3_bucket_public_access_block" "metadata_bucket_public_access_block" { + bucket = aws_s3_bucket.metadata_bucket.id + + block_public_acls = true + block_public_policy = true + ignore_public_acls = true + restrict_public_buckets = true +} + +resource "aws_s3_bucket_server_side_encryption_configuration" "metadata_bucket" { + bucket = aws_s3_bucket.metadata_bucket.bucket + + rule { + apply_server_side_encryption_by_default { + sse_algorithm = "AES256" + } + } +} + +resource "aws_s3_bucket_versioning" "metadata_bucket" { + bucket = aws_s3_bucket.metadata_bucket.id + versioning_configuration { + status = "Enabled" + } +} diff --git a/terraform/account-wide-infrastructure/modules/metadata-bucket/vars.tf b/terraform/account-wide-infrastructure/modules/metadata-bucket/vars.tf new file mode 100644 index 000000000..a2772d710 --- /dev/null +++ b/terraform/account-wide-infrastructure/modules/metadata-bucket/vars.tf @@ -0,0 +1,4 @@ +variable "name_prefix" { + type = string + description = "The prefix to apply to all resources in the module." +} diff --git a/terraform/account-wide-infrastructure/test/dynamodb__pointers-table.tf b/terraform/account-wide-infrastructure/test/dynamodb__pointers-table.tf index edfd782ed..98c7bae25 100644 --- a/terraform/account-wide-infrastructure/test/dynamodb__pointers-table.tf +++ b/terraform/account-wide-infrastructure/test/dynamodb__pointers-table.tf @@ -32,9 +32,6 @@ module "ref-pointers-table" { } module "perftest-pointers-table" { - source = "../modules/pointers-table" - name_prefix = "nhsd-nrlf--perftest" - enable_deletion_protection = true - enable_pitr = true - kms_deletion_window_in_days = 30 + source = "../modules/pointers-table" + name_prefix = "nhsd-nrlf--perftest" } diff --git a/terraform/account-wide-infrastructure/test/s3.tf b/terraform/account-wide-infrastructure/test/s3.tf index 57c51ebf6..4320131fc 100644 --- a/terraform/account-wide-infrastructure/test/s3.tf +++ b/terraform/account-wide-infrastructure/test/s3.tf @@ -64,3 +64,8 @@ module "perftest-truststore-bucket" { name_prefix = "nhsd-nrlf--perftest" server_certificate_file = "../../../truststore/server/perftest.pem" } + +module "perftest-metadata-bucket" { + source = "../modules/metadata-bucket" + name_prefix = "nhsd-nrlf--perftest" +} diff --git a/terraform/infrastructure/data.tf b/terraform/infrastructure/data.tf index 926bd13d2..c99a1bedf 100644 --- a/terraform/infrastructure/data.tf +++ b/terraform/infrastructure/data.tf @@ -17,22 +17,22 @@ data "aws_iam_policy" "auth-store-read-policy" { data "aws_dynamodb_table" "pointers-table" { count = var.use_shared_resources ? 1 : 0 - name = "${local.shared_prefix}-pointers-table" + name = "${local.pointers_table_prefix}-pointers-table" } data "aws_iam_policy" "pointers-table-read" { count = var.use_shared_resources ? 1 : 0 - name = "${local.shared_prefix}-pointers-table-read" + name = "${local.pointers_table_prefix}-pointers-table-read" } data "aws_iam_policy" "pointers-table-write" { count = var.use_shared_resources ? 1 : 0 - name = "${local.shared_prefix}-pointers-table-write" + name = "${local.pointers_table_prefix}-pointers-table-write" } data "aws_iam_policy" "pointers-kms-read-write" { count = var.use_shared_resources ? 1 : 0 - name = "${local.shared_prefix}-pointers-kms-read-write" + name = "${local.pointers_table_prefix}-pointers-kms-read-write" } data "external" "current-info" { diff --git a/terraform/infrastructure/etc/dev.tfvars b/terraform/infrastructure/etc/dev.tfvars index ce5b55be9..f1f1b7cbc 100644 --- a/terraform/infrastructure/etc/dev.tfvars +++ b/terraform/infrastructure/etc/dev.tfvars @@ -1,8 +1,12 @@ account_name = "dev" aws_account_name = "dev" +dynamodb_pointers_table_prefix = "nhsd-nrlf--dev" +dynamodb_sandbox_pointers_table_prefix = "nhsd-nrlf--dev-sandbox" + domain = "api.record-locator.dev.national.nhs.uk" public_domain = "internal-dev.api.service.nhs.uk" public_sandbox_domain = "internal-dev-sandbox.api.service.nhs.uk" -log_retention_period = 90 -enable_reporting = false + +log_retention_period = 90 +enable_reporting = false diff --git a/terraform/infrastructure/etc/int.tfvars b/terraform/infrastructure/etc/int.tfvars index e63c4680b..613f5cb19 100644 --- a/terraform/infrastructure/etc/int.tfvars +++ b/terraform/infrastructure/etc/int.tfvars @@ -1,10 +1,13 @@ account_name = "int" aws_account_name = "test" -domain = "api.record-locator.int.national.nhs.uk" -deletion_protection = true +dynamodb_pointers_table_prefix = "nhsd-nrlf--int" +dynamodb_sandbox_pointers_table_prefix = "nhsd-nrlf--int-sandbox" +deletion_protection = true +domain = "api.record-locator.int.national.nhs.uk" public_domain = "int.api.service.nhs.uk" public_sandbox_domain = "sandbox.api.service.nhs.uk" -log_retention_period = 90 -enable_reporting = true + +log_retention_period = 90 +enable_reporting = true diff --git a/terraform/infrastructure/etc/perftest.tfvars b/terraform/infrastructure/etc/perftest.tfvars index 377fbbfd5..2ddb7ecc5 100644 --- a/terraform/infrastructure/etc/perftest.tfvars +++ b/terraform/infrastructure/etc/perftest.tfvars @@ -1,9 +1,11 @@ account_name = "perftest" aws_account_name = "test" -domain = "perftest.record-locator.national.nhs.uk" -public_domain = "perftest.api.service.nhs.uk" -deletion_protection = true +dynamodb_pointers_table_prefix = "nhsd-nrlf--perftest" + +domain = "perftest.record-locator.national.nhs.uk" +public_domain = "perftest.api.service.nhs.uk" + log_retention_period = 30 enable_reporting = false disable_firehose_lambda_subscriptions = true diff --git a/terraform/infrastructure/etc/prod.tfvars b/terraform/infrastructure/etc/prod.tfvars index 7f93103b0..7ffd74262 100644 --- a/terraform/infrastructure/etc/prod.tfvars +++ b/terraform/infrastructure/etc/prod.tfvars @@ -1,8 +1,11 @@ account_name = "prod" aws_account_name = "prod" -domain = "api.record-locator.national.nhs.uk" -public_domain = "api.service.nhs.uk" -deletion_protection = true +dynamodb_pointers_table_prefix = "nhsd-nrlf--prod" +deletion_protection = true + +domain = "api.record-locator.national.nhs.uk" +public_domain = "api.service.nhs.uk" + log_retention_period = 2192 enable_reporting = true diff --git a/terraform/infrastructure/etc/qa.tfvars b/terraform/infrastructure/etc/qa.tfvars index bfada691e..989530574 100644 --- a/terraform/infrastructure/etc/qa.tfvars +++ b/terraform/infrastructure/etc/qa.tfvars @@ -1,8 +1,12 @@ account_name = "qa" aws_account_name = "test" +dynamodb_pointers_table_prefix = "nhsd-nrlf--qa" +dynamodb_sandbox_pointers_table_prefix = "nhsd-nrlf--qa-sandbox" + domain = "qa.record-locator.national.nhs.uk" public_domain = "internal-qa.api.service.nhs.uk" public_sandbox_domain = "internal-qa-sandbox.api.service.nhs.uk" -log_retention_period = 90 -enable_reporting = false + +log_retention_period = 90 +enable_reporting = false diff --git a/terraform/infrastructure/etc/ref.tfvars b/terraform/infrastructure/etc/ref.tfvars index 9647baa00..70bad7a92 100644 --- a/terraform/infrastructure/etc/ref.tfvars +++ b/terraform/infrastructure/etc/ref.tfvars @@ -1,7 +1,10 @@ account_name = "ref" aws_account_name = "test" -domain = "api.record-locator.ref.national.nhs.uk" -public_domain = "ref.api.service.nhs.uk" +dynamodb_pointers_table_prefix = "nhsd-nrlf--ref" + +domain = "api.record-locator.ref.national.nhs.uk" +public_domain = "ref.api.service.nhs.uk" + log_retention_period = 30 enable_reporting = false diff --git a/terraform/infrastructure/locals.tf b/terraform/infrastructure/locals.tf index de4e6327a..a6c0df93a 100644 --- a/terraform/infrastructure/locals.tf +++ b/terraform/infrastructure/locals.tf @@ -51,6 +51,8 @@ locals { auth_store_id = var.use_shared_resources ? data.aws_s3_bucket.authorization-store[0].id : module.ephemeral-s3-permission-store[0].bucket_id auth_store_read_policy_arn = var.use_shared_resources ? data.aws_iam_policy.auth-store-read-policy[0].arn : module.ephemeral-s3-permission-store[0].bucket_read_policy_arn + pointers_table_prefix = local.is_sandbox_env ? "${var.dynamodb_sandbox_pointers_table_prefix}" : "${var.dynamodb_pointers_table_prefix}" + pointers_table_name = var.use_shared_resources ? data.aws_dynamodb_table.pointers-table[0].name : module.ephemeral-pointers-table[0].table_name pointers_table_read_policy_arn = var.use_shared_resources ? data.aws_iam_policy.pointers-table-read[0].arn : module.ephemeral-pointers-table[0].read_policy_arn pointers_table_write_policy_arn = var.use_shared_resources ? data.aws_iam_policy.pointers-table-write[0].arn : module.ephemeral-pointers-table[0].write_policy_arn diff --git a/terraform/infrastructure/vars.tf b/terraform/infrastructure/vars.tf index 07c901d1a..b8db7d4f1 100644 --- a/terraform/infrastructure/vars.tf +++ b/terraform/infrastructure/vars.tf @@ -67,3 +67,14 @@ variable "disable_firehose_lambda_subscriptions" { type = bool default = false } + +variable "dynamodb_pointers_table_prefix" { + type = string + description = "The prefix of the DynamoDB pointers table to use when using shared resources" +} + +variable "dynamodb_sandbox_pointers_table_prefix" { + type = string + description = "The prefix of the DynamoDB pointers table to use when using shared resources in a sandbox environment" + default = null +}