"""
Report duplicate pointers held by a set of custodians.

Scans a DynamoDB pointers table, groups the pointers belonging to the requested
custodians by (custodian, NHS number, pointer type) and writes every group that
contains more than one pointer to a JSON-formatted text file.
"""

# Consolidated from the three-commit NRL-1705 patch series that introduced
# reports/find_duplicate_pointers.py.

import json
from datetime import datetime, timedelta, timezone
from typing import Any

import boto3
import fire

from nrlf.core.logger import logger

dynamodb = boto3.client("dynamodb")
paginator = dynamodb.get_paginator("scan")

logger.setLevel("ERROR")

REQUIRED_ATTRIBUTES = [
    "nhs_number",
    "custodian",
    "id",
    "master_identifier",
    "type_id",
    "created_on",
]

# Number of items requested from DynamoDB per scan page.
SCAN_PAGE_SIZE = 50
# Progress output cadence, expressed in scanned items.
PROGRESS_DOT_EVERY = 1000
PROGRESS_SUMMARY_EVERY = 100000


def _parse_custodians(custodians: str | tuple[str, ...]) -> list[str]:
    """Normalise the custodians argument into a clean, ordered list of codes."""
    raw_codes = (
        custodians.split(",") if isinstance(custodians, str) else list(custodians)
    )
    stripped = (str(code).strip() for code in raw_codes)
    # dict.fromkeys de-duplicates while preserving the caller's order.
    custodian_list = list(dict.fromkeys(code for code in stripped if code))
    if not custodian_list:
        # "custodian IN ()" is not a valid filter expression, so fail early
        # with a clear message rather than at request time.
        raise ValueError("At least one custodian ODS code must be provided")
    return custodian_list


def _build_scan_params(table_name: str, custodian_list: list[str]) -> dict[str, Any]:
    """Build the paginated scan arguments that filter on the given custodians."""
    # Placeholders are index-based rather than derived from the custodian code:
    # DynamoDB only accepts a colon followed by alphanumeric characters, so a
    # code containing anything else would otherwise break the expression.
    # NOTE: DynamoDB's IN operator accepts at most 100 operands.
    expression_values = {
        f":custodian{index}": {"S": code} for index, code in enumerate(custodian_list)
    }
    return {
        "TableName": table_name,
        "PaginationConfig": {"PageSize": SCAN_PAGE_SIZE},
        "FilterExpression": f"custodian IN ({','.join(expression_values)})",
        "ExpressionAttributeValues": expression_values,
        "ProjectionExpression": ",".join(REQUIRED_ATTRIBUTES),
    }


def _get_duplicates(
    table_name: str,
    custodians: str | tuple[str, ...],
    filename: str = "duplicates",
) -> dict[str, Any]:
    """
    Find duplicate pointers in the given table for a list of custodians.

    Pointers are considered duplicates of each other when they share the same
    custodian, NHS number and pointer type. Each group of duplicates is written
    to "<filename>.txt" as JSON, keyed by "<custodian>-<nhs_number>-<type_id>",
    with the id, master identifier and creation datetime of every pointer.

    Parameters:
    - table_name: The name of the pointers table to use.
    - custodians: The ODS codes of the custodian(s) to check, either as a
      comma-separated string or as a tuple of codes.
    - filename: A name for the output text file containing the list of affected pointers.

    Returns:
    A summary dict with the output file name, the number of duplicates found
    (pointers beyond the first in each group), the number of items scanned and
    the time taken in seconds.

    Raises:
    ValueError: if no custodian codes are supplied.
    """
    custodian_list = _parse_custodians(custodians)

    print(  # noqa
        f"Finding duplicate pointers for custodians {custodian_list} in table {table_name}...."
    )

    params = _build_scan_params(table_name, custodian_list)

    pointers_by_key: dict[str, list[dict[str, str]]] = {}
    total_scanned_count = 0
    duplicate_count = 0

    start_time = datetime.now(tz=timezone.utc)

    for page in paginator.paginate(**params):
        for item in page["Items"]:
            pointer_id = item.get("id", {}).get("S", "no-id")
            pointer_type = item.get("type_id", {}).get("S", "no-type")
            master_id = item.get("master_identifier", {}).get("S", "no-master-id")
            custodian = item.get("custodian", {}).get("S", "no-custodian")
            patient_id = item.get("nhs_number", {}).get("S", "no-patient-id")
            created_on = item.get("created_on", {}).get("S", "no-creation-datetime")

            px_type_ods_key = f"{custodian}-{patient_id}-{pointer_type}"

            pointers = pointers_by_key.setdefault(px_type_ods_key, [])
            if pointers:
                # Every pointer after the first one seen for a key is a duplicate.
                duplicate_count += 1
            pointers.append(
                {
                    "id": pointer_id,
                    "master_id": master_id,
                    "datetime": created_on,
                }
            )

        scanned_before = total_scanned_count
        total_scanned_count += page["ScannedCount"]

        # Report progress whenever a reporting boundary is crossed. An exact
        # "% N == 0" test stops firing for good once a single page scans fewer
        # items than the page size and the running total goes out of step.
        if total_scanned_count // PROGRESS_DOT_EVERY > scanned_before // PROGRESS_DOT_EVERY:
            print(".", end="", flush=True)  # noqa

        if (
            total_scanned_count // PROGRESS_SUMMARY_EVERY
            > scanned_before // PROGRESS_SUMMARY_EVERY
        ):
            print(  # noqa
                f"scanned={total_scanned_count} found={duplicate_count} potential duplicates "
            )

    end_time = datetime.now(tz=timezone.utc)
    elapsed: timedelta = end_time - start_time

    print(" Table scan completed")  # noqa

    # Only keys that collected more than one pointer are duplicates; sort them
    # so the output file is stable between runs.
    output_pointers = {
        key: pointers
        for key, pointers in sorted(pointers_by_key.items())
        if len(pointers) > 1
    }

    output_file = f"{filename}.txt"
    print(f"Writing pointers to file ./{output_file} ...")  # noqa
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(output_pointers, f, indent=2)

    return {
        "output_file": output_file,
        "duplicates-found": duplicate_count,
        "scanned-count": total_scanned_count,
        "took-secs": elapsed.total_seconds(),
    }


if __name__ == "__main__":
    fire.Fire(_get_duplicates)