|
| 1 | +import json |
| 2 | +from datetime import datetime, timedelta, timezone |
| 3 | +from typing import Any |
| 4 | + |
| 5 | +import boto3 |
| 6 | +import fire |
| 7 | + |
| 8 | +from nrlf.core.logger import logger |
| 9 | + |
# Module-level DynamoDB client and its "scan" paginator. Both are created at
# import time; the paginator is what _get_duplicates iterates over.
dynamodb = boto3.client("dynamodb")
paginator = dynamodb.get_paginator("scan")

# Set the logger imported from nrlf.core.logger to the "ERROR" level.
logger.setLevel("ERROR")
| 14 | + |
# Item attributes requested from the table scan; _get_duplicates joins these
# into the scan's ProjectionExpression.
REQUIRED_ATTRIBUTES = [
    "nhs_number",
    "custodian",
    "id",
    "master_identifier",
    "type_id",
    "created_on",
]
| 23 | + |
| 24 | + |
def _parse_custodians(custodians: str | tuple[str, ...]) -> list[str]:
    """Normalise the custodians argument into an ordered, de-duplicated list of codes."""
    raw_codes = (
        custodians.split(",") if isinstance(custodians, str) else list(custodians)
    )
    # str() covers non-string elements (e.g. a purely numeric code parsed as an
    # int by the CLI layer); strip() tolerates "A, B" style input.
    cleaned = (str(code).strip() for code in raw_codes)
    # dict.fromkeys keeps first-seen order while dropping repeated codes.
    return list(dict.fromkeys(code for code in cleaned if code))


def _string_attribute(item: dict[str, Any], name: str, default: str) -> str:
    """Return the "S" (string) value of a DynamoDB item attribute, or a default."""
    return item.get(name, {}).get("S", default)


def _get_duplicates(
    table_name: str, custodians: str | tuple[str, ...], filename: str = "duplicates"
) -> dict[str, Any]:
    """
    Get masterids for duplicate pointers in the given table for a list of custodians.

    Pointers are grouped by custodian, patient (nhs_number) and pointer type;
    any group holding more than one pointer is written to the output file.

    Parameters:
    - table_name: The name of the pointers table to use.
    - custodians: The ODS codes of the custodian(s) to check, either as a
      comma-separated string or as a tuple of codes.
    - filename: A name for the output text file containing the list of affected pointers.

    Returns:
    A summary dict with the keys "output_file", "duplicates-found",
    "scanned-count" and "took-secs".

    Raises:
    ValueError: if no custodian code is left after parsing the input.
    """
    custodian_list = _parse_custodians(custodians)
    if not custodian_list:
        # An empty list would otherwise produce the invalid filter "custodian IN ()".
        raise ValueError("At least one custodian ODS code is required")

    print(  # noqa
        f"Finding duplicate pointers for custodians {custodian_list} in table {table_name}...."
    )

    # Placeholder names are index-based so they stay valid whatever characters
    # the custodian codes themselves contain.
    expression_values = {
        f":custodian{index}": {"S": code}
        for index, code in enumerate(custodian_list)
    }
    placeholder_names = ",".join(expression_values)

    params: dict[str, Any] = {
        "TableName": table_name,
        "PaginationConfig": {"PageSize": 50},
        "FilterExpression": f"custodian IN ({placeholder_names})",
        "ExpressionAttributeValues": expression_values,
        "ProjectionExpression": ",".join(REQUIRED_ATTRIBUTES),
    }

    pointers_by_key: dict[str, list[dict[str, str]]] = {}
    duplicate_keys: set[str] = set()
    total_scanned_count = 0
    duplicate_count = 0

    # Progress is reported when the running total crosses a threshold rather
    # than when it is an exact multiple: a page's ScannedCount is not guaranteed
    # to equal the requested page size, so exact multiples can be skipped.
    dot_interval = 1000
    summary_interval = 100000
    next_dot_at = dot_interval
    next_summary_at = summary_interval

    start_time = datetime.now(tz=timezone.utc)

    for page in paginator.paginate(**params):
        for item in page["Items"]:
            pointer_data = {
                "id": _string_attribute(item, "id", "no-id"),
                "master_id": _string_attribute(
                    item, "master_identifier", "no-master-id"
                ),
                "datetime": _string_attribute(
                    item, "created_on", "no-creation-datetime"
                ),
            }
            custodian = _string_attribute(item, "custodian", "no-custodian")
            patient_id = _string_attribute(item, "nhs_number", "no-patient-id")
            pointer_type = _string_attribute(item, "type_id", "no-type")

            px_type_ods_key = f"{custodian}-{patient_id}-{pointer_type}"

            group = pointers_by_key.setdefault(px_type_ods_key, [])
            if group:
                # The key has been seen before, so this pointer is a duplicate.
                duplicate_count += 1
                duplicate_keys.add(px_type_ods_key)
            group.append(pointer_data)

        total_scanned_count += page["ScannedCount"]

        if total_scanned_count >= next_dot_at:
            print(".", end="", flush=True)  # noqa
            next_dot_at = (total_scanned_count // dot_interval + 1) * dot_interval

        if total_scanned_count >= next_summary_at:
            print(  # noqa
                f"scanned={total_scanned_count} found={duplicate_count} potential duplicates "
            )
            next_summary_at = (
                total_scanned_count // summary_interval + 1
            ) * summary_interval

    end_time = datetime.now(tz=timezone.utc)

    print(" Table scan completed")  # noqa

    # Only keys that occurred more than once are written out, in sorted order.
    output_pointers = {key: pointers_by_key[key] for key in sorted(duplicate_keys)}

    print(f"Writing pointers to file ./{filename}.txt ...")  # noqa
    with open(f"{filename}.txt", "w", encoding="utf-8") as f:
        json.dump(output_pointers, f, indent=2)

    return {
        "output_file": f"{filename}.txt",
        "duplicates-found": duplicate_count,
        "scanned-count": total_scanned_count,
        "took-secs": (end_time - start_time).total_seconds(),
    }
| 119 | + |
| 120 | + |
# When run as a script, hand _get_duplicates to python-fire as the entry point.
if __name__ == "__main__":
    fire.Fire(_get_duplicates)
0 commit comments