Skip to content

Commit 08fdf73

Browse files
Merge pull request #1084 from NHSDigital/feature/kabo5-NRL-1705-flag-duplicates-devops-script
Feature/kabo5 nrl 1705 flag duplicates devops script
2 parents b788431 + cb7b5ed commit 08fdf73

File tree

1 file changed

+122
-0
lines changed

1 file changed

+122
-0
lines changed

reports/find_duplicate_pointers.py

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
import json
2+
from datetime import datetime, timedelta, timezone
3+
from typing import Any
4+
5+
import boto3
6+
import fire
7+
8+
from nrlf.core.logger import logger
9+
10+
# DynamoDB client and its "scan" paginator, created once at module load and
# reused by the scan below.
dynamodb = boto3.client("dynamodb")
paginator = dynamodb.get_paginator("scan")

# Set the imported logger's level to ERROR for the duration of the script.
logger.setLevel("ERROR")

# Attribute names requested from the table (via ProjectionExpression) for
# every scanned pointer.
REQUIRED_ATTRIBUTES = [
    "nhs_number",
    "custodian",
    "id",
    "master_identifier",
    "type_id",
    "created_on",
]
23+
24+
25+
def _normalise_custodians(custodians: str | tuple[str, ...]) -> list[str]:
    """Return the custodian ODS codes as a list of trimmed, non-empty strings."""
    raw_codes = (
        custodians.split(",") if isinstance(custodians, str) else list(custodians)
    )
    # "A, B" must yield "B", not " B": stray whitespace would never match a
    # stored custodian value.
    return [code.strip() for code in raw_codes if code.strip()]


def _string_attribute(item: dict[str, Any], name: str, default: str) -> str:
    """Return the DynamoDB string ("S") value of ``name`` in ``item``, or ``default``."""
    return item.get(name, {}).get("S", default)


def _get_duplicates(
    table_name: str, custodians: str | tuple[str, ...], filename: str = "duplicates"
) -> Any:
    """
    Find pointers that share the same custodian, patient and pointer type.

    Scans the given table, groups the matching pointers by
    ``{custodian}-{nhs_number}-{type_id}`` and writes every group holding more
    than one pointer to ``./{filename}.txt`` as JSON.

    Parameters:
    - table_name: The name of the pointers table to use.
    - custodians: The ODS codes of the custodian(s) to check, either as a
      comma-separated string or as a tuple of codes.
    - filename: A name (without extension) for the output text file containing
      the list of affected pointers.

    Returns:
    A dict with the output file name, the number of potential duplicates
    found, the number of items scanned and the scan duration in seconds.

    Raises:
    ValueError: If no custodian code is left after trimming the input.
    """
    custodian_list = _normalise_custodians(custodians)
    if not custodian_list:
        # An empty list would build the invalid filter "custodian IN ()".
        raise ValueError("At least one custodian ODS code must be provided")

    print(  # noqa
        f"Finding duplicate pointers for custodians {custodian_list} in table {table_name}...."
    )

    # Placeholders are numbered rather than derived from the custodian text so
    # that they stay valid whatever characters a code contains.
    placeholders = [f":param{index}" for index in range(len(custodian_list))]
    placeholder_csv = ",".join(placeholders)

    params: dict[str, Any] = {
        "TableName": table_name,
        "PaginationConfig": {"PageSize": 50},
        "FilterExpression": f"custodian IN ({placeholder_csv})",
        "ExpressionAttributeValues": {
            placeholder: {"S": custodian}
            for placeholder, custodian in zip(placeholders, custodian_list)
        },
        "ProjectionExpression": ",".join(REQUIRED_ATTRIBUTES),
    }

    pointers_by_key: dict[str, list[dict[str, str]]] = {}
    total_scanned_count = 0
    duplicate_count = 0

    start_time = datetime.now(tz=timezone.utc)

    for page in paginator.paginate(**params):
        for item in page["Items"]:
            custodian = _string_attribute(item, "custodian", "no-custodian")
            patient_id = _string_attribute(item, "nhs_number", "no-patient-id")
            pointer_type = _string_attribute(item, "type_id", "no-type")

            pointer_data = {
                "id": _string_attribute(item, "id", "no-id"),
                "master_id": _string_attribute(
                    item, "master_identifier", "no-master-id"
                ),
                "datetime": _string_attribute(
                    item, "created_on", "no-creation-datetime"
                ),
            }

            px_type_ods_key = f"{custodian}-{patient_id}-{pointer_type}"

            group = pointers_by_key.setdefault(px_type_ods_key, [])
            if group:
                # Every pointer after the first one seen for a key counts as a
                # potential duplicate.
                duplicate_count += 1
            group.append(pointer_data)

        scanned_before = total_scanned_count
        total_scanned_count += page["ScannedCount"]

        # Report progress whenever a threshold is crossed. Testing for an exact
        # multiple would go silent as soon as one page returned an odd count.
        if total_scanned_count // 1000 > scanned_before // 1000:
            print(".", end="", flush=True)  # noqa

        if total_scanned_count // 100000 > scanned_before // 100000:
            print(  # noqa
                f"scanned={total_scanned_count} found={duplicate_count} potential duplicates "
            )

    end_time = datetime.now(tz=timezone.utc)

    print(" Table scan completed")  # noqa

    # Keep only keys that collected more than one pointer, in sorted key order.
    output_pointers = {
        key: pointers_by_key[key]
        for key in sorted(pointers_by_key)
        if len(pointers_by_key[key]) > 1
    }

    output_file = f"{filename}.txt"
    print(f"Writing pointers to file ./{output_file} ...")  # noqa
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(output_pointers, indent=2))

    return {
        "output_file": output_file,
        "duplicates-found": duplicate_count,
        "scanned-count": total_scanned_count,
        "took-secs": (end_time - start_time).total_seconds(),
    }
119+
120+
121+
# When run as a script, hand _get_duplicates to Fire so that its parameters
# are supplied from the command line.
if __name__ == "__main__":
    fire.Fire(_get_duplicates)

0 commit comments

Comments
 (0)