59 changes: 59 additions & 0 deletions lambdas/shared/src/common/data_quality/reporter.py
@@ -0,0 +1,59 @@
import json
import uuid
from dataclasses import asdict, dataclass

from botocore.exceptions import ClientError

from common.clients import get_s3_client, logger
from common.data_quality.checker import DataQualityChecker
from common.data_quality.completeness import MissingFields


@dataclass
class DataQualityReport:
data_quality_report_id: str
validationDate: str
completeness: MissingFields
validity: list[str]
timeliness_recorded_days: int
timeliness_ingested_seconds: int


class DataQualityReporter:
"""Generates and sends a data quality report to the relevant S3 bucket."""

def __init__(self, is_batch_csv: bool, bucket: str):
self.s3_client = get_s3_client()
self.bucket = bucket
self.dq_checker = DataQualityChecker(is_batch_csv=is_batch_csv)

def generate_and_send_report(self, immunisation: dict) -> None:
"""Formats and sends a data quality report to the S3 bucket."""
dq_output = self.dq_checker.run_checks(immunisation)
dq_report_id = str(uuid.uuid4())
file_key = f"{dq_report_id}.json"

# Build report
dq_report = DataQualityReport(
data_quality_report_id=dq_report_id,
validationDate=dq_output.validation_datetime,
completeness=dq_output.missing_fields,
validity=dq_output.invalid_fields,
timeliness_recorded_days=dq_output.timeliness.recorded_timeliness_days,
timeliness_ingested_seconds=dq_output.timeliness.ingested_timeliness_seconds,
)

# Send to S3 bucket
try:
self.s3_client.put_object(
Bucket=self.bucket, Key=file_key, Body=json.dumps(asdict(dq_report)), ContentType="application/json"
)
except ClientError as error:
# We only log the error here because we want the data quality checks to have minimal impact on the API's
# functionality. This should only happen in the case of AWS infrastructure issues.
            logger.error("error whilst sending data quality report for report id: %s with error: %s", dq_report_id, str(error))
return None

logger.info("data quality report sent successfully for report id: %s", dq_report_id)

return None
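A minimal usage sketch of the new reporter (hypothetical call site: the environment variable name and row values below are illustrative assumptions, not taken from this change):

import os

from common.data_quality.reporter import DataQualityReporter

# Hypothetical: one parsed batch-CSV row, already converted to a dict by the caller.
immunisation_row = {"NHS_NUMBER": "9000000009", "DOSE_AMOUNT": "0.5"}  # illustrative fields only
reporter = DataQualityReporter(is_batch_csv=True, bucket=os.environ["DATA_QUALITY_BUCKET"])  # bucket name is an assumption
reporter.generate_and_send_report(immunisation_row)  # writes "<report id>.json" to the bucket; S3 failures are logged, not raised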
@@ -7,7 +7,6 @@

class TestDataQualityCompletenessChecker(unittest.TestCase):
def setUp(self):
super().setUp()
self.DataQualityCompletenessChecker = DataQualityCompletenessChecker()

def test_check_completeness_no_missing_fields(self):
133 changes: 131 additions & 2 deletions lambdas/shared/tests/test_common/data_quality/test_reporter.py
@@ -1,6 +1,135 @@
import datetime
import json
import unittest
import uuid
from copy import deepcopy
from dataclasses import asdict
from unittest.mock import patch

import boto3
from moto import mock_aws

from common.data_quality.completeness import MissingFields
from common.data_quality.reporter import DataQualityReport, DataQualityReporter
from test_common.data_quality.sample_values import VALID_BATCH_IMMUNISATION, VALID_FHIR_IMMUNISATION


@mock_aws
class TestDataQualityReporter(unittest.TestCase):
def test_something(self):
self.assertEqual(True, True) # add assertion here
def setUp(self):
# Fix date.today() for all validation tests
date_today_patcher = patch("common.data_quality.models.immunization_batch_row_model.datetime", wraps=datetime)
        self.mock_date_today = date_today_patcher.start()
        self.addCleanup(date_today_patcher.stop)
self.mock_date_today.date.today.return_value = datetime.date(2024, 5, 20)

# Fix datetime.now
self.mock_fixed_datetime = datetime.datetime(2024, 5, 20, 14, 12, 30, 123, tzinfo=datetime.timezone.utc)
datetime_now_patcher = patch("common.data_quality.checker.datetime", wraps=datetime.datetime)
        self.mock_datetime_now = datetime_now_patcher.start()
        self.addCleanup(datetime_now_patcher.stop)
self.mock_datetime_now.now.return_value = self.mock_fixed_datetime

# Fix generated UUID
self.example_uuid = uuid.UUID("fa711f35-c08b-48c8-b498-3b151e686ddf")
uuid_patcher = patch("uuid.uuid4", return_value=self.example_uuid)
        self.mock_uuid = uuid_patcher.start()
        self.addCleanup(uuid_patcher.stop)

# Set up mock S3 bucket
self.bucket = "test_bucket"
self.s3_client = boto3.client("s3")
self.s3_client.create_bucket(Bucket=self.bucket)

# Instantiate reporters
self.batch_dq_reporter = DataQualityReporter(is_batch_csv=True, bucket=self.bucket)
self.fhir_json_dq_reporter = DataQualityReporter(is_batch_csv=False, bucket=self.bucket)

# Expected reports
self.expected_dq_report_no_issues = DataQualityReport(
data_quality_report_id=str(self.example_uuid),
validationDate="2024-05-20T14:12:30.000Z",
completeness=MissingFields(required_fields=[], mandatory_fields=[], optional_fields=[]),
validity=[],
timeliness_recorded_days=4,
timeliness_ingested_seconds=785550,
)
self.expected_dq_report_with_issues = DataQualityReport(
data_quality_report_id=str(self.example_uuid),
validationDate="2024-05-20T14:12:30.000Z",
completeness=MissingFields(
required_fields=["NHS_NUMBER", "INDICATION_CODE"],
mandatory_fields=["PERSON_FORENAME", "PERSON_SURNAME"],
optional_fields=["PERFORMING_PROFESSIONAL_FORENAME", "PERFORMING_PROFESSIONAL_SURNAME"],
),
validity=["NHS_NUMBER", "DOSE_AMOUNT", "INDICATION_CODE"],
timeliness_recorded_days=4,
timeliness_ingested_seconds=785550,
)

def generate_and_send_report_test_logic(
self, expected_dq_report: DataQualityReport, immunisation: dict, is_batch_csv: bool
):
# run generate report
if is_batch_csv:
self.batch_dq_reporter.generate_and_send_report(immunisation)
else:
self.fhir_json_dq_reporter.generate_and_send_report(immunisation)

expected_json = json.dumps(asdict(expected_dq_report))

actual_json_object = self.s3_client.get_object(Bucket=self.bucket, Key=f"{str(self.example_uuid)}.json")
actual_json = actual_json_object.get("Body").read().decode("utf-8")

self.assertEqual(expected_json, actual_json)

def test_generate_and_send_report_no_issues_batch(self):
self.generate_and_send_report_test_logic(
expected_dq_report=self.expected_dq_report_no_issues,
immunisation=VALID_BATCH_IMMUNISATION,
is_batch_csv=True,
)

def test_generate_and_send_report_no_issues_api(self):
self.generate_and_send_report_test_logic(
expected_dq_report=self.expected_dq_report_no_issues,
immunisation=VALID_FHIR_IMMUNISATION,
is_batch_csv=False,
)

def test_generate_and_send_report_with_issues_batch(self):
batch_immunisation_with_issues = deepcopy(VALID_BATCH_IMMUNISATION)

# Missing fields
batch_immunisation_with_issues.pop("NHS_NUMBER") # required
batch_immunisation_with_issues.pop("INDICATION_CODE") # required
batch_immunisation_with_issues.pop("PERSON_FORENAME") # mandatory
batch_immunisation_with_issues.pop("PERSON_SURNAME") # mandatory
batch_immunisation_with_issues.pop("PERFORMING_PROFESSIONAL_FORENAME") # optional
batch_immunisation_with_issues.pop("PERFORMING_PROFESSIONAL_SURNAME") # optional

# Invalid fields
batch_immunisation_with_issues["DOSE_AMOUNT"] = "6.789"

self.generate_and_send_report_test_logic(
expected_dq_report=self.expected_dq_report_with_issues,
immunisation=batch_immunisation_with_issues,
is_batch_csv=True,
)

def test_generate_and_send_report_with_issues_api(self):
fhir_immunisation_with_issues = deepcopy(VALID_FHIR_IMMUNISATION)

# Missing fields
fhir_immunisation_with_issues["contained"][1]["identifier"][0]["value"] = "" # required
fhir_immunisation_with_issues["reasonCode"][0]["coding"][0]["code"] = "" # required
fhir_immunisation_with_issues["contained"][1]["name"][0]["given"][0] = "" # mandatory
fhir_immunisation_with_issues["contained"][1]["name"][0]["family"] = "" # mandatory
fhir_immunisation_with_issues["contained"][0]["name"][0]["given"][0] = "" # optional
fhir_immunisation_with_issues["contained"][0]["name"][0]["family"] = "" # optional

# Invalid fields
fhir_immunisation_with_issues["doseQuantity"]["value"] = "6.789"

self.generate_and_send_report_test_logic(
expected_dq_report=self.expected_dq_report_with_issues,
immunisation=fhir_immunisation_with_issues,
is_batch_csv=False,
)