From 84e38fd719e9a1837454bc0d1594380ed353b096 Mon Sep 17 00:00:00 2001
From: NogaNHS <127490765+NogaNHS@users.noreply.github.com>
Date: Wed, 11 Feb 2026 10:54:46 +0000
Subject: [PATCH 01/13] [PRMP-1384] Add file validation for document uploads
and update dependencies
- Implement a check for locked or corrupt files during the document upload process.
- Introduce a new INVALID status for documents that are password protected or corrupted.
- Add a requirements file for the files_lambda_layer that includes msoffcrypto-tool.
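
Illustrative sketch of the new flow (names as in this patch; not an
exact excerpt): after a CLEAN virus scan result, the upload service
inspects the file itself before promoting it out of staging.

    file_type_extension = document_reference.file_name.split(".")[-1].lower()
    if upload_service.is_file_invalid(object_key, file_type_extension):
        # surfaced to clients as DocumentStatus.INVALID / error code UC_4006
        virus_scan_result = VirusScanResult.INVALID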
---
...base-lambda-layer-reusable-publish-all.yml | 12 ++
.../base-lambdas-reusable-deploy-all.yml | 2 +-
Makefile | 4 +
lambdas/enums/document_status.py | 1 +
lambdas/enums/lambda_error.py | 8 +-
lambdas/enums/virus_scan_result.py | 1 +
.../requirements_files_lambda_layer.txt | 1 +
lambdas/requirements/requirements_test.txt | 2 +-
lambdas/services/base/s3_service.py | 35 ++---
.../services/get_document_upload_status.py | 6 +-
lambdas/services/pdf_stitch_service.py | 34 -----
.../upload_document_reference_service.py | 94 +++++++-----
.../unit/services/test_pdf_stitch_service.py | 68 ---------
.../test_upload_document_reference_service.py | 111 +++++++++-----
lambdas/tests/unit/utils/test_file_utils.py | 129 ++++++++++++++++-
lambdas/utils/file_utils.py | 43 +++++-
poetry.lock | 135 ++++++++++++------
pyproject.toml | 3 +
18 files changed, 441 insertions(+), 248 deletions(-)
create mode 100644 lambdas/requirements/layers/requirements_files_lambda_layer.txt
delete mode 100755 lambdas/services/pdf_stitch_service.py
delete mode 100644 lambdas/tests/unit/services/test_pdf_stitch_service.py
diff --git a/.github/workflows/base-lambda-layer-reusable-publish-all.yml b/.github/workflows/base-lambda-layer-reusable-publish-all.yml
index be97edb37f..a7e49b91b9 100644
--- a/.github/workflows/base-lambda-layer-reusable-publish-all.yml
+++ b/.github/workflows/base-lambda-layer-reusable-publish-all.yml
@@ -87,3 +87,15 @@ jobs:
lambda_layer_name: alerting_lambda_layer
secrets:
AWS_ASSUME_ROLE: ${{ secrets.AWS_ASSUME_ROLE }}
+
+ deploy_files_lambda_layer:
+ name: Deploy files_lambda_layer
+ uses: ./.github/workflows/base-lambda-layer-reusable-publish.yml
+ with:
+      environment: ${{ inputs.environment }}
+ python_version: ${{ inputs.python_version }}
+ build_branch: ${{ inputs.build_branch }}
+ sandbox: ${{ inputs.sandbox }}
+ lambda_layer_name: files_lambda_layer
+ secrets:
+ AWS_ASSUME_ROLE: ${{ secrets.AWS_ASSUME_ROLE }}
\ No newline at end of file
diff --git a/.github/workflows/base-lambdas-reusable-deploy-all.yml b/.github/workflows/base-lambdas-reusable-deploy-all.yml
index 495c36fb46..32d347ae45 100644
--- a/.github/workflows/base-lambdas-reusable-deploy-all.yml
+++ b/.github/workflows/base-lambdas-reusable-deploy-all.yml
@@ -695,7 +695,7 @@ jobs:
sandbox: ${{ inputs.sandbox }}
lambda_handler_name: document_reference_virus_scan_handler
lambda_aws_name: DocumentReferenceVirusScanCheck
- lambda_layer_names: "core_lambda_layer"
+ lambda_layer_names: "core_lambda_layer,files_lambda_layer"
secrets:
AWS_ASSUME_ROLE: ${{ secrets.AWS_ASSUME_ROLE }}
diff --git a/Makefile b/Makefile
index f9739a9c22..704f970661 100644
--- a/Makefile
+++ b/Makefile
@@ -7,6 +7,7 @@ GITHUB_REQUIREMENTS=$(REQUIREMENTS_PATH)/requirements_github_runner.txt
TEST_REQUIREMENTS=$(REQUIREMENTS_PATH)/requirements_test.txt
CORE_REQUIREMENTS=$(LAMBDA_LAYER_REQUIREMENTS_PATH)/requirements_core_lambda_layer.txt
DATA_REQUIREMENTS=$(LAMBDA_LAYER_REQUIREMENTS_PATH)/requirements_data_lambda_layer.txt
+FILES_REQUIREMENTS=$(LAMBDA_LAYER_REQUIREMENTS_PATH)/requirements_files_lambda_layer.txt
REPORTS_REQUIREMENTS=$(LAMBDA_LAYER_REQUIREMENTS_PATH)/requirements_reports_lambda_layer.txt
ALERTING_REQUIREMENTS=$(LAMBDA_LAYER_REQUIREMENTS_PATH)/requirements_alerting_lambda_layer.txt
EDGE_REQUIREMENTS=$(REQUIREMENTS_PATH)/requirements_edge_lambda.txt
@@ -98,6 +99,7 @@ sort-requirements:
sort -o $(TEST_REQUIREMENTS) $(TEST_REQUIREMENTS)
sort -o $(CORE_REQUIREMENTS) $(CORE_REQUIREMENTS)
sort -o $(DATA_REQUIREMENTS) $(DATA_REQUIREMENTS)
+ sort -o $(FILES_REQUIREMENTS) $(FILES_REQUIREMENTS)
sort -o $(REPORTS_REQUIREMENTS) $(REPORTS_REQUIREMENTS)
sort -o $(ALERTING_REQUIREMENTS) $(ALERTING_REQUIREMENTS)
@@ -106,6 +108,7 @@ check-packages:
./lambdas/venv/bin/pip-audit -r $(TEST_REQUIREMENTS)
./lambdas/venv/bin/pip-audit -r $(CORE_REQUIREMENTS)
./lambdas/venv/bin/pip-audit -r $(DATA_REQUIREMENTS)
+ ./lambdas/venv/bin/pip-audit -r $(FILES_REQUIREMENTS)
./lambdas/venv/bin/pip-audit -r $(REPORTS_REQUIREMENTS)
./lambdas/venv/bin/pip-audit -r $(ALERTING_REQUIREMENTS)
@@ -206,6 +209,7 @@ env:
@./lambdas/venv/bin/pip3 install -r $(TEST_REQUIREMENTS) --no-cache-dir
@./lambdas/venv/bin/pip3 install -r $(CORE_REQUIREMENTS) --no-cache-dir
@./lambdas/venv/bin/pip3 install -r $(DATA_REQUIREMENTS) --no-cache-dir
+ @./lambdas/venv/bin/pip3 install -r $(FILES_REQUIREMENTS) --no-cache-dir
@./lambdas/venv/bin/pip3 install -r $(REPORTS_REQUIREMENTS) --no-cache-dir
@./lambdas/venv/bin/pip3 install -r $(ALERTING_REQUIREMENTS) --no-cache-dir
@echo " "
diff --git a/lambdas/enums/document_status.py b/lambdas/enums/document_status.py
index 2a16cd5bec..65ccac376b 100644
--- a/lambdas/enums/document_status.py
+++ b/lambdas/enums/document_status.py
@@ -6,6 +6,7 @@ class DocumentStatus(Enum):
FORBIDDEN = ("forbidden", "UC_4003")
NOT_FOUND = ("not-found", "UC_4004")
INFECTED = ("infected", "UC_4005")
+ INVALID = ("invalid", "UC_4006")
@property
def code(self):
diff --git a/lambdas/enums/lambda_error.py b/lambdas/enums/lambda_error.py
index 57563fd3db..95b60ccdfb 100644
--- a/lambdas/enums/lambda_error.py
+++ b/lambdas/enums/lambda_error.py
@@ -43,7 +43,7 @@ def create_error_response(
return error_response
def to_str(
- self, params: Optional[dict] = None, details: Optional[str] = None
+ self, params: Optional[dict] = None, details: Optional[str] = None,
) -> str:
message = self.value["message"]
if "%" in message and params:
@@ -59,7 +59,7 @@ def create_error_body(
**kwargs,
) -> str:
return self.create_error_response(
- params=params, details=details, **kwargs
+ params=params, details=details, **kwargs,
).create()
"""
@@ -440,6 +440,10 @@ def create_error_body(
"err_code": "UC_4005",
"message": "Some of the given document references are not referring to clean files",
}
+ UploadConfirmResultFilesInvalid = {
+ "err_code": "UC_4006",
+ "message": "Some of the given document references are password protected or corrupted",
+ }
UploadConfirmResultAWSFailure = {
"err_code": "UC_5004",
"message": "Error occurred with an AWS service",
diff --git a/lambdas/enums/virus_scan_result.py b/lambdas/enums/virus_scan_result.py
index 484d9e9027..ade769ab5c 100644
--- a/lambdas/enums/virus_scan_result.py
+++ b/lambdas/enums/virus_scan_result.py
@@ -7,6 +7,7 @@ class VirusScanResult(StrEnum):
INFECTED_ALLOWED = "InfectedAllowed"
UNSCANNABLE = "Unscannable"
ERROR = "Error"
+ INVALID = "Invalid"
SCAN_RESULT_TAG_KEY = "scan-result"
diff --git a/lambdas/requirements/layers/requirements_files_lambda_layer.txt b/lambdas/requirements/layers/requirements_files_lambda_layer.txt
new file mode 100644
index 0000000000..0d4bff021d
--- /dev/null
+++ b/lambdas/requirements/layers/requirements_files_lambda_layer.txt
@@ -0,0 +1 @@
+msoffcrypto-tool==6.0.0
\ No newline at end of file
diff --git a/lambdas/requirements/requirements_test.txt b/lambdas/requirements/requirements_test.txt
index 2a3ae5bf28..ac64194651 100644
--- a/lambdas/requirements/requirements_test.txt
+++ b/lambdas/requirements/requirements_test.txt
@@ -9,4 +9,4 @@ pytest-unordered==0.6.0
pytest==7.4.3
requests_mock==1.11.0
ruff==0.12.8
-syrupy==4.9.1
+syrupy==4.9.1
\ No newline at end of file
diff --git a/lambdas/services/base/s3_service.py b/lambdas/services/base/s3_service.py
index 3ee42b4e24..c9de2a9bea 100644
--- a/lambdas/services/base/s3_service.py
+++ b/lambdas/services/base/s3_service.py
@@ -40,7 +40,7 @@ def __init__(self, custom_aws_role=None):
if custom_aws_role:
self.iam_service = IAMService()
self.custom_client, self.expiration_time = self.iam_service.assume_role(
- self.custom_aws_role, "s3", config=self.config
+ self.custom_aws_role, "s3", config=self.config,
)
# S3 Location should be a minimum of a s3_object_key but can also be a directory location in the form of
@@ -48,11 +48,11 @@ def __init__(self, custom_aws_role=None):
def create_upload_presigned_url(self, s3_bucket_name: str, s3_object_location: str):
if self.custom_client:
if datetime.now(timezone.utc) > self.expiration_time - timedelta(
- minutes=10
+ minutes=10,
):
logger.info(S3Service.EXPIRED_SESSION_WARNING)
self.custom_client, self.expiration_time = self.iam_service.assume_role(
- self.custom_aws_role, "s3", config=self.config
+ self.custom_aws_role, "s3", config=self.config,
)
return self.custom_client.generate_presigned_post(
s3_bucket_name,
@@ -65,11 +65,11 @@ def create_upload_presigned_url(self, s3_bucket_name: str, s3_object_location: s
def create_put_presigned_url(self, s3_bucket_name: str, file_key: str):
if self.custom_client:
if datetime.now(timezone.utc) > self.expiration_time - timedelta(
- minutes=10
+ minutes=10,
):
logger.info(S3Service.EXPIRED_SESSION_WARNING)
self.custom_client, self.expiration_time = self.iam_service.assume_role(
- self.custom_aws_role, "s3", config=self.config
+ self.custom_aws_role, "s3", config=self.config,
)
logger.info("Generating presigned URL")
return self.custom_client.generate_presigned_url(
@@ -82,11 +82,11 @@ def create_put_presigned_url(self, s3_bucket_name: str, file_key: str):
def create_download_presigned_url(self, s3_bucket_name: str, file_key: str):
if self.custom_client:
if datetime.now(timezone.utc) > self.expiration_time - timedelta(
- minutes=10
+ minutes=10,
):
logger.info(S3Service.EXPIRED_SESSION_WARNING)
self.custom_client, self.expiration_time = self.iam_service.assume_role(
- self.custom_aws_role, "s3", config=self.config
+ self.custom_aws_role, "s3", config=self.config,
)
logger.info("Generating presigned URL")
return self.custom_client.generate_presigned_url(
@@ -150,17 +150,17 @@ def copy_across_bucket(
raise e
def delete_object(
- self, s3_bucket_name: str, file_key: str, version_id: str | None = None
+ self, s3_bucket_name: str, file_key: str, version_id: str | None = None,
):
if version_id is None:
return self.client.delete_object(Bucket=s3_bucket_name, Key=file_key)
return self.client.delete_object(
- Bucket=s3_bucket_name, Key=file_key, VersionId=version_id
+ Bucket=s3_bucket_name, Key=file_key, VersionId=version_id,
)
def create_object_tag(
- self, s3_bucket_name: str, file_key: str, tag_key: str, tag_value: str
+ self, s3_bucket_name: str, file_key: str, tag_key: str, tag_value: str,
):
return self.client.put_object_tagging(
Bucket=s3_bucket_name,
@@ -168,7 +168,7 @@ def create_object_tag(
Tagging={
"TagSet": [
{"Key": tag_key, "Value": tag_value},
- ]
+ ],
},
)
@@ -182,7 +182,7 @@ def get_tag_value(self, s3_bucket_name: str, file_key: str, tag_key: str) -> str
return key_value_pair["Value"]
raise TagNotFoundException(
- f"Object {file_key} doesn't have a tag of key {tag_key}"
+ f"Object {file_key} doesn't have a tag of key {tag_key}",
)
def file_exist_on_s3(self, s3_bucket_name: str, file_key: str) -> bool:
@@ -218,8 +218,11 @@ def get_file_size(self, s3_bucket_name: str, object_key: str) -> int:
def get_head_object(self, bucket: str, key: str):
return self.client.head_object(Bucket=bucket, Key=key)
- def get_object_stream(self, bucket: str, key: str):
- response = self.client.get_object(Bucket=bucket, Key=key)
+    def get_object_stream(self, bucket: str, key: str, byte_range: str | None = None):
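+        # byte_range, when provided, is forwarded as the S3 Range header,
+        # e.g. "bytes=0-1023"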
+ params = {"Bucket": bucket, "Key": key}
+ if byte_range:
+ params["Range"] = byte_range
+ response = self.client.get_object(**params)
return response.get("Body")
def stream_s3_object_to_memory(self, bucket: str, key: str) -> BytesIO:
@@ -247,11 +250,11 @@ def upload_file_obj(
logger.info(f"Uploaded file object to s3://{s3_bucket_name}/{file_key}")
except ClientError as e:
logger.error(
- f"Failed to upload file object to s3://{s3_bucket_name}/{file_key} - {e}"
+ f"Failed to upload file object to s3://{s3_bucket_name}/{file_key} - {e}",
)
raise e
def save_or_create_file(self, source_bucket: str, file_key: str, body: bytes):
return self.client.put_object(
- Bucket=source_bucket, Key=file_key, Body=BytesIO(body)
+ Bucket=source_bucket, Key=file_key, Body=BytesIO(body),
)
diff --git a/lambdas/services/get_document_upload_status.py b/lambdas/services/get_document_upload_status.py
index 113b08908b..f6c67c6b67 100644
--- a/lambdas/services/get_document_upload_status.py
+++ b/lambdas/services/get_document_upload_status.py
@@ -24,12 +24,14 @@ def _determine_document_status(self, doc_ref, nhs_number):
if doc_ref.doc_status == "cancelled":
if doc_ref.virus_scanner_result == VirusScanResult.INFECTED:
return DocumentStatus.INFECTED.display, DocumentStatus.INFECTED.code
+ elif doc_ref.virus_scanner_result == VirusScanResult.INVALID:
+ return DocumentStatus.INVALID.display, DocumentStatus.INVALID.code
return DocumentStatus.CANCELLED.display, DocumentStatus.CANCELLED.code
return doc_ref.doc_status, None
def get_document_references_by_id(
- self, nhs_number: str, document_ids: list[str]
+ self, nhs_number: str, document_ids: list[str],
) -> dict:
"""
Checks the status of a list of documents for a given patient.
@@ -42,7 +44,7 @@ def get_document_references_by_id(
A dictionary with a list of document IDs and their corresponding statuses.
"""
found_docs = self.document_service.get_batch_document_references_by_id(
- document_ids, SupportedDocumentTypes.LG
+ document_ids, SupportedDocumentTypes.LG,
)
found_docs_by_id = {doc.id: doc for doc in found_docs}
results = {}
diff --git a/lambdas/services/pdf_stitch_service.py b/lambdas/services/pdf_stitch_service.py
deleted file mode 100755
index d08a20d1ab..0000000000
--- a/lambdas/services/pdf_stitch_service.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import os
-from uuid import uuid4
-
-from pypdf import PdfReader, PdfWriter
-from utils.audit_logging_setup import LoggingService
-
-logger = LoggingService(__name__)
-
-
-def stitch_pdf(filenames: list[str], temp_folder: str = "/tmp/") -> str:
- """
- Given a list of local PDF files, stitch them into one file and return the local file path of a resulting file.
-
- Example usage:
- filenames = ["file1.pdf", "file2.pdf", "file3.pdf"]
- tmp_folder = "/tmp/"
- stitch_pdf(filename, tmp_folder)
-
- Result:
- "/tmp/(filename_of_stitched_file).pdf"
- """
- merger = PdfWriter()
- for filename in filenames:
- merger.append(filename)
- output_filename = os.path.join(temp_folder, f"{str(uuid4())}.pdf")
- merger.write(output_filename)
- return output_filename
-
-
-def count_page_number(filename: str) -> int:
- """
- Return the total number of pages in a PDF file
- """
- return len(PdfReader(filename).pages)
diff --git a/lambdas/services/upload_document_reference_service.py b/lambdas/services/upload_document_reference_service.py
index 685d2e1544..eaded11355 100644
--- a/lambdas/services/upload_document_reference_service.py
+++ b/lambdas/services/upload_document_reference_service.py
@@ -1,3 +1,4 @@
+import io
import os
from typing import Optional
@@ -21,6 +22,7 @@
FileProcessingException,
TransactionConflictException,
)
+from utils.file_utils import check_file_locked_or_corrupt
from utils.lambda_exceptions import InvalidDocTypeException
from utils.s3_utils import DocTypeS3BucketRouter
from utils.utilities import get_virus_scan_service
@@ -42,7 +44,7 @@ def __init__(self):
self.bucket_router = DocTypeS3BucketRouter()
def handle_upload_document_reference_request(
- self, object_key: str, object_size: int = 0
+ self, object_key: str, object_size: int = 0,
):
"""Handle the upload document reference request with comprehensive error handling"""
if not object_key:
@@ -58,13 +60,13 @@ def handle_upload_document_reference_request(
self._get_infrastructure_for_document_key(object_parts)
preliminary_document_reference = self._fetch_preliminary_document_reference(
- document_key, nhs_number
+ document_key, nhs_number,
)
if not preliminary_document_reference:
return
self._process_preliminary_document_reference(
- preliminary_document_reference, object_key, object_size
+ preliminary_document_reference, object_key, object_size,
)
except Exception as e:
@@ -86,12 +88,12 @@ def _get_infrastructure_for_document_key(self, object_parts: list[str]) -> None:
self.destination_bucket_name = self.bucket_router.resolve(doc_type)
except KeyError:
logger.error(
- f"SNOMED code {doc_type.code} - {doc_type.display_name} is not supported"
+ f"SNOMED code {doc_type.code} - {doc_type.display_name} is not supported",
)
raise InvalidDocTypeException(400, LambdaError.DocTypeDB)
def _fetch_preliminary_document_reference(
- self, document_key: str, nhs_number: str | None = None
+ self, document_key: str, nhs_number: str | None = None,
) -> Optional[DocumentReference]:
"""Fetch document reference from the database"""
try:
@@ -101,7 +103,7 @@ def _fetch_preliminary_document_reference(
else:
if not nhs_number:
logger.error(
- f"Failed to process object key with ID: {document_key}"
+ f"Failed to process object key with ID: {document_key}",
)
raise FileProcessingException(400, LambdaError.DocRefInvalidFiles)
@@ -117,24 +119,24 @@ def _fetch_preliminary_document_reference(
if not documents:
logger.error(
- f"No document with the following key found in {self.table_name} table: {document_key}"
+ f"No document with the following key found in {self.table_name} table: {document_key}",
)
logger.info("Skipping this object")
return None
if len(documents) > 1:
logger.warning(
- f"Multiple documents found for key {document_key}, using first one"
+ f"Multiple documents found for key {document_key}, using first one",
)
return documents[0]
except ClientError as e:
logger.error(
- f"Error fetching document reference for key {document_key}: {str(e)}"
+ f"Error fetching document reference for key {document_key}: {str(e)}",
)
raise DocumentServiceException(
- f"Failed to fetch document reference: {str(e)}"
+ f"Failed to fetch document reference: {str(e)}",
)
def _process_preliminary_document_reference(
@@ -146,20 +148,29 @@ def _process_preliminary_document_reference(
"""Process the preliminary (uploading) document reference with virus scanning and file operations"""
try:
virus_scan_result = self._perform_virus_scan(
- preliminary_document_reference, object_key
+ preliminary_document_reference, object_key,
)
- preliminary_document_reference.virus_scanner_result = virus_scan_result
if virus_scan_result == VirusScanResult.CLEAN:
- self._process_clean_document(
- preliminary_document_reference,
- object_key,
- )
+                file_type_extension = (
+                    preliminary_document_reference.file_name.split(".")[-1].lower()
+                )
+                if self.is_file_invalid(object_key, file_type_extension):
+                    logger.warning(
+                        f"Document {preliminary_document_reference.id} is password protected or corrupt, "
+                        f"marking as such in database",
+                    )
+                    virus_scan_result = VirusScanResult.INVALID
+                else:
+ self._process_clean_document(
+ preliminary_document_reference,
+ object_key,
+ )
else:
logger.warning(
- f"Document {preliminary_document_reference.id} failed virus scan"
+ f"Document {preliminary_document_reference.id} failed virus scan",
)
+ preliminary_document_reference.virus_scanner_result = virus_scan_result
preliminary_document_reference.file_size = object_size
preliminary_document_reference.uploaded = True
preliminary_document_reference.uploading = False
@@ -173,7 +184,7 @@ def _process_preliminary_document_reference(
and self.doc_type.code != SnomedCodes.PATIENT_DATA.value.code
):
self._finalize_and_supersede_with_transaction(
- preliminary_document_reference
+ preliminary_document_reference,
)
# Update NRL Pointer
@@ -184,7 +195,7 @@ def _process_preliminary_document_reference(
except Exception as e:
logger.error(
- f"Error processing document reference {preliminary_document_reference.id}: {str(e)}"
+ f"Error processing document reference {preliminary_document_reference.id}: {str(e)}",
)
raise
@@ -199,7 +210,7 @@ def _finalize_and_supersede_with_transaction(self, new_document: DocumentReferen
"""
try:
logger.info(
- f"Checking for existing final documents to supersede for NHS number {new_document.nhs_number}"
+ f"Checking for existing final documents to supersede for NHS number {new_document.nhs_number}",
)
existing_docs: list[DocumentReference] = (
@@ -243,7 +254,7 @@ def _finalize_and_supersede_with_transaction(self, new_document: DocumentReferen
# Supersede existing final documents
if existing_docs:
logger.info(
- f"Superseding {len(existing_docs)} existing final document(s) for NHS number {new_document.nhs_number}"
+ f"Superseding {len(existing_docs)} existing final document(s) for NHS number {new_document.nhs_number}",
)
for doc in existing_docs:
@@ -297,28 +308,28 @@ def _finalize_and_supersede_with_transaction(self, new_document: DocumentReferen
f" and superseded {len(existing_docs)} document(s)"
if existing_docs
else ""
- )
+ ),
)
except ClientError as e:
error_code = e.response.get("Error", {}).get("Code", "")
if error_code == "TransactionCanceledException":
logger.error(
- f"Transaction cancelled - concurrent update detected for NHS number {new_document.nhs_number}"
+ f"Transaction cancelled - concurrent update detected for NHS number {new_document.nhs_number}",
)
raise TransactionConflictException(
f"Concurrent update detected while finalizing document for NHS number {new_document.nhs_number}. "
- f"Another process may have already finalized a document for this patient."
+ f"Another process may have already finalized a document for this patient.",
)
raise
except Exception as e:
if isinstance(e, TransactionConflictException):
logger.error(
- f"Cancelling preliminary document {new_document.id} due to transaction conflict"
+ f"Cancelling preliminary document {new_document.id} due to transaction conflict",
)
else:
logger.error(
- f"Unexpected error while finalizing document for {new_document.nhs_number}: {e}"
+ f"Unexpected error while finalizing document for {new_document.nhs_number}: {e}",
)
new_document.doc_status = "cancelled"
@@ -327,47 +338,47 @@ def _finalize_and_supersede_with_transaction(self, new_document: DocumentReferen
new_document.file_size = None
self._update_dynamo_table(new_document)
self.delete_file_from_bucket(
- new_document.file_location, new_document.s3_version_id
+ new_document.file_location, new_document.s3_version_id,
)
def document_reference_key(self, document_id):
return {DocumentReferenceMetadataFields.ID.value: document_id}
def _perform_virus_scan(
- self, document_reference: DocumentReference, object_key: str
+ self, document_reference: DocumentReference, object_key: str,
) -> VirusScanResult:
"""Perform a virus scan on the document"""
try:
return self.virus_scan_service.scan_file(
- object_key, nhs_number=document_reference.nhs_number
+ object_key, nhs_number=document_reference.nhs_number,
)
except Exception as e:
logger.error(
- f"Virus scan failed for document {document_reference.id}: {str(e)}"
+ f"Virus scan failed for document {document_reference.id}: {str(e)}",
)
return VirusScanResult.ERROR
def _process_clean_document(
- self, document_reference: DocumentReference, object_key: str
+ self, document_reference: DocumentReference, object_key: str,
):
"""Process a document that passed virus scanning"""
try:
self.copy_files_from_staging_bucket(document_reference, object_key)
logger.info(
- f"Successfully processed clean document: {document_reference.id}"
+ f"Successfully processed clean document: {document_reference.id}",
)
except Exception as e:
logger.error(
- f"Error processing clean document {document_reference.id}: {str(e)}"
+ f"Error processing clean document {document_reference.id}: {str(e)}",
)
document_reference.doc_status = "cancelled"
raise FileProcessingException(f"Failed to process clean document: {str(e)}")
def copy_files_from_staging_bucket(
- self, document_reference: DocumentReference, source_file_key: str
+ self, document_reference: DocumentReference, source_file_key: str,
):
"""Copy files from staging bucket to destination bucket"""
try:
@@ -389,7 +400,7 @@ def copy_files_from_staging_bucket(
)
document_reference.s3_bucket_name = self.destination_bucket_name
document_reference.file_location = document_reference._build_s3_location(
- self.destination_bucket_name, dest_file_key
+ self.destination_bucket_name, dest_file_key,
)
document_reference.s3_version_id = copy_result.get("VersionId")
return copy_result
@@ -397,7 +408,7 @@ def copy_files_from_staging_bucket(
except ClientError as e:
logger.error(f"Error copying files from staging bucket: {str(e)}")
raise FileProcessingException(
- f"Failed to copy file from staging bucket: {str(e)}"
+ f"Failed to copy file from staging bucket: {str(e)}",
)
def delete_file_from_staging_bucket(self, source_file_key: str):
@@ -413,10 +424,10 @@ def delete_file_from_bucket(self, file_location: str, version_id: str):
"""Delete file from bucket"""
try:
s3_bucket_name, source_file_key = DocumentReference._parse_s3_location(
- file_location
+ file_location,
)
logger.info(
- f"Deleting file from bucket: {s3_bucket_name}/{source_file_key}"
+ f"Deleting file from bucket: {s3_bucket_name}/{source_file_key}",
)
self.s3_service.delete_object(s3_bucket_name, source_file_key, version_id)
@@ -457,5 +468,10 @@ def _update_dynamo_table(
except ClientError as e:
logger.error(f"Error updating DynamoDB table: {str(e)}")
raise DocumentServiceException(
- f"Failed to update document in database: {str(e)}"
+ f"Failed to update document in database: {str(e)}",
)
+
+ def is_file_invalid(self, object_key: str, file_type_extension: str) -> bool:
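+        """Read the uploaded object from the staging bucket and return True
+        if the file is password protected or corrupt."""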
+        entire_object = self.s3_service.get_object_stream(
+            self.staging_s3_bucket_name, object_key,
+        )
+ file_stream = io.BytesIO(entire_object.read())
+ return check_file_locked_or_corrupt(file_stream, file_type_extension)
diff --git a/lambdas/tests/unit/services/test_pdf_stitch_service.py b/lambdas/tests/unit/services/test_pdf_stitch_service.py
deleted file mode 100644
index 2634e1a1fb..0000000000
--- a/lambdas/tests/unit/services/test_pdf_stitch_service.py
+++ /dev/null
@@ -1,68 +0,0 @@
-import os
-import tempfile
-from io import BytesIO
-
-import pytest
-from pypdf import PdfWriter
-from pypdf.errors import PyPdfError
-from services.pdf_stitch_service import count_page_number, stitch_pdf
-
-
-def test_stitch_pdf():
- test_pdf_folder = "tests/unit/helpers/data/pdf/"
- input_test_files = [
- f"{test_pdf_folder}/{filename}"
- for filename in ["file1.pdf", "file2.pdf", "file3.pdf"]
- ]
-
- stitched_file = stitch_pdf(input_test_files)
- assert count_page_number(stitched_file) == sum(
- count_page_number(filepath) for filepath in input_test_files
- )
-
- os.remove(stitched_file)
-
-
-def test_stitch_pdf_with_given_desc_folder():
- test_pdf_folder = "tests/unit/helpers/data/pdf/"
- test_desc_folder = tempfile.mkdtemp()
-
- input_test_files = [
- f"{test_pdf_folder}/{filename}"
- for filename in ["file1.pdf", "file2.pdf", "file3.pdf"]
- ]
-
- stitched_file = stitch_pdf(input_test_files, test_desc_folder)
-
- assert stitched_file.startswith(test_desc_folder)
-
- os.remove(stitched_file)
-
-
-def test_stitch_pdf_raise_error_if_fail_to_perform_stitching():
- test_pdf_folder = "tests/unit/helpers/data/pdf/"
- input_test_files = [
- f"{test_pdf_folder}/{filename}" for filename in ["invalid_pdf.pdf", "file1.pdf"]
- ]
-
- with pytest.raises(PyPdfError):
- stitch_pdf(input_test_files)
-
-
-def test_stitch_pdf_raise_error_when_input_file_not_found():
- test_file = "non-exist-file.pdf"
-
- with pytest.raises(FileNotFoundError):
- stitch_pdf([test_file])
-
-
-def create_in_memory_pdf(page_count: int = 1) -> BytesIO:
- # Creates a PDF in memory with the received number of pages
- writer = PdfWriter()
- for _ in range(page_count):
- writer.add_blank_page(width=72, height=72)
-
- stream = BytesIO()
- writer.write(stream)
- stream.seek(0)
- return stream
diff --git a/lambdas/tests/unit/services/test_upload_document_reference_service.py b/lambdas/tests/unit/services/test_upload_document_reference_service.py
index e030d461fd..3d85e5df70 100644
--- a/lambdas/tests/unit/services/test_upload_document_reference_service.py
+++ b/lambdas/tests/unit/services/test_upload_document_reference_service.py
@@ -11,7 +11,7 @@
FinalOrPreliminaryAndNotSuperseded,
PreliminaryStatus,
)
-from utils.exceptions import DocumentServiceException, FileProcessingException, TransactionConflictException
+from utils.exceptions import DocumentServiceException, FileProcessingException
from utils.lambda_exceptions import InvalidDocTypeException
from lambdas.enums.snomed_codes import SnomedCodes
@@ -23,6 +23,7 @@ def mock_document_reference():
doc_ref = DocumentReference.model_construct()
doc_ref.id = "test-doc-id"
doc_ref.nhs_number = "9000000001"
+ doc_ref.file_name = "test-file.txt"
doc_ref.s3_file_key = "original/test-key"
doc_ref.s3_bucket_name = "original-bucket"
doc_ref.file_location = "original-location"
@@ -31,7 +32,7 @@ def mock_document_reference():
doc_ref.doc_status = "preliminary"
doc_ref.version = "1"
doc_ref._build_s3_location = Mock(
- return_value="s3://test-lg-bucket/9000000001/test-doc-id"
+ return_value="s3://test-lg-bucket/9000000001/test-doc-id",
)
return doc_ref
@@ -41,13 +42,13 @@ def mock_virus_scan_service(
mocker,
):
mock = mocker.patch(
- "services.upload_document_reference_service.get_virus_scan_service"
+ "services.upload_document_reference_service.get_virus_scan_service",
)
yield mock
@pytest.fixture
-def service(set_env, mock_virus_scan_service):
+def service(set_env, mock_virus_scan_service, mocker):
with patch.multiple(
"services.upload_document_reference_service",
DocumentService=Mock(),
@@ -59,6 +60,8 @@ def service(set_env, mock_virus_scan_service):
service.dynamo_service = Mock()
service.virus_scan_service = MockVirusScanService()
service.s3_service = Mock()
+    mocker.patch("io.BytesIO", return_value=None)
+    mocker.patch(
+        "services.upload_document_reference_service.check_file_locked_or_corrupt",
+        return_value=False,
+    )
return service
@@ -71,33 +74,34 @@ def test_handle_upload_document_reference_request_with_empty_object_key(service)
def test_handle_upload_document_reference_request_with_none_object_key(service):
"""Test handling of a None object key"""
     service.handle_upload_document_reference_request(None, 122)
service.document_service.fetch_documents_from_table.assert_not_called()
def test_handle_upload_document_reference_request_success(
- service, mock_document_reference, mocker
+ service, mock_document_reference, mocker,
):
- """Test successful handling of the upload document reference request"""
object_key = "staging/test-doc-id"
object_size = 1111
mock_document_reference2 = Mock(spec=DocumentReference)
+ mock_document_reference2.file_name = "filename2.txt"
mock_document_reference2.id = "another-doc-id"
mock_document_reference2.doc_status = "final"
mock_document_reference2.version = "1"
service.s3_service.copy_across_bucket.return_value = {
- "VersionId": "test-version-id"
+ "VersionId": "test-version-id",
}
-
# First call fetches preliminary doc, second call fetches existing final docs to supersede
service.document_service.fetch_documents_from_table.side_effect = [
[mock_document_reference],
[mock_document_reference2],
]
service.virus_scan_service.scan_file = mocker.MagicMock(
- return_value=VirusScanResult.CLEAN
+ return_value=VirusScanResult.CLEAN,
)
service.handle_upload_document_reference_request(object_key, object_size)
@@ -114,7 +118,7 @@ def test_handle_upload_document_reference_request_with_exception(service):
object_key = "staging/test-doc-id"
service.document_service.fetch_documents_from_table.side_effect = Exception(
- "Test error"
+ "Test error",
)
service.handle_upload_document_reference_request(object_key)
@@ -125,7 +129,7 @@ def test_fetch_preliminary_document_reference_success(service, mock_document_ref
document_key = "test-doc-id"
service.table_name = "dev_LloydGeorgeReferenceMetadata"
service.document_service.fetch_documents_from_table.return_value = [
- mock_document_reference
+ mock_document_reference,
]
result = service._fetch_preliminary_document_reference(document_key)
@@ -150,7 +154,7 @@ def test_fetch_preliminary_document_reference_no_documents_found(service):
def test_fetch_preliminary_document_reference_multiple_documents_warning(
- service, mock_document_reference
+ service, mock_document_reference,
):
"""Test handling when multiple documents are found"""
document_key = "test-doc-id"
@@ -177,23 +181,23 @@ def test_fetch_preliminary_document_reference_exception(service):
def test__process_preliminary_document_reference_clean_virus_scan(
- service, mock_document_reference, mocker
+ service, mock_document_reference, mocker,
):
"""Test processing document reference with a clean virus scan"""
object_key = "staging/test-doc-id"
mocker.patch.object(
- service, "_perform_virus_scan", return_value=VirusScanResult.CLEAN
+ service, "_perform_virus_scan", return_value=VirusScanResult.CLEAN,
)
mock_delete = mocker.patch.object(service, "delete_file_from_staging_bucket")
mock_process_clean = mocker.patch.object(service, "_process_clean_document")
mock_finalize_transaction = mocker.patch.object(
- service, "_finalize_and_supersede_with_transaction"
+ service, "_finalize_and_supersede_with_transaction",
)
service._process_preliminary_document_reference(
- mock_document_reference, object_key, 1222
+ mock_document_reference, object_key, 1222,
)
mock_process_clean.assert_called_once()
@@ -205,20 +209,20 @@ def test__process_preliminary_document_reference_clean_virus_scan(
def test__process_preliminary_document_reference_infected_virus_scan(
- service, mock_document_reference, mocker
+ service, mock_document_reference, mocker,
):
"""Test processing document reference with an infected virus scan"""
object_key = "staging/test-doc-id"
mocker.patch.object(
- service, "_perform_virus_scan", return_value=VirusScanResult.INFECTED
+ service, "_perform_virus_scan", return_value=VirusScanResult.INFECTED,
)
mock_delete = mocker.patch.object(service, "delete_file_from_staging_bucket")
mock_process_clean = mocker.patch.object(service, "_process_clean_document")
mock_update_dynamo = mocker.patch.object(service, "_update_dynamo_table")
service._process_preliminary_document_reference(
- mock_document_reference, object_key, 1222
+ mock_document_reference, object_key, 1222,
)
mock_process_clean.assert_not_called()
@@ -233,7 +237,7 @@ def test_perform_virus_scan_returns_clean_hardcoded(service, mock_document_refer
def test_perform_virus_scan_exception_returns_infected(
- service, mock_document_reference, mocker
+ service, mock_document_reference, mocker,
):
"""Test virus scan exception handling returns INFECTED for safety"""
mock_virus_service = mocker.patch.object(service, "virus_scan_service")
@@ -260,7 +264,7 @@ def test_process_clean_document_success(service, mock_document_reference, mocker
def test_process_clean_document_exception_restores_original_values(
- service, mock_document_reference, mocker
+ service, mock_document_reference, mocker,
):
"""Test that original values are restored when processing fails"""
object_key = "staging/test-doc-id"
@@ -269,7 +273,7 @@ def test_process_clean_document_exception_restores_original_values(
original_location = "original-location"
mocker.patch.object(
- service, "copy_files_from_staging_bucket", side_effect=Exception("Copy failed")
+ service, "copy_files_from_staging_bucket", side_effect=Exception("Copy failed"),
)
with pytest.raises(FileProcessingException):
service._process_clean_document(
@@ -306,7 +310,7 @@ def test_copy_files_from_staging_bucket_client_error(service, mock_document_refe
source_file_key = "staging/test-doc-id"
client_error = ClientError(
error_response={
- "Error": {"Code": "NoSuchBucket", "Message": "Bucket does not exist"}
+ "Error": {"Code": "NoSuchBucket", "Message": "Bucket does not exist"},
},
operation_name="CopyObject",
)
@@ -323,7 +327,7 @@ def test_delete_file_from_staging_bucket_success(service):
service.delete_file_from_staging_bucket(source_file_key)
service.s3_service.delete_object.assert_called_once_with(
- MOCK_STAGING_STORE_BUCKET, source_file_key
+ MOCK_STAGING_STORE_BUCKET, source_file_key,
)
@@ -336,7 +340,7 @@ def test_delete_pdm_file_from_staging_bucket_success(service):
service.delete_file_from_staging_bucket(source_file_key)
service.s3_service.delete_object.assert_called_once_with(
- MOCK_STAGING_STORE_BUCKET, source_file_key
+ MOCK_STAGING_STORE_BUCKET, source_file_key,
)
@@ -345,7 +349,7 @@ def test_delete_file_from_staging_bucket_client_error(service):
source_file_key = "staging/test-doc-id"
client_error = ClientError(
error_response={
- "Error": {"Code": "NoSuchKey", "Message": "Key does not exist"}
+ "Error": {"Code": "NoSuchKey", "Message": "Key does not exist"},
},
operation_name="DeleteObject",
)
@@ -389,7 +393,7 @@ def test_update_dynamo_table_client_error(service, mock_document_reference):
"""Test handling of ClientError during DynamoDB update"""
client_error = ClientError(
error_response={
- "Error": {"Code": "ResourceNotFoundException", "Message": "Table not found"}
+ "Error": {"Code": "ResourceNotFoundException", "Message": "Table not found"},
},
operation_name="UpdateItem",
)
@@ -418,7 +422,7 @@ def test_document_key_extraction_from_object_key_for_lg(
# First call returns preliminary doc, second call returns empty list (no existing finals)
service.s3_service.copy_across_bucket.return_value = {
- "VersionId": "test-version-id"
+ "VersionId": "test-version-id",
}
service.document_service.fetch_documents_from_table.side_effect = [
@@ -446,7 +450,7 @@ def test_document_key_extraction_from_object_key_for_lg(
def test_finalize_and_supersede_with_transaction_with_existing_finals(
- service, mock_document_reference, mocker
+ service, mock_document_reference,
):
"""Test transaction-based finalisation with existing final documents to supersede"""
new_doc = mock_document_reference
@@ -463,7 +467,7 @@ def test_finalize_and_supersede_with_transaction_with_existing_finals(
service.table_name = "dev_LloydGeorgeReferenceMetadata"
service.document_service.fetch_documents_from_table.return_value = [
- existing_final_doc
+ existing_final_doc,
]
mock_build_update = Mock(return_value={"Update": "transaction1"})
@@ -511,7 +515,7 @@ def test_finalize_and_supersede_with_transaction_with_existing_finals(
def test_finalize_and_supersede_with_transaction_no_existing_docs(
- service, mock_document_reference, mocker
+ service, mock_document_reference,
):
"""Test transaction-based finalization when no existing final documents found"""
new_doc = mock_document_reference
@@ -535,7 +539,7 @@ def test_finalize_and_supersede_with_transaction_no_existing_docs(
def test_finalize_and_supersede_with_transaction_multiple_existing(
- service, mock_document_reference, mocker
+ service, mock_document_reference,
):
"""Test transaction-based finalization superseding multiple existing final documents"""
new_doc = mock_document_reference
@@ -571,7 +575,7 @@ def test_finalize_and_supersede_with_transaction_multiple_existing(
def test_finalize_and_supersede_with_transaction_skips_same_id(
- service, mock_document_reference, mocker
+ service, mock_document_reference,
):
"""Test that transaction skips documents with the same ID"""
new_doc = mock_document_reference
@@ -599,7 +603,7 @@ def test_finalize_and_supersede_with_transaction_skips_same_id(
def test_finalize_and_supersede_with_transaction_handles_transaction_cancelled(
- service, mock_document_reference
+ service, mock_document_reference,
):
new_doc = mock_document_reference
@@ -639,21 +643,21 @@ def test_handle_upload_document_reference_request_no_document_found(service):
def test_process_preliminary_document_reference_exception_during_processing(
- service, mock_document_reference, mocker
+ service, mock_document_reference, mocker,
):
"""Test that exceptions during processing are properly raised"""
object_key = "staging/test-doc-id"
mocker.patch.object(
- service, "_perform_virus_scan", return_value=VirusScanResult.CLEAN
+ service, "_perform_virus_scan", return_value=VirusScanResult.CLEAN,
)
mocker.patch.object(
- service, "_process_clean_document", side_effect=Exception("Processing failed")
+ service, "_process_clean_document", side_effect=Exception("Processing failed"),
)
with pytest.raises(Exception) as exc_info:
service._process_preliminary_document_reference(
- mock_document_reference, object_key, 1222
+ mock_document_reference, object_key, 1222,
)
assert "Processing failed" in str(exc_info.value)
@@ -690,3 +694,32 @@ def test_get_infra_invalid_doc_type(monkeypatch, service):
# Call function and assert the exception is raised
with pytest.raises(InvalidDocTypeException):
service._get_infrastructure_for_document_key(["fhir_upload", "999999"])
+
+
+def test_is_file_invalid_calls_correct_functions(service, mocker):
+ """Test that is_file_invalid calls the right functions in the correct order"""
+ object_key = "test-folder/test-file.docx"
+ file_extension = "docx"
+ file_content = b"fake docx file content"
+
+ mock_stream = Mock()
+ mock_stream.read.return_value = file_content
+ service.s3_service.get_object_stream.return_value = mock_stream
+ mock_bytesio = mocker.patch("services.upload_document_reference_service.io.BytesIO")
+ mock_file_stream = Mock()
+ mock_bytesio.return_value = mock_file_stream
+ mock_check = mocker.patch(
+ "services.upload_document_reference_service.check_file_locked_or_corrupt",
+ return_value=True,
+ )
+
+ result = service.is_file_invalid(object_key, file_extension)
+
+ assert result is True
+ service.s3_service.get_object_stream.assert_called_once_with(
+ service.staging_s3_bucket_name, object_key,
+ )
+ mock_stream.read.assert_called_once_with()
+ mock_bytesio.assert_called_once_with(file_content)
+ mock_check.assert_called_once_with(mock_file_stream, file_extension)
diff --git a/lambdas/tests/unit/utils/test_file_utils.py b/lambdas/tests/unit/utils/test_file_utils.py
index 3a225f1841..6bda04566c 100644
--- a/lambdas/tests/unit/utils/test_file_utils.py
+++ b/lambdas/tests/unit/utils/test_file_utils.py
@@ -1,4 +1,11 @@
-from utils.file_utils import convert_csv_dictionary_to_bytes
+from io import BytesIO
+from unittest.mock import MagicMock, Mock, patch
+
+import pytest
+from utils.file_utils import (
+ check_file_locked_or_corrupt,
+ convert_csv_dictionary_to_bytes,
+)
def test_convert_csv_dictionary_to_bytes():
@@ -9,10 +16,128 @@ def test_convert_csv_dictionary_to_bytes():
]
result_bytes = convert_csv_dictionary_to_bytes(
- headers=headers, csv_dict_data=metadata_csv_data, encoding="utf-8"
+ headers=headers, csv_dict_data=metadata_csv_data, encoding="utf-8",
)
result_str = result_bytes.decode("utf-8")
expected_output = "id,name,age\r\n1,Alice,30\r\n2,Bob,25\r\n"
assert result_str == expected_output
+
+@pytest.mark.parametrize(
+ "file_extension,file_content,expected_result",
+ [
+ ("pdf", b"%PDF-1.4\nsome content", False),
+ ("zip", b"PK\x03\x04some zip content", False),
+ ],
+ ids=["pdf_file", "zip_file"],
+)
+def test_skipped_file_types(file_extension, file_content, expected_result):
+ file_stream = BytesIO(file_content)
+ result = check_file_locked_or_corrupt(file_stream, file_extension)
+ assert result == expected_result
+
+@pytest.mark.parametrize(
+ "file_extension,is_encrypted,expected_result",
+ [
+ ("docx", False, False),
+ ("docx", True, True),
+ ("xlsx", False, False),
+ ("xlsx", True, True),
+ ("pptx", False, False),
+ ("pptx", True, True),
+ ("doc", False, False),
+ ("doc", True, True),
+ ("xls", False, False),
+ ("xls", True, True),
+ ("ppt", False, False),
+ ("ppt", True, True),
+ ],
+)
+@patch("utils.file_utils.msoffcrypto.OfficeFile")
+def test_office_files(mock_office_file, file_extension, is_encrypted, expected_result):
+ mock_instance = Mock()
+ mock_instance.is_encrypted.return_value = is_encrypted
+ mock_office_file.return_value = mock_instance
+
+ file_stream = BytesIO(b"fake office content")
+ result = check_file_locked_or_corrupt(file_stream, file_extension)
+
+ assert result == expected_result
+ mock_office_file.assert_called_once_with(file_stream)
+ mock_instance.is_encrypted.assert_called_once()
+
+@pytest.mark.parametrize(
+ "file_extension,file_content,expected_result",
+ [
+ ("rtf", b"{\\rtf1\\ansi\\deff0 {\\fonttbl {\\f0 Times New Roman;}}\\f0\\fs60 Hello!}", False),
+ ("rtf", b"This is not an RTF file", True),
+ ("csv", b"name,age,city\nAlice,30,NYC\nBob,25,LA", False),
+ ("csv", b"\xff\xfe Invalid UTF-8", True),
+ ("json", b'{"key": "value", "number": 123}', False),
+ ("txt", b"This is a simple text file.\nWith multiple lines.", False),
+ ("txt", b"", False),
+        ("xml", b"<root><item>data</item></root>", False),
+ ],
+)
+def test_text_based_files(file_extension, file_content, expected_result):
+ file_stream = BytesIO(file_content)
+ result = check_file_locked_or_corrupt(file_stream, file_extension)
+ assert result == expected_result
+
+@pytest.mark.parametrize(
+ "file_extension",
+    ["jpg", "jpeg", "png", "tiff", "tif"],
+ ids=["jpg", "jpeg", "png", "tiff", "tif"],
+)
+@patch("utils.file_utils.Image.open")
+def test_image_files_valid(mock_image_open, file_extension):
+ mock_img = MagicMock()
+ mock_image_open.return_value.__enter__.return_value = mock_img
+
+ file_stream = BytesIO(b"fake image content")
+ result = check_file_locked_or_corrupt(file_stream, file_extension)
+
+ assert result is False
+ mock_image_open.assert_called_once_with(file_stream)
+ mock_img.verify.assert_called_once()
+
+@pytest.mark.parametrize(
+ "file_extension",
+ ["jpg", "png", "tiff"],
+)
+@patch("utils.file_utils.Image.open")
+def test_image_files_corrupt(mock_image_open, file_extension):
+ mock_image_open.side_effect = Exception("Corrupt image")
+
+ file_stream = BytesIO(b"corrupt image data")
+ result = check_file_locked_or_corrupt(file_stream, file_extension)
+
+ assert result is True
+
+@pytest.mark.parametrize(
+ "file_extension",
+ ["unknown", "mp4", "mp3", "avi", "mov"],
+ ids=["unknown", "mp4", "mp3", "avi", "mov"],
+)
+def test_unsupported_file_extensions(file_extension):
+ file_stream = BytesIO(b"some content")
+ result = check_file_locked_or_corrupt(file_stream, file_extension)
+ assert result is False
+
+@pytest.mark.parametrize(
+ "file_extension",
+ ["docx", "xlsx", "pptx", "doc", "xls"],
+ ids=["docx", "xlsx", "pptx", "doc", "xls"],
+)
+@patch("utils.file_utils.msoffcrypto.OfficeFile")
+def test_office_file_exception_returns_true(mock_office_file, file_extension):
+ mock_office_file.side_effect = Exception("Unable to process file")
+
+ file_stream = BytesIO(b"corrupt office content")
+ result = check_file_locked_or_corrupt(file_stream, file_extension)
+
+ assert result is True
diff --git a/lambdas/utils/file_utils.py b/lambdas/utils/file_utils.py
index ec24c67bc3..4623c3bfeb 100644
--- a/lambdas/utils/file_utils.py
+++ b/lambdas/utils/file_utils.py
@@ -1,9 +1,14 @@
import csv
from io import BytesIO, TextIOWrapper
+import msoffcrypto
+from PIL import Image
+from utils.audit_logging_setup import LoggingService
+
+logger = LoggingService(__name__)
def convert_csv_dictionary_to_bytes(
- headers: list[str], csv_dict_data: list[dict], encoding: str = "utf-8"
+ headers: list[str], csv_dict_data: list[dict], encoding: str = "utf-8",
) -> bytes:
csv_buffer = BytesIO()
csv_text_wrapper = TextIOWrapper(csv_buffer, encoding=encoding, newline="")
@@ -20,3 +25,39 @@ def convert_csv_dictionary_to_bytes(
csv_buffer.close()
return result
+
+
+def check_file_locked_or_corrupt(file_stream: BytesIO, ext: str) -> bool:
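+    """
+    Return True when the file appears password protected or corrupt, False otherwise.
+
+    PDFs and ZIPs are skipped (already covered by the antivirus scan), Office
+    formats are checked for encryption via msoffcrypto, text formats must decode
+    as UTF-8, and images are verified with Pillow. Unsupported extensions are
+    treated as valid.
+    """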
+ file_stream.seek(0)
+    text_exts = ["rtf", "csv", "json", "txt", "xml"]
+    media_exts = ["jpg", "jpeg", "png", "tiff", "tif"]
+ try:
+        if ext in ("pdf", "zip"):
+            # Skipping PDF and ZIP checks, as these are covered by the antivirus scan
+            logger.info(f"Skipping check for {ext} files")
+ return False
+
+        elif ext in ["docx", "xlsx", "pptx", "doc", "xls", "ppt"]:
+            office_file = msoffcrypto.OfficeFile(file_stream)
+            return office_file.is_encrypted()
+
+ elif ext in text_exts:
+            sample = file_stream.read(1024)
+            sample.decode("utf-8")
+            if ext == "rtf" and not sample.startswith(b"{\\rtf1"):
+ return True
+ return False
+
+ elif ext in media_exts:
+ with Image.open(file_stream) as img:
+ img.verify()
+ return False
+
+ else:
+            logger.info(
+                f"File with extension {ext} is not supported for locked/corrupt check, treating as valid.",
+            )
+ return False
+
+ except Exception as e:
+ logger.error(f"Error checking file validity for .{ext}: {type(e).__name__} - {str(e)}")
+ return True
\ No newline at end of file
diff --git a/poetry.lock b/poetry.lock
index c329268964..4bf25323d7 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -205,7 +205,7 @@ version = "2.0.0"
description = "Foreign Function Interface for Python calling C code."
optional = false
python-versions = ">=3.9"
-groups = ["core-lambda"]
+groups = ["core-lambda", "files-lambda"]
markers = "platform_python_implementation != \"PyPy\""
files = [
{file = "cffi-2.0.0-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:0cf2d91ecc3fcc0625c2c530fe004f82c110405f101548512cce44322fa8ac44"},
@@ -517,56 +517,74 @@ toml = ["tomli ; python_full_version <= \"3.11.0a6\""]
[[package]]
name = "cryptography"
-version = "44.0.1"
+version = "46.0.5"
description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
optional = false
-python-versions = "!=3.9.0,!=3.9.1,>=3.7"
-groups = ["core-lambda"]
-files = [
- {file = "cryptography-44.0.1-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:bf688f615c29bfe9dfc44312ca470989279f0e94bb9f631f85e3459af8efc009"},
- {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd7c7e2d71d908dc0f8d2027e1604102140d84b155e658c20e8ad1304317691f"},
- {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:887143b9ff6bad2b7570da75a7fe8bbf5f65276365ac259a5d2d5147a73775f2"},
- {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:322eb03ecc62784536bc173f1483e76747aafeb69c8728df48537eb431cd1911"},
- {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:21377472ca4ada2906bc313168c9dc7b1d7ca417b63c1c3011d0c74b7de9ae69"},
- {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:df978682c1504fc93b3209de21aeabf2375cb1571d4e61907b3e7a2540e83026"},
- {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:eb3889330f2a4a148abead555399ec9a32b13b7c8ba969b72d8e500eb7ef84cd"},
- {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:8e6a85a93d0642bd774460a86513c5d9d80b5c002ca9693e63f6e540f1815ed0"},
- {file = "cryptography-44.0.1-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:6f76fdd6fd048576a04c5210d53aa04ca34d2ed63336d4abd306d0cbe298fddf"},
- {file = "cryptography-44.0.1-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:6c8acf6f3d1f47acb2248ec3ea261171a671f3d9428e34ad0357148d492c7864"},
- {file = "cryptography-44.0.1-cp37-abi3-win32.whl", hash = "sha256:24979e9f2040c953a94bf3c6782e67795a4c260734e5264dceea65c8f4bae64a"},
- {file = "cryptography-44.0.1-cp37-abi3-win_amd64.whl", hash = "sha256:fd0ee90072861e276b0ff08bd627abec29e32a53b2be44e41dbcdf87cbee2b00"},
- {file = "cryptography-44.0.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:a2d8a7045e1ab9b9f803f0d9531ead85f90c5f2859e653b61497228b18452008"},
- {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b8272f257cf1cbd3f2e120f14c68bff2b6bdfcc157fafdee84a1b795efd72862"},
- {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1e8d181e90a777b63f3f0caa836844a1182f1f265687fac2115fcf245f5fbec3"},
- {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:436df4f203482f41aad60ed1813811ac4ab102765ecae7a2bbb1dbb66dcff5a7"},
- {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:4f422e8c6a28cf8b7f883eb790695d6d45b0c385a2583073f3cec434cc705e1a"},
- {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:72198e2b5925155497a5a3e8c216c7fb3e64c16ccee11f0e7da272fa93b35c4c"},
- {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:2a46a89ad3e6176223b632056f321bc7de36b9f9b93b2cc1cccf935a3849dc62"},
- {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:53f23339864b617a3dfc2b0ac8d5c432625c80014c25caac9082314e9de56f41"},
- {file = "cryptography-44.0.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:888fcc3fce0c888785a4876ca55f9f43787f4c5c1cc1e2e0da71ad481ff82c5b"},
- {file = "cryptography-44.0.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:00918d859aa4e57db8299607086f793fa7813ae2ff5a4637e318a25ef82730f7"},
- {file = "cryptography-44.0.1-cp39-abi3-win32.whl", hash = "sha256:9b336599e2cb77b1008cb2ac264b290803ec5e8e89d618a5e978ff5eb6f715d9"},
- {file = "cryptography-44.0.1-cp39-abi3-win_amd64.whl", hash = "sha256:e403f7f766ded778ecdb790da786b418a9f2394f36e8cc8b796cc056ab05f44f"},
- {file = "cryptography-44.0.1-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:1f9a92144fa0c877117e9748c74501bea842f93d21ee00b0cf922846d9d0b183"},
- {file = "cryptography-44.0.1-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:610a83540765a8d8ce0f351ce42e26e53e1f774a6efb71eb1b41eb01d01c3d12"},
- {file = "cryptography-44.0.1-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:5fed5cd6102bb4eb843e3315d2bf25fede494509bddadb81e03a859c1bc17b83"},
- {file = "cryptography-44.0.1-pp310-pypy310_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:f4daefc971c2d1f82f03097dc6f216744a6cd2ac0f04c68fb935ea2ba2a0d420"},
- {file = "cryptography-44.0.1-pp310-pypy310_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:94f99f2b943b354a5b6307d7e8d19f5c423a794462bde2bf310c770ba052b1c4"},
- {file = "cryptography-44.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:d9c5b9f698a83c8bd71e0f4d3f9f839ef244798e5ffe96febfa9714717db7af7"},
- {file = "cryptography-44.0.1.tar.gz", hash = "sha256:f51f5705ab27898afda1aaa430f34ad90dc117421057782022edf0600bec5f14"},
+python-versions = "!=3.9.0,!=3.9.1,>=3.8"
+groups = ["core-lambda", "files-lambda"]
+files = [
+ {file = "cryptography-46.0.5-cp311-abi3-macosx_10_9_universal2.whl", hash = "sha256:351695ada9ea9618b3500b490ad54c739860883df6c1f555e088eaf25b1bbaad"},
+ {file = "cryptography-46.0.5-cp311-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c18ff11e86df2e28854939acde2d003f7984f721eba450b56a200ad90eeb0e6b"},
+ {file = "cryptography-46.0.5-cp311-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4d7e3d356b8cd4ea5aff04f129d5f66ebdc7b6f8eae802b93739ed520c47c79b"},
+ {file = "cryptography-46.0.5-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:50bfb6925eff619c9c023b967d5b77a54e04256c4281b0e21336a130cd7fc263"},
+ {file = "cryptography-46.0.5-cp311-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:803812e111e75d1aa73690d2facc295eaefd4439be1023fefc4995eaea2af90d"},
+ {file = "cryptography-46.0.5-cp311-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3ee190460e2fbe447175cda91b88b84ae8322a104fc27766ad09428754a618ed"},
+ {file = "cryptography-46.0.5-cp311-abi3-manylinux_2_31_armv7l.whl", hash = "sha256:f145bba11b878005c496e93e257c1e88f154d278d2638e6450d17e0f31e558d2"},
+ {file = "cryptography-46.0.5-cp311-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:e9251e3be159d1020c4030bd2e5f84d6a43fe54b6c19c12f51cde9542a2817b2"},
+ {file = "cryptography-46.0.5-cp311-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:47fb8a66058b80e509c47118ef8a75d14c455e81ac369050f20ba0d23e77fee0"},
+ {file = "cryptography-46.0.5-cp311-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:4c3341037c136030cb46e4b1e17b7418ea4cbd9dd207e4a6f3b2b24e0d4ac731"},
+ {file = "cryptography-46.0.5-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:890bcb4abd5a2d3f852196437129eb3667d62630333aacc13dfd470fad3aaa82"},
+ {file = "cryptography-46.0.5-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:80a8d7bfdf38f87ca30a5391c0c9ce4ed2926918e017c29ddf643d0ed2778ea1"},
+ {file = "cryptography-46.0.5-cp311-abi3-win32.whl", hash = "sha256:60ee7e19e95104d4c03871d7d7dfb3d22ef8a9b9c6778c94e1c8fcc8365afd48"},
+ {file = "cryptography-46.0.5-cp311-abi3-win_amd64.whl", hash = "sha256:38946c54b16c885c72c4f59846be9743d699eee2b69b6988e0a00a01f46a61a4"},
+ {file = "cryptography-46.0.5-cp314-cp314t-macosx_10_9_universal2.whl", hash = "sha256:94a76daa32eb78d61339aff7952ea819b1734b46f73646a07decb40e5b3448e2"},
+ {file = "cryptography-46.0.5-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5be7bf2fb40769e05739dd0046e7b26f9d4670badc7b032d6ce4db64dddc0678"},
+ {file = "cryptography-46.0.5-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fe346b143ff9685e40192a4960938545c699054ba11d4f9029f94751e3f71d87"},
+ {file = "cryptography-46.0.5-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:c69fd885df7d089548a42d5ec05be26050ebcd2283d89b3d30676eb32ff87dee"},
+ {file = "cryptography-46.0.5-cp314-cp314t-manylinux_2_28_ppc64le.whl", hash = "sha256:8293f3dea7fc929ef7240796ba231413afa7b68ce38fd21da2995549f5961981"},
+ {file = "cryptography-46.0.5-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:1abfdb89b41c3be0365328a410baa9df3ff8a9110fb75e7b52e66803ddabc9a9"},
+ {file = "cryptography-46.0.5-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:d66e421495fdb797610a08f43b05269e0a5ea7f5e652a89bfd5a7d3c1dee3648"},
+ {file = "cryptography-46.0.5-cp314-cp314t-manylinux_2_34_aarch64.whl", hash = "sha256:4e817a8920bfbcff8940ecfd60f23d01836408242b30f1a708d93198393a80b4"},
+ {file = "cryptography-46.0.5-cp314-cp314t-manylinux_2_34_ppc64le.whl", hash = "sha256:68f68d13f2e1cb95163fa3b4db4bf9a159a418f5f6e7242564fc75fcae667fd0"},
+ {file = "cryptography-46.0.5-cp314-cp314t-manylinux_2_34_x86_64.whl", hash = "sha256:a3d1fae9863299076f05cb8a778c467578262fae09f9dc0ee9b12eb4268ce663"},
+ {file = "cryptography-46.0.5-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:c4143987a42a2397f2fc3b4d7e3a7d313fbe684f67ff443999e803dd75a76826"},
+ {file = "cryptography-46.0.5-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:7d731d4b107030987fd61a7f8ab512b25b53cef8f233a97379ede116f30eb67d"},
+ {file = "cryptography-46.0.5-cp314-cp314t-win32.whl", hash = "sha256:c3bcce8521d785d510b2aad26ae2c966092b7daa8f45dd8f44734a104dc0bc1a"},
+ {file = "cryptography-46.0.5-cp314-cp314t-win_amd64.whl", hash = "sha256:4d8ae8659ab18c65ced284993c2265910f6c9e650189d4e3f68445ef82a810e4"},
+ {file = "cryptography-46.0.5-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:4108d4c09fbbf2789d0c926eb4152ae1760d5a2d97612b92d508d96c861e4d31"},
+ {file = "cryptography-46.0.5-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7d1f30a86d2757199cb2d56e48cce14deddf1f9c95f1ef1b64ee91ea43fe2e18"},
+ {file = "cryptography-46.0.5-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:039917b0dc418bb9f6edce8a906572d69e74bd330b0b3fea4f79dab7f8ddd235"},
+ {file = "cryptography-46.0.5-cp38-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:ba2a27ff02f48193fc4daeadf8ad2590516fa3d0adeeb34336b96f7fa64c1e3a"},
+ {file = "cryptography-46.0.5-cp38-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:61aa400dce22cb001a98014f647dc21cda08f7915ceb95df0c9eaf84b4b6af76"},
+ {file = "cryptography-46.0.5-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3ce58ba46e1bc2aac4f7d9290223cead56743fa6ab94a5d53292ffaac6a91614"},
+ {file = "cryptography-46.0.5-cp38-abi3-manylinux_2_31_armv7l.whl", hash = "sha256:420d0e909050490d04359e7fdb5ed7e667ca5c3c402b809ae2563d7e66a92229"},
+ {file = "cryptography-46.0.5-cp38-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:582f5fcd2afa31622f317f80426a027f30dc792e9c80ffee87b993200ea115f1"},
+ {file = "cryptography-46.0.5-cp38-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:bfd56bb4b37ed4f330b82402f6f435845a5f5648edf1ad497da51a8452d5d62d"},
+ {file = "cryptography-46.0.5-cp38-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:a3d507bb6a513ca96ba84443226af944b0f7f47dcc9a399d110cd6146481d24c"},
+ {file = "cryptography-46.0.5-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:9f16fbdf4da055efb21c22d81b89f155f02ba420558db21288b3d0035bafd5f4"},
+ {file = "cryptography-46.0.5-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:ced80795227d70549a411a4ab66e8ce307899fad2220ce5ab2f296e687eacde9"},
+ {file = "cryptography-46.0.5-cp38-abi3-win32.whl", hash = "sha256:02f547fce831f5096c9a567fd41bc12ca8f11df260959ecc7c3202555cc47a72"},
+ {file = "cryptography-46.0.5-cp38-abi3-win_amd64.whl", hash = "sha256:556e106ee01aa13484ce9b0239bca667be5004efb0aabbed28d353df86445595"},
+ {file = "cryptography-46.0.5-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:3b4995dc971c9fb83c25aa44cf45f02ba86f71ee600d81091c2f0cbae116b06c"},
+ {file = "cryptography-46.0.5-pp311-pypy311_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:bc84e875994c3b445871ea7181d424588171efec3e185dced958dad9e001950a"},
+ {file = "cryptography-46.0.5-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:2ae6971afd6246710480e3f15824ed3029a60fc16991db250034efd0b9fb4356"},
+ {file = "cryptography-46.0.5-pp311-pypy311_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:d861ee9e76ace6cf36a6a89b959ec08e7bc2493ee39d07ffe5acb23ef46d27da"},
+ {file = "cryptography-46.0.5-pp311-pypy311_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:2b7a67c9cd56372f3249b39699f2ad479f6991e62ea15800973b956f4b73e257"},
+ {file = "cryptography-46.0.5-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:8456928655f856c6e1533ff59d5be76578a7157224dbd9ce6872f25055ab9ab7"},
+ {file = "cryptography-46.0.5.tar.gz", hash = "sha256:abace499247268e3757271b2f1e244b36b06f8515cf27c4d49468fc9eb16e93d"},
]
[package.dependencies]
-cffi = {version = ">=1.12", markers = "platform_python_implementation != \"PyPy\""}
+cffi = {version = ">=2.0.0", markers = "python_full_version >= \"3.9.0\" and platform_python_implementation != \"PyPy\""}
[package.extras]
-docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=3.0.0) ; python_version >= \"3.8\""]
+docs = ["sphinx (>=5.3.0)", "sphinx-inline-tabs", "sphinx-rtd-theme (>=3.0.0)"]
docstest = ["pyenchant (>=3)", "readme-renderer (>=30.0)", "sphinxcontrib-spelling (>=7.3.1)"]
-nox = ["nox (>=2024.4.15)", "nox[uv] (>=2024.3.2) ; python_version >= \"3.8\""]
-pep8test = ["check-sdist ; python_version >= \"3.8\"", "click (>=8.0.1)", "mypy (>=1.4)", "ruff (>=0.3.6)"]
+nox = ["nox[uv] (>=2024.4.15)"]
+pep8test = ["check-sdist", "click (>=8.0.1)", "mypy (>=1.14)", "ruff (>=0.11.11)"]
sdist = ["build (>=1.0.0)"]
ssh = ["bcrypt (>=3.1.5)"]
-test = ["certifi (>=2024)", "cryptography-vectors (==44.0.1)", "pretend (>=0.7)", "pytest (>=7.4.0)", "pytest-benchmark (>=4.0)", "pytest-cov (>=2.10.1)", "pytest-xdist (>=3.5.0)"]
+test = ["certifi (>=2024)", "cryptography-vectors (==46.0.5)", "pretend (>=0.7)", "pytest (>=7.4.0)", "pytest-benchmark (>=4.0)", "pytest-cov (>=2.10.1)", "pytest-xdist (>=3.5.0)"]
test-randomorder = ["pytest-randomly"]
[[package]]
@@ -1190,6 +1208,22 @@ files = [
{file = "msgpack-1.1.2.tar.gz", hash = "sha256:3b60763c1373dd60f398488069bcdc703cd08a711477b5d480eecc9f9626f47e"},
]
+[[package]]
+name = "msoffcrypto-tool"
+version = "6.0.0"
+description = "Python tool and library for decrypting and encrypting MS Office files using a password or other keys"
+optional = false
+python-versions = "<4.0,>=3.10"
+groups = ["files-lambda"]
+files = [
+ {file = "msoffcrypto_tool-6.0.0-py3-none-any.whl", hash = "sha256:46c394ed5d9641e802fc79bf3fb0666a53748b23fa8c4aa634ae9d30d46fe397"},
+ {file = "msoffcrypto_tool-6.0.0.tar.gz", hash = "sha256:9a5ebc4c0096b42e5d7ebc2350afdc92dc511061e935ca188468094fdd032bbe"},
+]
+
+[package.dependencies]
+cryptography = ">=39.0"
+olefile = ">=0.46"
+
[[package]]
name = "mypy-extensions"
version = "1.1.0"
@@ -1219,6 +1253,21 @@ rsa = ["cryptography (>=3.0.0)"]
signals = ["blinker (>=1.4.0)"]
signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"]
+[[package]]
+name = "olefile"
+version = "0.47"
+description = "Python package to parse, read and write Microsoft OLE2 files (Structured Storage or Compound Document, Microsoft Office)"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
+groups = ["files-lambda"]
+files = [
+ {file = "olefile-0.47-py2.py3-none-any.whl", hash = "sha256:543c7da2a7adadf21214938bb79c83ea12b473a4b6ee4ad4bf854e7715e13d1f"},
+ {file = "olefile-0.47.zip", hash = "sha256:599383381a0bf3dfbd932ca0ca6515acd174ed48870cbf7fee123d698c192c1c"},
+]
+
+[package.extras]
+tests = ["pytest", "pytest-cov"]
+
[[package]]
name = "openpyxl"
version = "3.1.5"
@@ -1664,7 +1713,7 @@ version = "2.23"
description = "C parser in Python"
optional = false
python-versions = ">=3.8"
-groups = ["core-lambda"]
+groups = ["core-lambda", "files-lambda"]
markers = "platform_python_implementation != \"PyPy\" and implementation_name != \"PyPy\""
files = [
{file = "pycparser-2.23-py3-none-any.whl", hash = "sha256:e5c6e8d3fbad53479cab09ac03729e0a9faf2bee3db8208a550daf5af81a5934"},
@@ -2450,4 +2499,4 @@ requests = "*"
[metadata]
lock-version = "2.1"
python-versions = "^3.11"
-content-hash = "77d0249d2dd6c9fbb02b0e434ade72ac221f4df82e8d98ec49250fe0b7ff74df"
+content-hash = "9d5fc7af2841c90590cd173e32abcf400d064cd303e49a5b2ac33964170a0e9b"
diff --git a/pyproject.toml b/pyproject.toml
index 4abaa1617d..fe89cf8388 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -51,6 +51,9 @@ pikepdf = "8.4.0"
[tool.poetry.group.data_lambda.dependencies]
polars = "1.31.0"
+[tool.poetry.group.files_lambda.dependencies]
+msoffcrypto-tool = "6.0.0"
+
[tool.poetry.group.reports_lambda.dependencies]
openpyxl = "^3.1.5"
reportlab = "^4.3.1"
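The new files_lambda dependency group exists to pull msoffcrypto-tool (plus its olefile and cryptography dependencies) into the files lambda layer for the password-protection check introduced in this series. A minimal sketch of the check the library enables, using only its documented OfficeFile/is_encrypted API:

    import io

    import msoffcrypto

    def office_file_is_encrypted(file_bytes: bytes) -> bool:
        """Return True when an MS Office file is password protected.

        OfficeFile sniffs the container format from the stream;
        is_encrypted() reports whether a password is needed to read it.
        A stream that is not a valid Office container raises instead.
        """
        return msoffcrypto.OfficeFile(io.BytesIO(file_bytes)).is_encrypted()
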
From 8e33f200f5913e29dc127d367a2c2fef23f0e506 Mon Sep 17 00:00:00 2001
From: NogaNHS <127490765+NogaNHS@users.noreply.github.com>
Date: Wed, 11 Feb 2026 11:01:46 +0000
Subject: [PATCH 02/13] Fix virus scan result constant to use uppercase
enum member
---
lambdas/requirements/requirements_test.txt | 2 +-
lambdas/services/upload_document_reference_service.py | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/lambdas/requirements/requirements_test.txt b/lambdas/requirements/requirements_test.txt
index ac64194651..2a3ae5bf28 100644
--- a/lambdas/requirements/requirements_test.txt
+++ b/lambdas/requirements/requirements_test.txt
@@ -9,4 +9,4 @@ pytest-unordered==0.6.0
pytest==7.4.3
requests_mock==1.11.0
ruff==0.12.8
-syrupy==4.9.1
\ No newline at end of file
+syrupy==4.9.1
diff --git a/lambdas/services/upload_document_reference_service.py b/lambdas/services/upload_document_reference_service.py
index eaded11355..bccbcf91c6 100644
--- a/lambdas/services/upload_document_reference_service.py
+++ b/lambdas/services/upload_document_reference_service.py
@@ -159,7 +159,7 @@ def _process_preliminary_document_reference(
f"Document {preliminary_document_reference.id} is password protected or corrupt, "
f"marking as such in database",
)
- virus_scan_result = VirusScanResult.Invalid
+ virus_scan_result = VirusScanResult.INVALID
else:
self._process_clean_document(
preliminary_document_reference,
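The fix matters because Enum member access is attribute lookup: VirusScanResult.Invalid raises AttributeError at runtime when the member is declared as INVALID. A minimal sketch, assuming VirusScanResult is a str-valued Enum whose members follow the uppercase convention seen elsewhere in this series (CLEAN, INFECTED, ERROR); the string values below are illustrative only:

    from enum import Enum

    class VirusScanResult(str, Enum):
        CLEAN = "Clean"        # values are placeholders
        INFECTED = "Infected"
        ERROR = "Error"
        INVALID = "Invalid"    # the member added for this feature

    VirusScanResult.INVALID    # OK
    # VirusScanResult.Invalid  -> AttributeError: Invalid
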
From 04a44db8a181aff7c32db3ef9494c82f54d8d9ef Mon Sep 17 00:00:00 2001
From: adamwhitingnhs
Date: Wed, 11 Feb 2026 11:33:40 +0000
Subject: [PATCH 03/13] fix changed-files queries in the lint and format steps
---
.github/workflows/automated-pr-validator.yml | 2 +-
Makefile | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/.github/workflows/automated-pr-validator.yml b/.github/workflows/automated-pr-validator.yml
index a290624edd..2fc24de896 100644
--- a/.github/workflows/automated-pr-validator.yml
+++ b/.github/workflows/automated-pr-validator.yml
@@ -218,7 +218,7 @@ jobs:
id: changed-files
run: |
git remote set-branches origin main && git fetch --depth 1 origin main && git branch main origin/main
- echo "CHANGED_FILES=$(git diff main --name-only | grep '.py$' | tr '\n' ' ')" >> $GITHUB_OUTPUT
+ echo "CHANGED_FILES=$(git diff main --name-status | grep -E '^[^D].*\.py$' | tr '\n' ' ')" >> $GITHUB_OUTPUT
- name: Run black
id: black
diff --git a/Makefile b/Makefile
index 704f970661..a08c2187bb 100644
--- a/Makefile
+++ b/Makefile
@@ -87,7 +87,7 @@ format:
@if [ $(FORMAT_ALL) = true ]; then \
CHANGED_FILES=''; \
else \
- CHANGED_FILES=$$(git diff main --name-only | grep '.py$$' | xargs); \
+ CHANGED_FILES=$$(git diff main --name-status | grep -E '^[^D].*\.py$$' | xargs); \
echo $$CHANGED_FILES; \
if [ -z "$$CHANGED_FILES" ]; then echo "No changed files to format"; exit 0; fi; \
fi; \
From 0bce0f0e3f0a3ea93d1b62c031af9af9efee5233 Mon Sep 17 00:00:00 2001
From: NogaNHS <127490765+NogaNHS@users.noreply.github.com>
Date: Wed, 11 Feb 2026 11:47:30 +0000
Subject: [PATCH 04/13] Refactor virus scan result handling to check for file
name presence before processing
---
.../upload_document_reference_service.py | 17 +++++++++--------
1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/lambdas/services/upload_document_reference_service.py b/lambdas/services/upload_document_reference_service.py
index bccbcf91c6..762133df50 100644
--- a/lambdas/services/upload_document_reference_service.py
+++ b/lambdas/services/upload_document_reference_service.py
@@ -152,14 +152,15 @@ def _process_preliminary_document_reference(
)
if virus_scan_result == VirusScanResult.CLEAN:
- file_type_extension = preliminary_document_reference.file_name.split('.')[-1].lower()
- is_file_protected = self.is_file_invalid(object_key, file_type_extension)
- if is_file_protected:
- logger.warning(
- f"Document {preliminary_document_reference.id} is password protected or corrupt, "
- f"marking as such in database",
- )
- virus_scan_result = VirusScanResult.INVALID
+ if getattr(preliminary_document_reference, "file_name", None):
+ file_type_extension = preliminary_document_reference.file_name.split('.')[-1].lower()
+ is_file_protected = self.is_file_invalid(object_key, file_type_extension)
+ if is_file_protected:
+ logger.warning(
+ f"Document {preliminary_document_reference.id} is password protected or corrupt, "
+ f"marking as such in database",
+ )
+ virus_scan_result = VirusScanResult.INVALID
else:
self._process_clean_document(
preliminary_document_reference,
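The getattr guard protects against document references persisted without a file_name, where the previous code would have raised on None.split. Worth noting alongside it: str.split(".")[-1] returns the whole name when there is no dot, so an extensionless file yields itself as its "extension" and falls through to the unsupported-extension branch of the check rather than erroring. A standalone sketch of the lookup's edge cases:

    def file_extension(file_name: str | None) -> str | None:
        """Mirror the guarded lookup: lowercased text after the last dot."""
        if not file_name:
            return None
        return file_name.split(".")[-1].lower()

    assert file_extension("Report.DOCX") == "docx"
    assert file_extension("archive.tar.gz") == "gz"
    assert file_extension("no_extension") == "no_extension"  # no dot
    assert file_extension(None) is None
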
From e8aa8e97d50d113eb01cc36249a3b0e571d09c9a Mon Sep 17 00:00:00 2001
From: adamwhitingnhs
Date: Wed, 11 Feb 2026 12:26:31 +0000
Subject: [PATCH 05/13] fix format ordering
---
.github/workflows/automated-pr-validator.yml | 2 +-
.lintstagedrc | 2 +-
Makefile | 4 ++--
3 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/.github/workflows/automated-pr-validator.yml b/.github/workflows/automated-pr-validator.yml
index 2fc24de896..ba656d5869 100644
--- a/.github/workflows/automated-pr-validator.yml
+++ b/.github/workflows/automated-pr-validator.yml
@@ -218,7 +218,7 @@ jobs:
id: changed-files
run: |
git remote set-branches origin main && git fetch --depth 1 origin main && git branch main origin/main
- echo "CHANGED_FILES=$(git diff main --name-status | grep -E '^[^D].*\.py$' | tr '\n' ' ')" >> $GITHUB_OUTPUT
+ echo "CHANGED_FILES=$(git diff main --name-status | grep -E '^[^D].*\.py$' | cut -f2 | tr '\n' ' ')" >> $GITHUB_OUTPUT
- name: Run black
id: black
diff --git a/.lintstagedrc b/.lintstagedrc
index 5ac098b07a..0cfe966e6a 100644
--- a/.lintstagedrc
+++ b/.lintstagedrc
@@ -7,8 +7,8 @@
"./app/node_modules/prettier/bin/prettier.cjs --write"
],
"*.py": [
+ "./lambdas/venv/bin/ruff check --fix",
"./lambdas/venv/bin/python3 -m black",
- "./lambdas/venv/bin/ruff check ./lambdas",
"./lambdas/venv/bin/python3 -m isort --profile black",
]
}
\ No newline at end of file
diff --git a/Makefile b/Makefile
index a08c2187bb..7dd4df5c7c 100644
--- a/Makefile
+++ b/Makefile
@@ -87,12 +87,12 @@ format:
@if [ $(FORMAT_ALL) = true ]; then \
CHANGED_FILES=''; \
else \
- CHANGED_FILES=$$(git diff main --name-status | grep -E '^[^D].*\.py$$' | xargs); \
+ CHANGED_FILES=$$(git diff main --name-status | grep -E '^[^D].*\.py$$' | cut -f2 | xargs); \
echo $$CHANGED_FILES; \
if [ -z "$$CHANGED_FILES" ]; then echo "No changed files to format"; exit 0; fi; \
fi; \
- $(VENV_PATH_PREFIX)/bin/python3 -m black $$CHANGED_FILES; \
$(VENV_PATH_PREFIX)/bin/ruff check $$CHANGED_FILES --fix; \
+ $(VENV_PATH_PREFIX)/bin/python3 -m black $$CHANGED_FILES; \
$(VENV_PATH_PREFIX)/bin/python3 -m isort --profile black $$CHANGED_FILES
sort-requirements:
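The cut -f2 added here completes patch 03's switch to --name-status: git emits a status letter, a tab, then the path (e.g. "M<TAB>lambdas/utils/file_utils.py"), so without the cut the status column was being fed to black and ruff along with the path. A small Python sketch of the same filter, shown only to pin down the line format (rename entries, which carry two paths, are ignored here):

    def changed_python_files(name_status_output: str) -> list[str]:
        """Paths of non-deleted .py files from `git diff --name-status`."""
        files = []
        for line in name_status_output.splitlines():
            status, _, path = line.partition("\t")
            if status and not status.startswith("D") and path.endswith(".py"):
                files.append(path)
        return files

    sample = "M\tlambdas/utils/file_utils.py\nD\tlambdas/services/pdf_stitch_service.py"
    assert changed_python_files(sample) == ["lambdas/utils/file_utils.py"]
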
From bd7eee79b37bd836fd9ae302f364ce7258b2959b Mon Sep 17 00:00:00 2001
From: NogaNHS <127490765+NogaNHS@users.noreply.github.com>
Date: Wed, 11 Feb 2026 13:02:55 +0000
Subject: [PATCH 06/13] Refactor virus scan result handling to improve file
protection checks
---
.../services/upload_document_reference_service.py | 13 +++++++------
1 file changed, 7 insertions(+), 6 deletions(-)
diff --git a/lambdas/services/upload_document_reference_service.py b/lambdas/services/upload_document_reference_service.py
index 762133df50..f869b26eb1 100644
--- a/lambdas/services/upload_document_reference_service.py
+++ b/lambdas/services/upload_document_reference_service.py
@@ -152,15 +152,16 @@ def _process_preliminary_document_reference(
)
if virus_scan_result == VirusScanResult.CLEAN:
+ is_file_protected = False
if getattr(preliminary_document_reference, "file_name", None):
file_type_extension = preliminary_document_reference.file_name.split('.')[-1].lower()
is_file_protected = self.is_file_invalid(object_key, file_type_extension)
- if is_file_protected:
- logger.warning(
- f"Document {preliminary_document_reference.id} is password protected or corrupt, "
- f"marking as such in database",
- )
- virus_scan_result = VirusScanResult.INVALID
+ if is_file_protected:
+ logger.warning(
+ f"Document {preliminary_document_reference.id} is password protected or corrupt, "
+ f"marking as such in database",
+ )
+ virus_scan_result = VirusScanResult.INVALID
else:
self._process_clean_document(
preliminary_document_reference,
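After this refactor the CLEAN path reads top to bottom: default to not-protected, run the content check only when a file name (and therefore an extension) is available, then downgrade the scan result to INVALID if the check trips. A condensed sketch of the decision, with string stand-ins for the enum values and a plain callable standing in for the S3-backed is_file_invalid above:

    def final_scan_result(scan_result: str, file_name: str | None, is_file_invalid) -> str:
        """Downgrade CLEAN to INVALID when the locked/corrupt check trips."""
        if scan_result != "Clean":
            return scan_result
        is_file_protected = False
        if file_name:
            ext = file_name.split(".")[-1].lower()
            is_file_protected = is_file_invalid(ext)
        return "Invalid" if is_file_protected else "Clean"

    assert final_scan_result("Clean", "notes.docx", lambda ext: True) == "Invalid"
    assert final_scan_result("Clean", None, lambda ext: True) == "Clean"
    assert final_scan_result("Infected", "notes.docx", lambda ext: True) == "Infected"
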
From 24dd1a5c691c76d3782e2a8d38d9051252e09f34 Mon Sep 17 00:00:00 2001
From: NogaNHS <127490765+NogaNHS@users.noreply.github.com>
Date: Wed, 11 Feb 2026 13:04:34 +0000
Subject: [PATCH 07/13] format
---
lambdas/enums/lambda_error.py | 8 +-
lambdas/services/base/s3_service.py | 35 ++++--
.../services/get_document_upload_status.py | 7 +-
.../upload_document_reference_service.py | 55 ++++++---
.../test_upload_document_reference_service.py | 106 +++++++++++++-----
lambdas/tests/unit/utils/test_file_utils.py | 28 ++++-
lambdas/utils/file_utils.py | 27 +++--
7 files changed, 193 insertions(+), 73 deletions(-)
diff --git a/lambdas/enums/lambda_error.py b/lambdas/enums/lambda_error.py
index 95b60ccdfb..d5c2a62e5f 100644
--- a/lambdas/enums/lambda_error.py
+++ b/lambdas/enums/lambda_error.py
@@ -43,7 +43,9 @@ def create_error_response(
return error_response
def to_str(
- self, params: Optional[dict] = None, details: Optional[str] = None,
+ self,
+ params: Optional[dict] = None,
+ details: Optional[str] = None,
) -> str:
message = self.value["message"]
if "%" in message and params:
@@ -59,7 +61,9 @@ def create_error_body(
**kwargs,
) -> str:
return self.create_error_response(
- params=params, details=details, **kwargs,
+ params=params,
+ details=details,
+ **kwargs,
).create()
"""
diff --git a/lambdas/services/base/s3_service.py b/lambdas/services/base/s3_service.py
index c9de2a9bea..1da0a82f68 100644
--- a/lambdas/services/base/s3_service.py
+++ b/lambdas/services/base/s3_service.py
@@ -40,7 +40,9 @@ def __init__(self, custom_aws_role=None):
if custom_aws_role:
self.iam_service = IAMService()
self.custom_client, self.expiration_time = self.iam_service.assume_role(
- self.custom_aws_role, "s3", config=self.config,
+ self.custom_aws_role,
+ "s3",
+ config=self.config,
)
# S3 Location should be a minimum of a s3_object_key but can also be a directory location in the form of
@@ -52,7 +54,9 @@ def create_upload_presigned_url(self, s3_bucket_name: str, s3_object_location: s
):
logger.info(S3Service.EXPIRED_SESSION_WARNING)
self.custom_client, self.expiration_time = self.iam_service.assume_role(
- self.custom_aws_role, "s3", config=self.config,
+ self.custom_aws_role,
+ "s3",
+ config=self.config,
)
return self.custom_client.generate_presigned_post(
s3_bucket_name,
@@ -69,7 +73,9 @@ def create_put_presigned_url(self, s3_bucket_name: str, file_key: str):
):
logger.info(S3Service.EXPIRED_SESSION_WARNING)
self.custom_client, self.expiration_time = self.iam_service.assume_role(
- self.custom_aws_role, "s3", config=self.config,
+ self.custom_aws_role,
+ "s3",
+ config=self.config,
)
logger.info("Generating presigned URL")
return self.custom_client.generate_presigned_url(
@@ -86,7 +92,9 @@ def create_download_presigned_url(self, s3_bucket_name: str, file_key: str):
):
logger.info(S3Service.EXPIRED_SESSION_WARNING)
self.custom_client, self.expiration_time = self.iam_service.assume_role(
- self.custom_aws_role, "s3", config=self.config,
+ self.custom_aws_role,
+ "s3",
+ config=self.config,
)
logger.info("Generating presigned URL")
return self.custom_client.generate_presigned_url(
@@ -150,17 +158,26 @@ def copy_across_bucket(
raise e
def delete_object(
- self, s3_bucket_name: str, file_key: str, version_id: str | None = None,
+ self,
+ s3_bucket_name: str,
+ file_key: str,
+ version_id: str | None = None,
):
if version_id is None:
return self.client.delete_object(Bucket=s3_bucket_name, Key=file_key)
return self.client.delete_object(
- Bucket=s3_bucket_name, Key=file_key, VersionId=version_id,
+ Bucket=s3_bucket_name,
+ Key=file_key,
+ VersionId=version_id,
)
def create_object_tag(
- self, s3_bucket_name: str, file_key: str, tag_key: str, tag_value: str,
+ self,
+ s3_bucket_name: str,
+ file_key: str,
+ tag_key: str,
+ tag_value: str,
):
return self.client.put_object_tagging(
Bucket=s3_bucket_name,
@@ -256,5 +273,7 @@ def upload_file_obj(
def save_or_create_file(self, source_bucket: str, file_key: str, body: bytes):
return self.client.put_object(
- Bucket=source_bucket, Key=file_key, Body=BytesIO(body),
+ Bucket=source_bucket,
+ Key=file_key,
+ Body=BytesIO(body),
)
diff --git a/lambdas/services/get_document_upload_status.py b/lambdas/services/get_document_upload_status.py
index f6c67c6b67..7c06d6fac9 100644
--- a/lambdas/services/get_document_upload_status.py
+++ b/lambdas/services/get_document_upload_status.py
@@ -31,7 +31,9 @@ def _determine_document_status(self, doc_ref, nhs_number):
return doc_ref.doc_status, None
def get_document_references_by_id(
- self, nhs_number: str, document_ids: list[str],
+ self,
+ nhs_number: str,
+ document_ids: list[str],
) -> dict:
"""
Checks the status of a list of documents for a given patient.
@@ -44,7 +46,8 @@ def get_document_references_by_id(
A dictionary with a list of document IDs and their corresponding statuses.
"""
found_docs = self.document_service.get_batch_document_references_by_id(
- document_ids, SupportedDocumentTypes.LG,
+ document_ids,
+ SupportedDocumentTypes.LG,
)
found_docs_by_id = {doc.id: doc for doc in found_docs}
results = {}
diff --git a/lambdas/services/upload_document_reference_service.py b/lambdas/services/upload_document_reference_service.py
index f869b26eb1..56804f28b5 100644
--- a/lambdas/services/upload_document_reference_service.py
+++ b/lambdas/services/upload_document_reference_service.py
@@ -44,7 +44,9 @@ def __init__(self):
self.bucket_router = DocTypeS3BucketRouter()
def handle_upload_document_reference_request(
- self, object_key: str, object_size: int = 0,
+ self,
+ object_key: str,
+ object_size: int = 0,
):
"""Handle the upload document reference request with comprehensive error handling"""
if not object_key:
@@ -60,13 +62,16 @@ def handle_upload_document_reference_request(
self._get_infrastructure_for_document_key(object_parts)
preliminary_document_reference = self._fetch_preliminary_document_reference(
- document_key, nhs_number,
+ document_key,
+ nhs_number,
)
if not preliminary_document_reference:
return
self._process_preliminary_document_reference(
- preliminary_document_reference, object_key, object_size,
+ preliminary_document_reference,
+ object_key,
+ object_size,
)
except Exception as e:
@@ -93,7 +98,9 @@ def _get_infrastructure_for_document_key(self, object_parts: list[str]) -> None:
raise InvalidDocTypeException(400, LambdaError.DocTypeDB)
def _fetch_preliminary_document_reference(
- self, document_key: str, nhs_number: str | None = None,
+ self,
+ document_key: str,
+ nhs_number: str | None = None,
) -> Optional[DocumentReference]:
"""Fetch document reference from the database"""
try:
@@ -148,14 +155,19 @@ def _process_preliminary_document_reference(
"""Process the preliminary (uploading) document reference with virus scanning and file operations"""
try:
virus_scan_result = self._perform_virus_scan(
- preliminary_document_reference, object_key,
+ preliminary_document_reference,
+ object_key,
)
if virus_scan_result == VirusScanResult.CLEAN:
is_file_protected = False
if getattr(preliminary_document_reference, "file_name", None):
- file_type_extension = preliminary_document_reference.file_name.split('.')[-1].lower()
- is_file_protected = self.is_file_invalid(object_key, file_type_extension)
+ file_type_extension = (
+ preliminary_document_reference.file_name.split(".")[-1].lower()
+ )
+ is_file_protected = self.is_file_invalid(
+ object_key, file_type_extension
+ )
if is_file_protected:
logger.warning(
f"Document {preliminary_document_reference.id} is password protected or corrupt, "
@@ -333,26 +345,30 @@ def _finalize_and_supersede_with_transaction(self, new_document: DocumentReferen
logger.error(
f"Unexpected error while finalizing document for {new_document.nhs_number}: {e}",
)
-
+
new_document.doc_status = "cancelled"
new_document.uploaded = False
new_document.uploading = False
new_document.file_size = None
self._update_dynamo_table(new_document)
self.delete_file_from_bucket(
- new_document.file_location, new_document.s3_version_id,
+ new_document.file_location,
+ new_document.s3_version_id,
)
def document_reference_key(self, document_id):
return {DocumentReferenceMetadataFields.ID.value: document_id}
def _perform_virus_scan(
- self, document_reference: DocumentReference, object_key: str,
+ self,
+ document_reference: DocumentReference,
+ object_key: str,
) -> VirusScanResult:
"""Perform a virus scan on the document"""
try:
return self.virus_scan_service.scan_file(
- object_key, nhs_number=document_reference.nhs_number,
+ object_key,
+ nhs_number=document_reference.nhs_number,
)
except Exception as e:
@@ -362,12 +378,14 @@ def _perform_virus_scan(
return VirusScanResult.ERROR
def _process_clean_document(
- self, document_reference: DocumentReference, object_key: str,
+ self,
+ document_reference: DocumentReference,
+ object_key: str,
):
"""Process a document that passed virus scanning"""
try:
self.copy_files_from_staging_bucket(document_reference, object_key)
-
+
logger.info(
f"Successfully processed clean document: {document_reference.id}",
)
@@ -380,7 +398,9 @@ def _process_clean_document(
raise FileProcessingException(f"Failed to process clean document: {str(e)}")
def copy_files_from_staging_bucket(
- self, document_reference: DocumentReference, source_file_key: str,
+ self,
+ document_reference: DocumentReference,
+ source_file_key: str,
):
"""Copy files from staging bucket to destination bucket"""
try:
@@ -402,7 +422,8 @@ def copy_files_from_staging_bucket(
)
document_reference.s3_bucket_name = self.destination_bucket_name
document_reference.file_location = document_reference._build_s3_location(
- self.destination_bucket_name, dest_file_key,
+ self.destination_bucket_name,
+ dest_file_key,
)
document_reference.s3_version_id = copy_result.get("VersionId")
return copy_result
@@ -474,6 +495,8 @@ def _update_dynamo_table(
)
def is_file_invalid(self, object_key: str, file_type_extension: str) -> bool:
- entire_object = self.s3_service.get_object_stream(self.staging_s3_bucket_name, object_key)
+ entire_object = self.s3_service.get_object_stream(
+ self.staging_s3_bucket_name, object_key
+ )
file_stream = io.BytesIO(entire_object.read())
return check_file_locked_or_corrupt(file_stream, file_type_extension)
diff --git a/lambdas/tests/unit/services/test_upload_document_reference_service.py b/lambdas/tests/unit/services/test_upload_document_reference_service.py
index 3d85e5df70..d8b6a5e865 100644
--- a/lambdas/tests/unit/services/test_upload_document_reference_service.py
+++ b/lambdas/tests/unit/services/test_upload_document_reference_service.py
@@ -61,7 +61,10 @@ def service(set_env, mock_virus_scan_service, mocker):
service.virus_scan_service = MockVirusScanService()
service.s3_service = Mock()
mocker.patch("io.BytesIO", return_value=None)
- mocker.patch("services.upload_document_reference_service.check_file_locked_or_corrupt", return_value=False)
+ mocker.patch(
+ "services.upload_document_reference_service.check_file_locked_or_corrupt",
+ return_value=False,
+ )
return service
@@ -80,7 +83,9 @@ def test_handle_upload_document_reference_request_with_none_object_key(service):
def test_handle_upload_document_reference_request_success(
- service, mock_document_reference, mocker,
+ service,
+ mock_document_reference,
+ mocker,
):
object_key = "staging/test-doc-id"
object_size = 1111
@@ -94,7 +99,10 @@ def test_handle_upload_document_reference_request_success(
"VersionId": "test-version-id",
}
mocker.patch("io.BytesIO", return_value=None)
- mocker.patch("services.upload_document_reference_service.check_file_locked_or_corrupt", return_value=False)
+ mocker.patch(
+ "services.upload_document_reference_service.check_file_locked_or_corrupt",
+ return_value=False,
+ )
# First call fetches preliminary doc, second call fetches existing final docs to supersede
service.document_service.fetch_documents_from_table.side_effect = [
[mock_document_reference],
@@ -154,7 +162,8 @@ def test_fetch_preliminary_document_reference_no_documents_found(service):
def test_fetch_preliminary_document_reference_multiple_documents_warning(
- service, mock_document_reference,
+ service,
+ mock_document_reference,
):
"""Test handling when multiple documents are found"""
document_key = "test-doc-id"
@@ -181,23 +190,30 @@ def test_fetch_preliminary_document_reference_exception(service):
def test__process_preliminary_document_reference_clean_virus_scan(
- service, mock_document_reference, mocker,
+ service,
+ mock_document_reference,
+ mocker,
):
"""Test processing document reference with a clean virus scan"""
object_key = "staging/test-doc-id"
mocker.patch.object(
- service, "_perform_virus_scan", return_value=VirusScanResult.CLEAN,
+ service,
+ "_perform_virus_scan",
+ return_value=VirusScanResult.CLEAN,
)
mock_delete = mocker.patch.object(service, "delete_file_from_staging_bucket")
mock_process_clean = mocker.patch.object(service, "_process_clean_document")
mock_finalize_transaction = mocker.patch.object(
- service, "_finalize_and_supersede_with_transaction",
+ service,
+ "_finalize_and_supersede_with_transaction",
)
service._process_preliminary_document_reference(
- mock_document_reference, object_key, 1222,
+ mock_document_reference,
+ object_key,
+ 1222,
)
mock_process_clean.assert_called_once()
@@ -209,26 +225,33 @@ def test__process_preliminary_document_reference_clean_virus_scan(
def test__process_preliminary_document_reference_infected_virus_scan(
- service, mock_document_reference, mocker,
+ service,
+ mock_document_reference,
+ mocker,
):
"""Test processing document reference with an infected virus scan"""
object_key = "staging/test-doc-id"
mocker.patch.object(
- service, "_perform_virus_scan", return_value=VirusScanResult.INFECTED,
+ service,
+ "_perform_virus_scan",
+ return_value=VirusScanResult.INFECTED,
)
mock_delete = mocker.patch.object(service, "delete_file_from_staging_bucket")
mock_process_clean = mocker.patch.object(service, "_process_clean_document")
mock_update_dynamo = mocker.patch.object(service, "_update_dynamo_table")
service._process_preliminary_document_reference(
- mock_document_reference, object_key, 1222,
+ mock_document_reference,
+ object_key,
+ 1222,
)
mock_process_clean.assert_not_called()
mock_update_dynamo.assert_called_once()
mock_delete.assert_called_once_with(object_key)
+
def test_perform_virus_scan_returns_clean_hardcoded(service, mock_document_reference):
"""Test virus scan returns hardcoded CLEAN result"""
object_key = "staging/test-doc-id"
@@ -237,7 +260,9 @@ def test_perform_virus_scan_returns_clean_hardcoded(service, mock_document_refer
def test_perform_virus_scan_exception_returns_infected(
- service, mock_document_reference, mocker,
+ service,
+ mock_document_reference,
+ mocker,
):
"""Test virus scan exception handling returns INFECTED for safety"""
mock_virus_service = mocker.patch.object(service, "virus_scan_service")
@@ -264,7 +289,9 @@ def test_process_clean_document_success(service, mock_document_reference, mocker
def test_process_clean_document_exception_restores_original_values(
- service, mock_document_reference, mocker,
+ service,
+ mock_document_reference,
+ mocker,
):
"""Test that original values are restored when processing fails"""
object_key = "staging/test-doc-id"
@@ -273,7 +300,9 @@ def test_process_clean_document_exception_restores_original_values(
original_location = "original-location"
mocker.patch.object(
- service, "copy_files_from_staging_bucket", side_effect=Exception("Copy failed"),
+ service,
+ "copy_files_from_staging_bucket",
+ side_effect=Exception("Copy failed"),
)
with pytest.raises(FileProcessingException):
service._process_clean_document(
@@ -327,7 +356,8 @@ def test_delete_file_from_staging_bucket_success(service):
service.delete_file_from_staging_bucket(source_file_key)
service.s3_service.delete_object.assert_called_once_with(
- MOCK_STAGING_STORE_BUCKET, source_file_key,
+ MOCK_STAGING_STORE_BUCKET,
+ source_file_key,
)
@@ -340,7 +370,8 @@ def test_delete_pdm_file_from_staging_bucket_success(service):
service.delete_file_from_staging_bucket(source_file_key)
service.s3_service.delete_object.assert_called_once_with(
- MOCK_STAGING_STORE_BUCKET, source_file_key,
+ MOCK_STAGING_STORE_BUCKET,
+ source_file_key,
)
@@ -393,7 +424,10 @@ def test_update_dynamo_table_client_error(service, mock_document_reference):
"""Test handling of ClientError during DynamoDB update"""
client_error = ClientError(
error_response={
- "Error": {"Code": "ResourceNotFoundException", "Message": "Table not found"},
+ "Error": {
+ "Code": "ResourceNotFoundException",
+ "Message": "Table not found",
+ },
},
operation_name="UpdateItem",
)
@@ -450,7 +484,8 @@ def test_document_key_extraction_from_object_key_for_lg(
def test_finalize_and_supersede_with_transaction_with_existing_finals(
- service, mock_document_reference,
+ service,
+ mock_document_reference,
):
"""Test transaction-based finalisation with existing final documents to supersede"""
new_doc = mock_document_reference
@@ -515,7 +550,8 @@ def test_finalize_and_supersede_with_transaction_with_existing_finals(
def test_finalize_and_supersede_with_transaction_no_existing_docs(
- service, mock_document_reference,
+ service,
+ mock_document_reference,
):
"""Test transaction-based finalization when no existing final documents found"""
new_doc = mock_document_reference
@@ -539,7 +575,8 @@ def test_finalize_and_supersede_with_transaction_no_existing_docs(
def test_finalize_and_supersede_with_transaction_multiple_existing(
- service, mock_document_reference,
+ service,
+ mock_document_reference,
):
"""Test transaction-based finalization superseding multiple existing final documents"""
new_doc = mock_document_reference
@@ -575,7 +612,8 @@ def test_finalize_and_supersede_with_transaction_multiple_existing(
def test_finalize_and_supersede_with_transaction_skips_same_id(
- service, mock_document_reference,
+ service,
+ mock_document_reference,
):
"""Test that transaction skips documents with the same ID"""
new_doc = mock_document_reference
@@ -603,7 +641,8 @@ def test_finalize_and_supersede_with_transaction_skips_same_id(
def test_finalize_and_supersede_with_transaction_handles_transaction_cancelled(
- service, mock_document_reference,
+ service,
+ mock_document_reference,
):
new_doc = mock_document_reference
@@ -643,25 +682,34 @@ def test_handle_upload_document_reference_request_no_document_found(service):
def test_process_preliminary_document_reference_exception_during_processing(
- service, mock_document_reference, mocker,
+ service,
+ mock_document_reference,
+ mocker,
):
"""Test that exceptions during processing are properly raised"""
object_key = "staging/test-doc-id"
mocker.patch.object(
- service, "_perform_virus_scan", return_value=VirusScanResult.CLEAN,
+ service,
+ "_perform_virus_scan",
+ return_value=VirusScanResult.CLEAN,
)
mocker.patch.object(
- service, "_process_clean_document", side_effect=Exception("Processing failed"),
+ service,
+ "_process_clean_document",
+ side_effect=Exception("Processing failed"),
)
with pytest.raises(Exception) as exc_info:
service._process_preliminary_document_reference(
- mock_document_reference, object_key, 1222,
+ mock_document_reference,
+ object_key,
+ 1222,
)
assert "Processing failed" in str(exc_info.value)
+
def test_get_infrastructure_for_document_key_non_pdm(service):
assert service.table_name == ""
infra = service._get_infrastructure_for_document_key(object_parts=["1234", "123"])
@@ -695,6 +743,7 @@ def test_get_infra_invalid_doc_type(monkeypatch, service):
with pytest.raises(InvalidDocTypeException):
service._get_infrastructure_for_document_key(["fhir_upload", "999999"])
+
def test_is_file_invalid_calls_correct_functions(service, mocker):
"""Test that is_file_invalid calls the right functions in the correct order"""
object_key = "test-folder/test-file.docx"
@@ -716,10 +765,9 @@ def test_is_file_invalid_calls_correct_functions(service, mocker):
assert result is True
service.s3_service.get_object_stream.assert_called_once_with(
- service.staging_s3_bucket_name, object_key,
+ service.staging_s3_bucket_name,
+ object_key,
)
mock_stream.read.assert_called_once_with()
mock_bytesio.assert_called_once_with(file_content)
mock_check.assert_called_once_with(mock_file_stream, file_extension)
-
-
diff --git a/lambdas/tests/unit/utils/test_file_utils.py b/lambdas/tests/unit/utils/test_file_utils.py
index 6bda04566c..a6dd202289 100644
--- a/lambdas/tests/unit/utils/test_file_utils.py
+++ b/lambdas/tests/unit/utils/test_file_utils.py
@@ -16,7 +16,9 @@ def test_convert_csv_dictionary_to_bytes():
]
result_bytes = convert_csv_dictionary_to_bytes(
- headers=headers, csv_dict_data=metadata_csv_data, encoding="utf-8",
+ headers=headers,
+ csv_dict_data=metadata_csv_data,
+ encoding="utf-8",
)
result_str = result_bytes.decode("utf-8")
@@ -24,6 +26,7 @@ def test_convert_csv_dictionary_to_bytes():
assert result_str == expected_output
+
@pytest.mark.parametrize(
"file_extension,file_content,expected_result",
[
@@ -37,6 +40,7 @@ def test_skipped_file_types(file_extension, file_content, expected_result):
result = check_file_locked_or_corrupt(file_stream, file_extension)
assert result == expected_result
+
@pytest.mark.parametrize(
"file_extension,is_encrypted,expected_result",
[
@@ -67,10 +71,15 @@ def test_office_files(mock_office_file, file_extension, is_encrypted, expected_r
mock_office_file.assert_called_once_with(file_stream)
mock_instance.is_encrypted.assert_called_once()
+
@pytest.mark.parametrize(
"file_extension,file_content,expected_result",
[
- ("rtf", b"{\\rtf1\\ansi\\deff0 {\\fonttbl {\\f0 Times New Roman;}}\\f0\\fs60 Hello!}", False),
+ (
+ "rtf",
+ b"{\\rtf1\\ansi\\deff0 {\\fonttbl {\\f0 Times New Roman;}}\\f0\\fs60 Hello!}",
+ False,
+ ),
("rtf", b"This is not an RTF file", True),
("csv", b"name,age,city\nAlice,30,NYC\nBob,25,LA", False),
("csv", b"\xff\xfe Invalid UTF-8", True),
@@ -85,9 +94,16 @@ def test_text_based_files(file_extension, file_content, expected_result):
result = check_file_locked_or_corrupt(file_stream, file_extension)
assert result == expected_result
+
@pytest.mark.parametrize(
"file_extension",
- ["jpg", "jpeg", "png", "tiff", "tif",],
+ [
+ "jpg",
+ "jpeg",
+ "png",
+ "tiff",
+ "tif",
+ ],
ids=["jpg", "jpeg", "png", "tiff", "tif"],
)
@patch("utils.file_utils.Image.open")
@@ -102,6 +118,7 @@ def test_image_files_valid(mock_image_open, file_extension):
mock_image_open.assert_called_once_with(file_stream)
mock_img.verify.assert_called_once()
+
@pytest.mark.parametrize(
"file_extension",
["jpg", "png", "tiff"],
@@ -115,6 +132,7 @@ def test_image_files_corrupt(mock_image_open, file_extension):
assert result is True
+
@pytest.mark.parametrize(
"file_extension",
["unknown", "mp4", "mp3", "avi", "mov"],
@@ -125,6 +143,7 @@ def test_unsupported_file_extensions(file_extension):
result = check_file_locked_or_corrupt(file_stream, file_extension)
assert result is False
+
@pytest.mark.parametrize(
"file_extension",
["docx", "xlsx", "pptx", "doc", "xls"],
@@ -138,6 +157,3 @@ def test_office_file_exception_returns_true(mock_office_file, file_extension):
result = check_file_locked_or_corrupt(file_stream, file_extension)
assert result is True
-
-
-
diff --git a/lambdas/utils/file_utils.py b/lambdas/utils/file_utils.py
index 4623c3bfeb..48578b729d 100644
--- a/lambdas/utils/file_utils.py
+++ b/lambdas/utils/file_utils.py
@@ -7,8 +7,11 @@
logger = LoggingService(__name__)
+
def convert_csv_dictionary_to_bytes(
- headers: list[str], csv_dict_data: list[dict], encoding: str = "utf-8",
+ headers: list[str],
+ csv_dict_data: list[dict],
+ encoding: str = "utf-8",
) -> bytes:
csv_buffer = BytesIO()
csv_text_wrapper = TextIOWrapper(csv_buffer, encoding=encoding, newline="")
@@ -29,23 +32,23 @@ def convert_csv_dictionary_to_bytes(
def check_file_locked_or_corrupt(file_stream, ext):
file_stream.seek(0)
- text_exts = ['rtf', 'csv', 'json', 'txt', 'xml']
- media_exts = ['jpg', 'jpeg', 'png', 'tiff', 'tif']
+ text_exts = ["rtf", "csv", "json", "txt", "xml"]
+ media_exts = ["jpg", "jpeg", "png", "tiff", "tif"]
try:
- if ext == 'pdf' or ext == 'zip':
+ if ext == "pdf" or ext == "zip":
# Skipping PDF check, as this is covered by the antivirus scan
logger.info(f"Skipping check for {ext} files")
return False
- elif ext in ['docx', 'xlsx', 'pptx', 'doc', 'xls', 'ppt']:
+ elif ext in ["docx", "xlsx", "pptx", "doc", "xls", "ppt"]:
office_file = msoffcrypto.OfficeFile(file_stream)
encrypt = office_file.is_encrypted()
return encrypt
elif ext in text_exts:
sample = file_stream.read(1024)
- sample.decode('utf-8')
- if ext == 'rtf' and not sample.startswith(b'{\\rtf1'):
+ sample.decode("utf-8")
+ if ext == "rtf" and not sample.startswith(b"{\\rtf1"):
return True
return False
@@ -55,9 +58,13 @@ def check_file_locked_or_corrupt(file_stream, ext):
return False
else:
- logger.info(f"File with extension {ext} is not supported for locked/corrupt check, treating as valid.")
+ logger.info(
+ f"File with extension {ext} is not supported for locked/corrupt check, treating as valid."
+ )
return False
except Exception as e:
- logger.error(f"Error checking file validity for .{ext}: {type(e).__name__} - {str(e)}")
- return True
\ No newline at end of file
+ logger.error(
+ f"Error checking file validity for .{ext}: {type(e).__name__} - {str(e)}"
+ )
+ return True
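With the formatting settled, the behaviour of check_file_locked_or_corrupt is easiest to see by example. The sketch below exercises the branches exactly as defined in the hunk above, using in-memory streams only (no S3 involved):

    import io

    from utils.file_utils import check_file_locked_or_corrupt

    # A well-formed RTF header passes; text mislabelled as RTF is flagged.
    assert check_file_locked_or_corrupt(io.BytesIO(b"{\\rtf1 hello}"), "rtf") is False
    assert check_file_locked_or_corrupt(io.BytesIO(b"plain text"), "rtf") is True

    # PDFs/ZIPs are skipped (covered by the antivirus scan), and unknown
    # extensions are treated as valid by design.
    assert check_file_locked_or_corrupt(io.BytesIO(b"%PDF-1.7"), "pdf") is False
    assert check_file_locked_or_corrupt(io.BytesIO(b"anything"), "mp4") is False
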
From e38cd2c3fb554b5d7f5252715ed7a2e4417ad7ce Mon Sep 17 00:00:00 2001
From: NogaNHS <127490765+NogaNHS@users.noreply.github.com>
Date: Wed, 11 Feb 2026 13:35:25 +0000
Subject: [PATCH 08/13] add test for document upload status service
---
.../upload_document_reference_service.py | 6 +-
.../test_get_document_upload_status.py | 82 ++++++++++++++-----
lambdas/utils/file_utils.py | 4 +-
3 files changed, 69 insertions(+), 23 deletions(-)
diff --git a/lambdas/services/upload_document_reference_service.py b/lambdas/services/upload_document_reference_service.py
index 56804f28b5..5917047d9a 100644
--- a/lambdas/services/upload_document_reference_service.py
+++ b/lambdas/services/upload_document_reference_service.py
@@ -166,7 +166,8 @@ def _process_preliminary_document_reference(
preliminary_document_reference.file_name.split(".")[-1].lower()
)
is_file_protected = self.is_file_invalid(
- object_key, file_type_extension
+ object_key,
+ file_type_extension,
)
if is_file_protected:
logger.warning(
@@ -496,7 +497,8 @@ def _update_dynamo_table(
def is_file_invalid(self, object_key: str, file_type_extension: str) -> bool:
entire_object = self.s3_service.get_object_stream(
- self.staging_s3_bucket_name, object_key
+ self.staging_s3_bucket_name,
+ object_key,
)
file_stream = io.BytesIO(entire_object.read())
return check_file_locked_or_corrupt(file_stream, file_type_extension)
diff --git a/lambdas/tests/unit/services/test_get_document_upload_status.py b/lambdas/tests/unit/services/test_get_document_upload_status.py
index b646a327f0..3000e11a6c 100644
--- a/lambdas/tests/unit/services/test_get_document_upload_status.py
+++ b/lambdas/tests/unit/services/test_get_document_upload_status.py
@@ -15,9 +15,12 @@ def mock_document_service():
@pytest.fixture
-def get_document_upload_status_service(mock_document_service):
+def get_document_upload_status_service(mock_document_service, mocker):
+ mocker.patch(
+ "services.get_document_upload_status.DocumentService",
+ return_value=mock_document_service,
+ )
service = GetDocumentUploadStatusService()
- service.document_service = mock_document_service
return service
@@ -80,11 +83,13 @@ def test_get_document_references_by_id_found_documents(
)
result = get_document_upload_status_service.get_document_references_by_id(
- nhs_number, document_ids
+ nhs_number,
+ document_ids,
)
mock_document_service.get_batch_document_references_by_id.assert_called_once_with(
- document_ids, SupportedDocumentTypes.LG
+ document_ids,
+ SupportedDocumentTypes.LG,
)
assert len(result) == 2
assert result["doc-id-1"]["status"] == "final"
@@ -101,11 +106,12 @@ def test_get_document_references_by_id_not_found_documents(
nhs_number = "1234567890"
document_ids = ["doc-id-1", "non-existent-id"]
mock_document_service.get_batch_document_references_by_id.return_value = [
- sample_document_references[0]
+ sample_document_references[0],
]
result = get_document_upload_status_service.get_document_references_by_id(
- nhs_number, document_ids
+ nhs_number,
+ document_ids,
)
assert len(result) == 2
@@ -123,11 +129,12 @@ def test_get_document_references_by_id_access_denied(
nhs_number = "1234567890"
document_ids = ["doc-id-3"]
mock_document_service.get_batch_document_references_by_id.return_value = [
- sample_document_references[2]
+ sample_document_references[2],
]
result = get_document_upload_status_service.get_document_references_by_id(
- nhs_number, document_ids
+ nhs_number,
+ document_ids,
)
assert len(result) == 1
@@ -143,11 +150,12 @@ def test_get_document_references_by_id_infected_document(
nhs_number = "1234567890"
document_ids = ["doc-id-4"]
mock_document_service.get_batch_document_references_by_id.return_value = [
- sample_document_references[3]
+ sample_document_references[3],
]
result = get_document_upload_status_service.get_document_references_by_id(
- nhs_number, document_ids
+ nhs_number,
+ document_ids,
)
assert len(result) == 1
@@ -155,8 +163,38 @@ def test_get_document_references_by_id_infected_document(
assert result["doc-id-4"]["error_code"] == DocumentStatus.INFECTED.code
+def test_get_document_references_by_id_invalid_document(
+ get_document_upload_status_service,
+ mock_document_service,
+):
+ nhs_number = "1234567890"
+ document_ids = ["doc-id-invalid"]
+
+ cancelled_doc = DocumentReference(
+ id="doc-id-invalid",
+ nhs_number="1234567890",
+ file_name="invalid_file.pdf",
+ doc_status=DocumentStatus.CANCELLED.display,
+ virus_scanner_result=VirusScanResult.INVALID,
+ )
+
+ mock_document_service.get_batch_document_references_by_id.return_value = [
+ cancelled_doc,
+ ]
+
+ result = get_document_upload_status_service.get_document_references_by_id(
+ nhs_number,
+ document_ids,
+ )
+
+ assert len(result) == 1
+ assert result["doc-id-invalid"]["status"] == DocumentStatus.INVALID.display
+ assert result["doc-id-invalid"]["error_code"] == DocumentStatus.INVALID.code
+
+
def test_get_document_references_by_id_cancelled_document(
- get_document_upload_status_service, mock_document_service
+ get_document_upload_status_service,
+ mock_document_service,
):
nhs_number = "1234567890"
document_ids = ["doc-id-cancelled"]
@@ -171,11 +209,12 @@ def test_get_document_references_by_id_cancelled_document(
)
mock_document_service.get_batch_document_references_by_id.return_value = [
- cancelled_doc
+ cancelled_doc,
]
result = get_document_upload_status_service.get_document_references_by_id(
- nhs_number, document_ids
+ nhs_number,
+ document_ids,
)
assert len(result) == 1
@@ -191,11 +230,12 @@ def test_get_document_references_by_id_deleted_document(
nhs_number = "1234567890"
document_ids = ["doc-id-5"]
mock_document_service.get_batch_document_references_by_id.return_value = [
- sample_document_references[4]
+ sample_document_references[4],
]
result = get_document_upload_status_service.get_document_references_by_id(
- nhs_number, document_ids
+ nhs_number,
+ document_ids,
)
assert len(result) == 0
@@ -216,7 +256,8 @@ def test_get_document_references_by_id_multiple_mixed_statuses(
]
result = get_document_upload_status_service.get_document_references_by_id(
- nhs_number, document_ids
+ nhs_number,
+ document_ids,
)
assert len(result) == 4
@@ -236,18 +277,21 @@ def test_get_document_references_by_id_multiple_mixed_statuses(
def test_get_document_references_by_id_no_results(
- get_document_upload_status_service, mock_document_service
+ get_document_upload_status_service,
+ mock_document_service,
):
nhs_number = "1234567890"
document_ids = ["doc-id-6"]
mock_document_service.get_batch_document_references_by_id.return_value = []
result = get_document_upload_status_service.get_document_references_by_id(
- nhs_number, document_ids
+ nhs_number,
+ document_ids,
)
mock_document_service.get_batch_document_references_by_id.assert_called_once_with(
- document_ids, SupportedDocumentTypes.LG
+ document_ids,
+ SupportedDocumentTypes.LG,
)
assert result["doc-id-6"]["status"] == DocumentStatus.NOT_FOUND.display
assert result["doc-id-6"]["error_code"] == DocumentStatus.NOT_FOUND.code
diff --git a/lambdas/utils/file_utils.py b/lambdas/utils/file_utils.py
index 48578b729d..3ffa85b15a 100644
--- a/lambdas/utils/file_utils.py
+++ b/lambdas/utils/file_utils.py
@@ -59,12 +59,12 @@ def check_file_locked_or_corrupt(file_stream, ext):
else:
logger.info(
- f"File with extension {ext} is not supported for locked/corrupt check, treating as valid."
+ f"File with extension {ext} is not supported for locked/corrupt check, treating as valid.",
)
return False
except Exception as e:
logger.error(
- f"Error checking file validity for .{ext}: {type(e).__name__} - {str(e)}"
+ f"Error checking file validity for .{ext}: {type(e).__name__} - {str(e)}",
)
return True
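The fixture change here is the notable part: patching services.get_document_upload_status.DocumentService replaces the class at the name the module under test actually resolves, so the mock is wired in while GetDocumentUploadStatusService() constructs itself, rather than being assigned onto the instance afterwards. A minimal pytest-mock sketch of the pattern, with myapp.service.Dependency as a placeholder module path:

    # Illustrative only; myapp.service and its names are placeholders.
    def test_patch_at_lookup_site(mocker):
        mock_dep = mocker.Mock()
        # Patch where the class is looked up, not where it is defined,
        # so the constructor under test receives the mock.
        mocker.patch("myapp.service.Dependency", return_value=mock_dep)

        from myapp.service import Service

        service = Service()  # __init__ calls Dependency() -> mock_dep
        assert service.dependency is mock_dep
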
From 1d288398da911663bd9727db49e51e9b0cece4a4 Mon Sep 17 00:00:00 2001
From: adamwhitingnhs
Date: Thu, 12 Feb 2026 16:25:33 +0000
Subject: [PATCH 09/13] address PR review comments
---
lambdas/services/base/s3_service.py | 2 +-
lambdas/services/get_document_upload_status.py | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/lambdas/services/base/s3_service.py b/lambdas/services/base/s3_service.py
index 1da0a82f68..a97e403d7c 100644
--- a/lambdas/services/base/s3_service.py
+++ b/lambdas/services/base/s3_service.py
@@ -235,7 +235,7 @@ def get_file_size(self, s3_bucket_name: str, object_key: str) -> int:
def get_head_object(self, bucket: str, key: str):
return self.client.head_object(Bucket=bucket, Key=key)
- def get_object_stream(self, bucket: str, key: str, byte_range: str = None):
+ def get_object_stream(self, bucket: str, key: str, byte_range: str | None = None):
params = {"Bucket": bucket, "Key": key}
if byte_range:
params["Range"] = byte_range
diff --git a/lambdas/services/get_document_upload_status.py b/lambdas/services/get_document_upload_status.py
index 7c06d6fac9..734671a3ec 100644
--- a/lambdas/services/get_document_upload_status.py
+++ b/lambdas/services/get_document_upload_status.py
@@ -24,7 +24,7 @@ def _determine_document_status(self, doc_ref, nhs_number):
if doc_ref.doc_status == "cancelled":
if doc_ref.virus_scanner_result == VirusScanResult.INFECTED:
return DocumentStatus.INFECTED.display, DocumentStatus.INFECTED.code
- elif doc_ref.virus_scanner_result == VirusScanResult.INVALID:
+ if doc_ref.virus_scanner_result == VirusScanResult.INVALID:
return DocumentStatus.INVALID.display, DocumentStatus.INVALID.code
return DocumentStatus.CANCELLED.display, DocumentStatus.CANCELLED.code
From ab75e936c42fdcf9cc536975c18efa9be06fc0a2 Mon Sep 17 00:00:00 2001
From: adamwhitingnhs
Date: Thu, 12 Feb 2026 16:47:50 +0000
Subject: [PATCH 10/13] move file extensions to constants
---
lambdas/utils/constants/file_extensions.py | 2 ++
lambdas/utils/file_utils.py | 7 +++----
2 files changed, 5 insertions(+), 4 deletions(-)
create mode 100644 lambdas/utils/constants/file_extensions.py
diff --git a/lambdas/utils/constants/file_extensions.py b/lambdas/utils/constants/file_extensions.py
new file mode 100644
index 0000000000..bab8f38492
--- /dev/null
+++ b/lambdas/utils/constants/file_extensions.py
@@ -0,0 +1,2 @@
+TEXT_FILE_EXTENSIONS = ["rtf", "csv", "json", "txt", "xml"]
+MEDIA_FILE_EXTENSIONS = ["jpg", "jpeg", "png", "tiff", "tif"]
diff --git a/lambdas/utils/file_utils.py b/lambdas/utils/file_utils.py
index 3ffa85b15a..be1fd3f893 100644
--- a/lambdas/utils/file_utils.py
+++ b/lambdas/utils/file_utils.py
@@ -4,6 +4,7 @@
import msoffcrypto
from PIL import Image
from utils.audit_logging_setup import LoggingService
+from utils.constants.file_extensions import MEDIA_FILE_EXTENSIONS, TEXT_FILE_EXTENSIONS
logger = LoggingService(__name__)
@@ -32,8 +33,6 @@ def convert_csv_dictionary_to_bytes(
def check_file_locked_or_corrupt(file_stream, ext):
file_stream.seek(0)
- text_exts = ["rtf", "csv", "json", "txt", "xml"]
- media_exts = ["jpg", "jpeg", "png", "tiff", "tif"]
try:
if ext == "pdf" or ext == "zip":
# Skipping PDF check, as this is covered by the antivirus scan
@@ -45,14 +44,14 @@ def check_file_locked_or_corrupt(file_stream, ext):
encrypt = office_file.is_encrypted()
return encrypt
- elif ext in text_exts:
+ elif ext in TEXT_FILE_EXTENSIONS:
sample = file_stream.read(1024)
sample.decode("utf-8")
if ext == "rtf" and not sample.startswith(b"{\\rtf1"):
return True
return False
- elif ext in media_exts:
+ elif ext in MEDIA_FILE_EXTENSIONS:
with Image.open(file_stream) as img:
img.verify()
return False
From 84c4ed8edf2f0591bb56b1a0d2e1c8f8ca8cb4dd Mon Sep 17 00:00:00 2001
From: NogaNHS <127490765+NogaNHS@users.noreply.github.com>
Date: Thu, 12 Feb 2026 16:56:34 +0000
Subject: [PATCH 11/13] [PRMP-1384] Add Microsoft Office file extensions
to constants
---
lambdas/utils/constants/file_extensions.py | 1 +
lambdas/utils/file_utils.py | 5 +++--
2 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/lambdas/utils/constants/file_extensions.py b/lambdas/utils/constants/file_extensions.py
index bab8f38492..4e04dfd004 100644
--- a/lambdas/utils/constants/file_extensions.py
+++ b/lambdas/utils/constants/file_extensions.py
@@ -1,2 +1,3 @@
TEXT_FILE_EXTENSIONS = ["rtf", "csv", "json", "txt", "xml"]
MEDIA_FILE_EXTENSIONS = ["jpg", "jpeg", "png", "tiff", "tif"]
+MICROSOFT_OFFICE_FILE_EXTENSIONS = ["docx", "xlsx", "pptx", "doc", "xls", "ppt"]
diff --git a/lambdas/utils/file_utils.py b/lambdas/utils/file_utils.py
index be1fd3f893..26445077b9 100644
--- a/lambdas/utils/file_utils.py
+++ b/lambdas/utils/file_utils.py
@@ -4,7 +4,8 @@
import msoffcrypto
from PIL import Image
from utils.audit_logging_setup import LoggingService
-from utils.constants.file_extensions import MEDIA_FILE_EXTENSIONS, TEXT_FILE_EXTENSIONS
+from utils.constants.file_extensions import MEDIA_FILE_EXTENSIONS, TEXT_FILE_EXTENSIONS, \
+ MICROSOFT_OFFICE_FILE_EXTENSIONS
logger = LoggingService(__name__)
@@ -39,7 +40,7 @@ def check_file_locked_or_corrupt(file_stream, ext):
logger.info(f"Skipping check for {ext} files")
return False
- elif ext in ["docx", "xlsx", "pptx", "doc", "xls", "ppt"]:
+ elif ext in MICROSOFT_OFFICE_FILE_EXTENSIONS:
office_file = msoffcrypto.OfficeFile(file_stream)
encrypt = office_file.is_encrypted()
return encrypt
From 1ab851a73763ba62e61160960e2e68aceefaa467 Mon Sep 17 00:00:00 2001
From: adamwhitingnhs
Date: Thu, 12 Feb 2026 17:07:40 +0000
Subject: [PATCH 12/13] fix linting
---
lambdas/ruff.toml | 2 +-
lambdas/services/base/s3_service.py | 3 +--
lambdas/utils/file_utils.py | 15 +++++++--------
3 files changed, 9 insertions(+), 11 deletions(-)
diff --git a/lambdas/ruff.toml b/lambdas/ruff.toml
index 10a91ceaf7..9b92d3839a 100644
--- a/lambdas/ruff.toml
+++ b/lambdas/ruff.toml
@@ -33,7 +33,7 @@ line-length = 130
# Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or
# McCabe complexity (`C901`) by default.
# COM812: Enforce trailing commas on multi-line constructs.
-select = ["E", "F", "COM812"]
+select = ["E", "F", "COM812", "RET505"]
ignore = []
# Allow autofix for all enabled rules (when `--fix`) is provided.
diff --git a/lambdas/services/base/s3_service.py b/lambdas/services/base/s3_service.py
index a97e403d7c..ef04724e61 100644
--- a/lambdas/services/base/s3_service.py
+++ b/lambdas/services/base/s3_service.py
@@ -151,8 +151,7 @@ def copy_across_bucket(
if_none_match,
False,
)
- else:
- raise e
+ raise e
else:
logger.error(f"Copy failed: {e}")
raise e
diff --git a/lambdas/utils/file_utils.py b/lambdas/utils/file_utils.py
index 26445077b9..a0f6bf2375 100644
--- a/lambdas/utils/file_utils.py
+++ b/lambdas/utils/file_utils.py
@@ -40,28 +40,27 @@ def check_file_locked_or_corrupt(file_stream, ext):
logger.info(f"Skipping check for {ext} files")
return False
- elif ext in MICROSOFT_OFFICE_FILE_EXTENSIONS:
+ if ext in MICROSOFT_OFFICE_FILE_EXTENSIONS:
office_file = msoffcrypto.OfficeFile(file_stream)
encrypt = office_file.is_encrypted()
return encrypt
- elif ext in TEXT_FILE_EXTENSIONS:
+ if ext in TEXT_FILE_EXTENSIONS:
sample = file_stream.read(1024)
sample.decode("utf-8")
if ext == "rtf" and not sample.startswith(b"{\\rtf1"):
return True
return False
- elif ext in MEDIA_FILE_EXTENSIONS:
+ if ext in MEDIA_FILE_EXTENSIONS:
with Image.open(file_stream) as img:
img.verify()
return False
- else:
- logger.info(
- f"File with extension {ext} is not supported for locked/corrupt check, treating as valid.",
- )
- return False
+ logger.info(
+ f"File with extension {ext} is not supported for locked/corrupt check, treating as valid.",
+ )
+ return False
except Exception as e:
logger.error(
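RET505 (flake8-return, via ruff) flags elif/else branches that follow a return, which is exactly what the two hunks above remove. The transform is behaviour-preserving; a minimal before/after sketch:

    # Before (flagged by RET505): superfluous elif/else after return.
    def before(ext: str) -> str:
        if ext == "pdf":
            return "skip"
        elif ext == "docx":
            return "office-check"
        else:
            return "unsupported"

    # After: flat guard clauses, identical behaviour, one less indent level.
    def after(ext: str) -> str:
        if ext == "pdf":
            return "skip"
        if ext == "docx":
            return "office-check"
        return "unsupported"
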
From c332c3d7c755baf73bf7c128dad497a86fb70a19 Mon Sep 17 00:00:00 2001
From: adamwhitingnhs
Date: Thu, 12 Feb 2026 17:13:15 +0000
Subject: [PATCH 13/13] fix import order
---
lambdas/utils/file_utils.py | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/lambdas/utils/file_utils.py b/lambdas/utils/file_utils.py
index a0f6bf2375..90e5d9ed0b 100644
--- a/lambdas/utils/file_utils.py
+++ b/lambdas/utils/file_utils.py
@@ -4,8 +4,11 @@
import msoffcrypto
from PIL import Image
from utils.audit_logging_setup import LoggingService
-from utils.constants.file_extensions import MEDIA_FILE_EXTENSIONS, TEXT_FILE_EXTENSIONS, \
- MICROSOFT_OFFICE_FILE_EXTENSIONS
+from utils.constants.file_extensions import (
+ MEDIA_FILE_EXTENSIONS,
+ MICROSOFT_OFFICE_FILE_EXTENSIONS,
+ TEXT_FILE_EXTENSIONS,
+)
logger = LoggingService(__name__)