diff --git a/.github/workflows/automated-pr-validator.yml b/.github/workflows/automated-pr-validator.yml index a290624edd..ba656d5869 100644 --- a/.github/workflows/automated-pr-validator.yml +++ b/.github/workflows/automated-pr-validator.yml @@ -218,7 +218,7 @@ jobs: id: changed-files run: | git remote set-branches origin main && git fetch --depth 1 origin main && git branch main origin/main - echo "CHANGED_FILES=$(git diff main --name-only | grep '.py$' | tr '\n' ' ')" >> $GITHUB_OUTPUT + echo "CHANGED_FILES=$(git diff main --name-status | grep -E '^[^D].*\.py$' | cut -f2 | tr '\n' ' ')" >> $GITHUB_OUTPUT - name: Run black id: black diff --git a/.github/workflows/base-lambda-layer-reusable-publish-all.yml b/.github/workflows/base-lambda-layer-reusable-publish-all.yml index be97edb37f..a7e49b91b9 100644 --- a/.github/workflows/base-lambda-layer-reusable-publish-all.yml +++ b/.github/workflows/base-lambda-layer-reusable-publish-all.yml @@ -87,3 +87,15 @@ jobs: lambda_layer_name: alerting_lambda_layer secrets: AWS_ASSUME_ROLE: ${{ secrets.AWS_ASSUME_ROLE }} + + deploy_files_lambda_layer: + name: Deploy files_lambda_layer + uses: ./.github/workflows/base-lambda-layer-reusable-publish.yml + with: + environment: ${{ inputs.environment}} + python_version: ${{ inputs.python_version }} + build_branch: ${{ inputs.build_branch }} + sandbox: ${{ inputs.sandbox }} + lambda_layer_name: files_lambda_layer + secrets: + AWS_ASSUME_ROLE: ${{ secrets.AWS_ASSUME_ROLE }} \ No newline at end of file diff --git a/.github/workflows/base-lambdas-reusable-deploy-all.yml b/.github/workflows/base-lambdas-reusable-deploy-all.yml index 495c36fb46..32d347ae45 100644 --- a/.github/workflows/base-lambdas-reusable-deploy-all.yml +++ b/.github/workflows/base-lambdas-reusable-deploy-all.yml @@ -695,7 +695,7 @@ jobs: sandbox: ${{ inputs.sandbox }} lambda_handler_name: document_reference_virus_scan_handler lambda_aws_name: DocumentReferenceVirusScanCheck - lambda_layer_names: "core_lambda_layer" + lambda_layer_names: "core_lambda_layer,files_lambda_layer" secrets: AWS_ASSUME_ROLE: ${{ secrets.AWS_ASSUME_ROLE }} diff --git a/.lintstagedrc b/.lintstagedrc index 5ac098b07a..0cfe966e6a 100644 --- a/.lintstagedrc +++ b/.lintstagedrc @@ -7,8 +7,8 @@ "./app/node_modules/prettier/bin/prettier.cjs --write" ], "*.py": [ + "./lambdas/venv/bin/ruff check --fix", "./lambdas/venv/bin/python3 -m black", - "./lambdas/venv/bin/ruff check ./lambdas", "./lambdas/venv/bin/python3 -m isort --profile black", ] } \ No newline at end of file diff --git a/Makefile b/Makefile index f9739a9c22..7dd4df5c7c 100644 --- a/Makefile +++ b/Makefile @@ -7,6 +7,7 @@ GITHUB_REQUIREMENTS=$(REQUIREMENTS_PATH)/requirements_github_runner.txt TEST_REQUIREMENTS=$(REQUIREMENTS_PATH)/requirements_test.txt CORE_REQUIREMENTS=$(LAMBDA_LAYER_REQUIREMENTS_PATH)/requirements_core_lambda_layer.txt DATA_REQUIREMENTS=$(LAMBDA_LAYER_REQUIREMENTS_PATH)/requirements_data_lambda_layer.txt +FILES_REQUIREMENTS=$(LAMBDA_LAYER_REQUIREMENTS_PATH)/requirements_files_lambda_layer.txt REPORTS_REQUIREMENTS=$(LAMBDA_LAYER_REQUIREMENTS_PATH)/requirements_reports_lambda_layer.txt ALERTING_REQUIREMENTS=$(LAMBDA_LAYER_REQUIREMENTS_PATH)/requirements_alerting_lambda_layer.txt EDGE_REQUIREMENTS=$(REQUIREMENTS_PATH)/requirements_edge_lambda.txt @@ -86,18 +87,19 @@ format: @if [ $(FORMAT_ALL) = true ]; then \ CHANGED_FILES=''; \ else \ - CHANGED_FILES=$$(git diff main --name-only | grep '.py$$' | xargs); \ + CHANGED_FILES=$$(git diff main --name-status | grep -E '^[^D].*\.py$$' | cut -f2 | xargs); \ echo $$CHANGED_FILES; \ if [ -z "$$CHANGED_FILES" ]; then echo "No changed files to format"; exit 0; fi; \ fi; \ - $(VENV_PATH_PREFIX)/bin/python3 -m black $$CHANGED_FILES; \ $(VENV_PATH_PREFIX)/bin/ruff check $$CHANGED_FILES --fix; \ + $(VENV_PATH_PREFIX)/bin/python3 -m black $$CHANGED_FILES; \ $(VENV_PATH_PREFIX)/bin/python3 -m isort --profile black $$CHANGED_FILES sort-requirements: sort -o $(TEST_REQUIREMENTS) $(TEST_REQUIREMENTS) sort -o $(CORE_REQUIREMENTS) $(CORE_REQUIREMENTS) sort -o $(DATA_REQUIREMENTS) $(DATA_REQUIREMENTS) + sort -o $(FILES_REQUIREMENTS) $(FILES_REQUIREMENTS) sort -o $(REPORTS_REQUIREMENTS) $(REPORTS_REQUIREMENTS) sort -o $(ALERTING_REQUIREMENTS) $(ALERTING_REQUIREMENTS) @@ -106,6 +108,7 @@ check-packages: ./lambdas/venv/bin/pip-audit -r $(TEST_REQUIREMENTS) ./lambdas/venv/bin/pip-audit -r $(CORE_REQUIREMENTS) ./lambdas/venv/bin/pip-audit -r $(DATA_REQUIREMENTS) + ./lambdas/venv/bin/pip-audit -r $(FILES_REQUIREMENTS) ./lambdas/venv/bin/pip-audit -r $(REPORTS_REQUIREMENTS) ./lambdas/venv/bin/pip-audit -r $(ALERTING_REQUIREMENTS) @@ -206,6 +209,7 @@ env: @./lambdas/venv/bin/pip3 install -r $(TEST_REQUIREMENTS) --no-cache-dir @./lambdas/venv/bin/pip3 install -r $(CORE_REQUIREMENTS) --no-cache-dir @./lambdas/venv/bin/pip3 install -r $(DATA_REQUIREMENTS) --no-cache-dir + @./lambdas/venv/bin/pip3 install -r $(FILES_REQUIREMENTS) --no-cache-dir @./lambdas/venv/bin/pip3 install -r $(REPORTS_REQUIREMENTS) --no-cache-dir @./lambdas/venv/bin/pip3 install -r $(ALERTING_REQUIREMENTS) --no-cache-dir @echo " " diff --git a/lambdas/enums/document_status.py b/lambdas/enums/document_status.py index 2a16cd5bec..65ccac376b 100644 --- a/lambdas/enums/document_status.py +++ b/lambdas/enums/document_status.py @@ -6,6 +6,7 @@ class DocumentStatus(Enum): FORBIDDEN = ("forbidden", "UC_4003") NOT_FOUND = ("not-found", "UC_4004") INFECTED = ("infected", "UC_4005") + INVALID = ("invalid", "UC_4006") @property def code(self): diff --git a/lambdas/enums/lambda_error.py b/lambdas/enums/lambda_error.py index 57563fd3db..d5c2a62e5f 100644 --- a/lambdas/enums/lambda_error.py +++ b/lambdas/enums/lambda_error.py @@ -43,7 +43,9 @@ def create_error_response( return error_response def to_str( - self, params: Optional[dict] = None, details: Optional[str] = None + self, + params: Optional[dict] = None, + details: Optional[str] = None, ) -> str: message = self.value["message"] if "%" in message and params: @@ -59,7 +61,9 @@ def create_error_body( **kwargs, ) -> str: return self.create_error_response( - params=params, details=details, **kwargs + params=params, + details=details, + **kwargs, ).create() """ @@ -440,6 +444,10 @@ def create_error_body( "err_code": "UC_4005", "message": "Some of the given document references are not referring to clean files", } + UploadConfirmResultFilesInvalid = { + "err_code": "UC_4006", + "message": "Some of the given document references are password protected or corrupted", + } UploadConfirmResultAWSFailure = { "err_code": "UC_5004", "message": "Error occurred with an AWS service", diff --git a/lambdas/enums/virus_scan_result.py b/lambdas/enums/virus_scan_result.py index 484d9e9027..ade769ab5c 100644 --- a/lambdas/enums/virus_scan_result.py +++ b/lambdas/enums/virus_scan_result.py @@ -7,6 +7,7 @@ class VirusScanResult(StrEnum): INFECTED_ALLOWED = "InfectedAllowed" UNSCANNABLE = "Unscannable" ERROR = "Error" + INVALID = "Invalid" SCAN_RESULT_TAG_KEY = "scan-result" diff --git a/lambdas/requirements/layers/requirements_files_lambda_layer.txt b/lambdas/requirements/layers/requirements_files_lambda_layer.txt new file mode 100644 index 0000000000..0d4bff021d --- /dev/null +++ b/lambdas/requirements/layers/requirements_files_lambda_layer.txt @@ -0,0 +1 @@ +msoffcrypto-tool==6.0.0 \ No newline at end of file diff --git a/lambdas/ruff.toml b/lambdas/ruff.toml index 10a91ceaf7..9b92d3839a 100644 --- a/lambdas/ruff.toml +++ b/lambdas/ruff.toml @@ -33,7 +33,7 @@ line-length = 130 # Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or # McCabe complexity (`C901`) by default. # COM812: Enforce trailing commas on multi-line constructs. -select = ["E", "F", "COM812"] +select = ["E", "F", "COM812", "RET505"] ignore = [] # Allow autofix for all enabled rules (when `--fix`) is provided. diff --git a/lambdas/services/base/s3_service.py b/lambdas/services/base/s3_service.py index 3ee42b4e24..ef04724e61 100644 --- a/lambdas/services/base/s3_service.py +++ b/lambdas/services/base/s3_service.py @@ -40,7 +40,9 @@ def __init__(self, custom_aws_role=None): if custom_aws_role: self.iam_service = IAMService() self.custom_client, self.expiration_time = self.iam_service.assume_role( - self.custom_aws_role, "s3", config=self.config + self.custom_aws_role, + "s3", + config=self.config, ) # S3 Location should be a minimum of a s3_object_key but can also be a directory location in the form of @@ -48,11 +50,13 @@ def __init__(self, custom_aws_role=None): def create_upload_presigned_url(self, s3_bucket_name: str, s3_object_location: str): if self.custom_client: if datetime.now(timezone.utc) > self.expiration_time - timedelta( - minutes=10 + minutes=10, ): logger.info(S3Service.EXPIRED_SESSION_WARNING) self.custom_client, self.expiration_time = self.iam_service.assume_role( - self.custom_aws_role, "s3", config=self.config + self.custom_aws_role, + "s3", + config=self.config, ) return self.custom_client.generate_presigned_post( s3_bucket_name, @@ -65,11 +69,13 @@ def create_upload_presigned_url(self, s3_bucket_name: str, s3_object_location: s def create_put_presigned_url(self, s3_bucket_name: str, file_key: str): if self.custom_client: if datetime.now(timezone.utc) > self.expiration_time - timedelta( - minutes=10 + minutes=10, ): logger.info(S3Service.EXPIRED_SESSION_WARNING) self.custom_client, self.expiration_time = self.iam_service.assume_role( - self.custom_aws_role, "s3", config=self.config + self.custom_aws_role, + "s3", + config=self.config, ) logger.info("Generating presigned URL") return self.custom_client.generate_presigned_url( @@ -82,11 +88,13 @@ def create_put_presigned_url(self, s3_bucket_name: str, file_key: str): def create_download_presigned_url(self, s3_bucket_name: str, file_key: str): if self.custom_client: if datetime.now(timezone.utc) > self.expiration_time - timedelta( - minutes=10 + minutes=10, ): logger.info(S3Service.EXPIRED_SESSION_WARNING) self.custom_client, self.expiration_time = self.iam_service.assume_role( - self.custom_aws_role, "s3", config=self.config + self.custom_aws_role, + "s3", + config=self.config, ) logger.info("Generating presigned URL") return self.custom_client.generate_presigned_url( @@ -143,24 +151,32 @@ def copy_across_bucket( if_none_match, False, ) - else: - raise e + raise e else: logger.error(f"Copy failed: {e}") raise e def delete_object( - self, s3_bucket_name: str, file_key: str, version_id: str | None = None + self, + s3_bucket_name: str, + file_key: str, + version_id: str | None = None, ): if version_id is None: return self.client.delete_object(Bucket=s3_bucket_name, Key=file_key) return self.client.delete_object( - Bucket=s3_bucket_name, Key=file_key, VersionId=version_id + Bucket=s3_bucket_name, + Key=file_key, + VersionId=version_id, ) def create_object_tag( - self, s3_bucket_name: str, file_key: str, tag_key: str, tag_value: str + self, + s3_bucket_name: str, + file_key: str, + tag_key: str, + tag_value: str, ): return self.client.put_object_tagging( Bucket=s3_bucket_name, @@ -168,7 +184,7 @@ def create_object_tag( Tagging={ "TagSet": [ {"Key": tag_key, "Value": tag_value}, - ] + ], }, ) @@ -182,7 +198,7 @@ def get_tag_value(self, s3_bucket_name: str, file_key: str, tag_key: str) -> str return key_value_pair["Value"] raise TagNotFoundException( - f"Object {file_key} doesn't have a tag of key {tag_key}" + f"Object {file_key} doesn't have a tag of key {tag_key}", ) def file_exist_on_s3(self, s3_bucket_name: str, file_key: str) -> bool: @@ -218,8 +234,11 @@ def get_file_size(self, s3_bucket_name: str, object_key: str) -> int: def get_head_object(self, bucket: str, key: str): return self.client.head_object(Bucket=bucket, Key=key) - def get_object_stream(self, bucket: str, key: str): - response = self.client.get_object(Bucket=bucket, Key=key) + def get_object_stream(self, bucket: str, key: str, byte_range: str | None = None): + params = {"Bucket": bucket, "Key": key} + if byte_range: + params["Range"] = byte_range + response = self.client.get_object(**params) return response.get("Body") def stream_s3_object_to_memory(self, bucket: str, key: str) -> BytesIO: @@ -247,11 +266,13 @@ def upload_file_obj( logger.info(f"Uploaded file object to s3://{s3_bucket_name}/{file_key}") except ClientError as e: logger.error( - f"Failed to upload file object to s3://{s3_bucket_name}/{file_key} - {e}" + f"Failed to upload file object to s3://{s3_bucket_name}/{file_key} - {e}", ) raise e def save_or_create_file(self, source_bucket: str, file_key: str, body: bytes): return self.client.put_object( - Bucket=source_bucket, Key=file_key, Body=BytesIO(body) + Bucket=source_bucket, + Key=file_key, + Body=BytesIO(body), ) diff --git a/lambdas/services/get_document_upload_status.py b/lambdas/services/get_document_upload_status.py index 113b08908b..734671a3ec 100644 --- a/lambdas/services/get_document_upload_status.py +++ b/lambdas/services/get_document_upload_status.py @@ -24,12 +24,16 @@ def _determine_document_status(self, doc_ref, nhs_number): if doc_ref.doc_status == "cancelled": if doc_ref.virus_scanner_result == VirusScanResult.INFECTED: return DocumentStatus.INFECTED.display, DocumentStatus.INFECTED.code + if doc_ref.virus_scanner_result == VirusScanResult.INVALID: + return DocumentStatus.INVALID.display, DocumentStatus.INVALID.code return DocumentStatus.CANCELLED.display, DocumentStatus.CANCELLED.code return doc_ref.doc_status, None def get_document_references_by_id( - self, nhs_number: str, document_ids: list[str] + self, + nhs_number: str, + document_ids: list[str], ) -> dict: """ Checks the status of a list of documents for a given patient. @@ -42,7 +46,8 @@ def get_document_references_by_id( A dictionary with a list of document IDs and their corresponding statuses. """ found_docs = self.document_service.get_batch_document_references_by_id( - document_ids, SupportedDocumentTypes.LG + document_ids, + SupportedDocumentTypes.LG, ) found_docs_by_id = {doc.id: doc for doc in found_docs} results = {} diff --git a/lambdas/services/pdf_stitch_service.py b/lambdas/services/pdf_stitch_service.py deleted file mode 100755 index d08a20d1ab..0000000000 --- a/lambdas/services/pdf_stitch_service.py +++ /dev/null @@ -1,34 +0,0 @@ -import os -from uuid import uuid4 - -from pypdf import PdfReader, PdfWriter -from utils.audit_logging_setup import LoggingService - -logger = LoggingService(__name__) - - -def stitch_pdf(filenames: list[str], temp_folder: str = "/tmp/") -> str: - """ - Given a list of local PDF files, stitch them into one file and return the local file path of a resulting file. - - Example usage: - filenames = ["file1.pdf", "file2.pdf", "file3.pdf"] - tmp_folder = "/tmp/" - stitch_pdf(filename, tmp_folder) - - Result: - "/tmp/(filename_of_stitched_file).pdf" - """ - merger = PdfWriter() - for filename in filenames: - merger.append(filename) - output_filename = os.path.join(temp_folder, f"{str(uuid4())}.pdf") - merger.write(output_filename) - return output_filename - - -def count_page_number(filename: str) -> int: - """ - Return the total number of pages in a PDF file - """ - return len(PdfReader(filename).pages) diff --git a/lambdas/services/upload_document_reference_service.py b/lambdas/services/upload_document_reference_service.py index 685d2e1544..5917047d9a 100644 --- a/lambdas/services/upload_document_reference_service.py +++ b/lambdas/services/upload_document_reference_service.py @@ -1,3 +1,4 @@ +import io import os from typing import Optional @@ -21,6 +22,7 @@ FileProcessingException, TransactionConflictException, ) +from utils.file_utils import check_file_locked_or_corrupt from utils.lambda_exceptions import InvalidDocTypeException from utils.s3_utils import DocTypeS3BucketRouter from utils.utilities import get_virus_scan_service @@ -42,7 +44,9 @@ def __init__(self): self.bucket_router = DocTypeS3BucketRouter() def handle_upload_document_reference_request( - self, object_key: str, object_size: int = 0 + self, + object_key: str, + object_size: int = 0, ): """Handle the upload document reference request with comprehensive error handling""" if not object_key: @@ -58,13 +62,16 @@ def handle_upload_document_reference_request( self._get_infrastructure_for_document_key(object_parts) preliminary_document_reference = self._fetch_preliminary_document_reference( - document_key, nhs_number + document_key, + nhs_number, ) if not preliminary_document_reference: return self._process_preliminary_document_reference( - preliminary_document_reference, object_key, object_size + preliminary_document_reference, + object_key, + object_size, ) except Exception as e: @@ -86,12 +93,14 @@ def _get_infrastructure_for_document_key(self, object_parts: list[str]) -> None: self.destination_bucket_name = self.bucket_router.resolve(doc_type) except KeyError: logger.error( - f"SNOMED code {doc_type.code} - {doc_type.display_name} is not supported" + f"SNOMED code {doc_type.code} - {doc_type.display_name} is not supported", ) raise InvalidDocTypeException(400, LambdaError.DocTypeDB) def _fetch_preliminary_document_reference( - self, document_key: str, nhs_number: str | None = None + self, + document_key: str, + nhs_number: str | None = None, ) -> Optional[DocumentReference]: """Fetch document reference from the database""" try: @@ -101,7 +110,7 @@ def _fetch_preliminary_document_reference( else: if not nhs_number: logger.error( - f"Failed to process object key with ID: {document_key}" + f"Failed to process object key with ID: {document_key}", ) raise FileProcessingException(400, LambdaError.DocRefInvalidFiles) @@ -117,24 +126,24 @@ def _fetch_preliminary_document_reference( if not documents: logger.error( - f"No document with the following key found in {self.table_name} table: {document_key}" + f"No document with the following key found in {self.table_name} table: {document_key}", ) logger.info("Skipping this object") return None if len(documents) > 1: logger.warning( - f"Multiple documents found for key {document_key}, using first one" + f"Multiple documents found for key {document_key}, using first one", ) return documents[0] except ClientError as e: logger.error( - f"Error fetching document reference for key {document_key}: {str(e)}" + f"Error fetching document reference for key {document_key}: {str(e)}", ) raise DocumentServiceException( - f"Failed to fetch document reference: {str(e)}" + f"Failed to fetch document reference: {str(e)}", ) def _process_preliminary_document_reference( @@ -146,20 +155,37 @@ def _process_preliminary_document_reference( """Process the preliminary (uploading) document reference with virus scanning and file operations""" try: virus_scan_result = self._perform_virus_scan( - preliminary_document_reference, object_key + preliminary_document_reference, + object_key, ) - preliminary_document_reference.virus_scanner_result = virus_scan_result if virus_scan_result == VirusScanResult.CLEAN: - self._process_clean_document( - preliminary_document_reference, - object_key, - ) + is_file_protected = False + if getattr(preliminary_document_reference, "file_name", None): + file_type_extension = ( + preliminary_document_reference.file_name.split(".")[-1].lower() + ) + is_file_protected = self.is_file_invalid( + object_key, + file_type_extension, + ) + if is_file_protected: + logger.warning( + f"Document {preliminary_document_reference.id} is password protected or corrupt, " + f"marking as such in database", + ) + virus_scan_result = VirusScanResult.INVALID + else: + self._process_clean_document( + preliminary_document_reference, + object_key, + ) else: logger.warning( - f"Document {preliminary_document_reference.id} failed virus scan" + f"Document {preliminary_document_reference.id} failed virus scan", ) + preliminary_document_reference.virus_scanner_result = virus_scan_result preliminary_document_reference.file_size = object_size preliminary_document_reference.uploaded = True preliminary_document_reference.uploading = False @@ -173,7 +199,7 @@ def _process_preliminary_document_reference( and self.doc_type.code != SnomedCodes.PATIENT_DATA.value.code ): self._finalize_and_supersede_with_transaction( - preliminary_document_reference + preliminary_document_reference, ) # Update NRL Pointer @@ -184,7 +210,7 @@ def _process_preliminary_document_reference( except Exception as e: logger.error( - f"Error processing document reference {preliminary_document_reference.id}: {str(e)}" + f"Error processing document reference {preliminary_document_reference.id}: {str(e)}", ) raise @@ -199,7 +225,7 @@ def _finalize_and_supersede_with_transaction(self, new_document: DocumentReferen """ try: logger.info( - f"Checking for existing final documents to supersede for NHS number {new_document.nhs_number}" + f"Checking for existing final documents to supersede for NHS number {new_document.nhs_number}", ) existing_docs: list[DocumentReference] = ( @@ -243,7 +269,7 @@ def _finalize_and_supersede_with_transaction(self, new_document: DocumentReferen # Supersede existing final documents if existing_docs: logger.info( - f"Superseding {len(existing_docs)} existing final document(s) for NHS number {new_document.nhs_number}" + f"Superseding {len(existing_docs)} existing final document(s) for NHS number {new_document.nhs_number}", ) for doc in existing_docs: @@ -297,77 +323,85 @@ def _finalize_and_supersede_with_transaction(self, new_document: DocumentReferen f" and superseded {len(existing_docs)} document(s)" if existing_docs else "" - ) + ), ) except ClientError as e: error_code = e.response.get("Error", {}).get("Code", "") if error_code == "TransactionCanceledException": logger.error( - f"Transaction cancelled - concurrent update detected for NHS number {new_document.nhs_number}" + f"Transaction cancelled - concurrent update detected for NHS number {new_document.nhs_number}", ) raise TransactionConflictException( f"Concurrent update detected while finalizing document for NHS number {new_document.nhs_number}. " - f"Another process may have already finalized a document for this patient." + f"Another process may have already finalized a document for this patient.", ) raise except Exception as e: if isinstance(e, TransactionConflictException): logger.error( - f"Cancelling preliminary document {new_document.id} due to transaction conflict" + f"Cancelling preliminary document {new_document.id} due to transaction conflict", ) else: logger.error( - f"Unexpected error while finalizing document for {new_document.nhs_number}: {e}" + f"Unexpected error while finalizing document for {new_document.nhs_number}: {e}", ) - + new_document.doc_status = "cancelled" new_document.uploaded = False new_document.uploading = False new_document.file_size = None self._update_dynamo_table(new_document) self.delete_file_from_bucket( - new_document.file_location, new_document.s3_version_id + new_document.file_location, + new_document.s3_version_id, ) def document_reference_key(self, document_id): return {DocumentReferenceMetadataFields.ID.value: document_id} def _perform_virus_scan( - self, document_reference: DocumentReference, object_key: str + self, + document_reference: DocumentReference, + object_key: str, ) -> VirusScanResult: """Perform a virus scan on the document""" try: return self.virus_scan_service.scan_file( - object_key, nhs_number=document_reference.nhs_number + object_key, + nhs_number=document_reference.nhs_number, ) except Exception as e: logger.error( - f"Virus scan failed for document {document_reference.id}: {str(e)}" + f"Virus scan failed for document {document_reference.id}: {str(e)}", ) return VirusScanResult.ERROR def _process_clean_document( - self, document_reference: DocumentReference, object_key: str + self, + document_reference: DocumentReference, + object_key: str, ): """Process a document that passed virus scanning""" try: self.copy_files_from_staging_bucket(document_reference, object_key) - + logger.info( - f"Successfully processed clean document: {document_reference.id}" + f"Successfully processed clean document: {document_reference.id}", ) except Exception as e: logger.error( - f"Error processing clean document {document_reference.id}: {str(e)}" + f"Error processing clean document {document_reference.id}: {str(e)}", ) document_reference.doc_status = "cancelled" raise FileProcessingException(f"Failed to process clean document: {str(e)}") def copy_files_from_staging_bucket( - self, document_reference: DocumentReference, source_file_key: str + self, + document_reference: DocumentReference, + source_file_key: str, ): """Copy files from staging bucket to destination bucket""" try: @@ -389,7 +423,8 @@ def copy_files_from_staging_bucket( ) document_reference.s3_bucket_name = self.destination_bucket_name document_reference.file_location = document_reference._build_s3_location( - self.destination_bucket_name, dest_file_key + self.destination_bucket_name, + dest_file_key, ) document_reference.s3_version_id = copy_result.get("VersionId") return copy_result @@ -397,7 +432,7 @@ def copy_files_from_staging_bucket( except ClientError as e: logger.error(f"Error copying files from staging bucket: {str(e)}") raise FileProcessingException( - f"Failed to copy file from staging bucket: {str(e)}" + f"Failed to copy file from staging bucket: {str(e)}", ) def delete_file_from_staging_bucket(self, source_file_key: str): @@ -413,10 +448,10 @@ def delete_file_from_bucket(self, file_location: str, version_id: str): """Delete file from bucket""" try: s3_bucket_name, source_file_key = DocumentReference._parse_s3_location( - file_location + file_location, ) logger.info( - f"Deleting file from bucket: {s3_bucket_name}/{source_file_key}" + f"Deleting file from bucket: {s3_bucket_name}/{source_file_key}", ) self.s3_service.delete_object(s3_bucket_name, source_file_key, version_id) @@ -457,5 +492,13 @@ def _update_dynamo_table( except ClientError as e: logger.error(f"Error updating DynamoDB table: {str(e)}") raise DocumentServiceException( - f"Failed to update document in database: {str(e)}" + f"Failed to update document in database: {str(e)}", ) + + def is_file_invalid(self, object_key: str, file_type_extension: str) -> bool: + entire_object = self.s3_service.get_object_stream( + self.staging_s3_bucket_name, + object_key, + ) + file_stream = io.BytesIO(entire_object.read()) + return check_file_locked_or_corrupt(file_stream, file_type_extension) diff --git a/lambdas/tests/unit/services/test_get_document_upload_status.py b/lambdas/tests/unit/services/test_get_document_upload_status.py index b646a327f0..3000e11a6c 100644 --- a/lambdas/tests/unit/services/test_get_document_upload_status.py +++ b/lambdas/tests/unit/services/test_get_document_upload_status.py @@ -15,9 +15,12 @@ def mock_document_service(): @pytest.fixture -def get_document_upload_status_service(mock_document_service): +def get_document_upload_status_service(mock_document_service, mocker): + mocker.patch( + "services.get_document_upload_status.DocumentService", + return_value=mock_document_service, + ) service = GetDocumentUploadStatusService() - service.document_service = mock_document_service return service @@ -80,11 +83,13 @@ def test_get_document_references_by_id_found_documents( ) result = get_document_upload_status_service.get_document_references_by_id( - nhs_number, document_ids + nhs_number, + document_ids, ) mock_document_service.get_batch_document_references_by_id.assert_called_once_with( - document_ids, SupportedDocumentTypes.LG + document_ids, + SupportedDocumentTypes.LG, ) assert len(result) == 2 assert result["doc-id-1"]["status"] == "final" @@ -101,11 +106,12 @@ def test_get_document_references_by_id_not_found_documents( nhs_number = "1234567890" document_ids = ["doc-id-1", "non-existent-id"] mock_document_service.get_batch_document_references_by_id.return_value = [ - sample_document_references[0] + sample_document_references[0], ] result = get_document_upload_status_service.get_document_references_by_id( - nhs_number, document_ids + nhs_number, + document_ids, ) assert len(result) == 2 @@ -123,11 +129,12 @@ def test_get_document_references_by_id_access_denied( nhs_number = "1234567890" document_ids = ["doc-id-3"] mock_document_service.get_batch_document_references_by_id.return_value = [ - sample_document_references[2] + sample_document_references[2], ] result = get_document_upload_status_service.get_document_references_by_id( - nhs_number, document_ids + nhs_number, + document_ids, ) assert len(result) == 1 @@ -143,11 +150,12 @@ def test_get_document_references_by_id_infected_document( nhs_number = "1234567890" document_ids = ["doc-id-4"] mock_document_service.get_batch_document_references_by_id.return_value = [ - sample_document_references[3] + sample_document_references[3], ] result = get_document_upload_status_service.get_document_references_by_id( - nhs_number, document_ids + nhs_number, + document_ids, ) assert len(result) == 1 @@ -155,8 +163,38 @@ def test_get_document_references_by_id_infected_document( assert result["doc-id-4"]["error_code"] == DocumentStatus.INFECTED.code +def test_get_document_references_by_id_invalid_document( + get_document_upload_status_service, + mock_document_service, +): + nhs_number = "1234567890" + document_ids = ["doc-id-invalid"] + + cancelled_doc = DocumentReference( + id="doc-id-invalid", + nhs_number="1234567890", + file_name="invalid_file.pdf", + doc_status=DocumentStatus.CANCELLED.display, + virus_scanner_result=VirusScanResult.INVALID, + ) + + mock_document_service.get_batch_document_references_by_id.return_value = [ + cancelled_doc, + ] + + result = get_document_upload_status_service.get_document_references_by_id( + nhs_number, + document_ids, + ) + + assert len(result) == 1 + assert result["doc-id-invalid"]["status"] == DocumentStatus.INVALID.display + assert result["doc-id-invalid"]["error_code"] == DocumentStatus.INVALID.code + + def test_get_document_references_by_id_cancelled_document( - get_document_upload_status_service, mock_document_service + get_document_upload_status_service, + mock_document_service, ): nhs_number = "1234567890" document_ids = ["doc-id-cancelled"] @@ -171,11 +209,12 @@ def test_get_document_references_by_id_cancelled_document( ) mock_document_service.get_batch_document_references_by_id.return_value = [ - cancelled_doc + cancelled_doc, ] result = get_document_upload_status_service.get_document_references_by_id( - nhs_number, document_ids + nhs_number, + document_ids, ) assert len(result) == 1 @@ -191,11 +230,12 @@ def test_get_document_references_by_id_deleted_document( nhs_number = "1234567890" document_ids = ["doc-id-5"] mock_document_service.get_batch_document_references_by_id.return_value = [ - sample_document_references[4] + sample_document_references[4], ] result = get_document_upload_status_service.get_document_references_by_id( - nhs_number, document_ids + nhs_number, + document_ids, ) assert len(result) == 0 @@ -216,7 +256,8 @@ def test_get_document_references_by_id_multiple_mixed_statuses( ] result = get_document_upload_status_service.get_document_references_by_id( - nhs_number, document_ids + nhs_number, + document_ids, ) assert len(result) == 4 @@ -236,18 +277,21 @@ def test_get_document_references_by_id_multiple_mixed_statuses( def test_get_document_references_by_id_no_results( - get_document_upload_status_service, mock_document_service + get_document_upload_status_service, + mock_document_service, ): nhs_number = "1234567890" document_ids = ["doc-id-6"] mock_document_service.get_batch_document_references_by_id.return_value = [] result = get_document_upload_status_service.get_document_references_by_id( - nhs_number, document_ids + nhs_number, + document_ids, ) mock_document_service.get_batch_document_references_by_id.assert_called_once_with( - document_ids, SupportedDocumentTypes.LG + document_ids, + SupportedDocumentTypes.LG, ) assert result["doc-id-6"]["status"] == DocumentStatus.NOT_FOUND.display assert result["doc-id-6"]["error_code"] == DocumentStatus.NOT_FOUND.code diff --git a/lambdas/tests/unit/services/test_pdf_stitch_service.py b/lambdas/tests/unit/services/test_pdf_stitch_service.py deleted file mode 100644 index 2634e1a1fb..0000000000 --- a/lambdas/tests/unit/services/test_pdf_stitch_service.py +++ /dev/null @@ -1,68 +0,0 @@ -import os -import tempfile -from io import BytesIO - -import pytest -from pypdf import PdfWriter -from pypdf.errors import PyPdfError -from services.pdf_stitch_service import count_page_number, stitch_pdf - - -def test_stitch_pdf(): - test_pdf_folder = "tests/unit/helpers/data/pdf/" - input_test_files = [ - f"{test_pdf_folder}/{filename}" - for filename in ["file1.pdf", "file2.pdf", "file3.pdf"] - ] - - stitched_file = stitch_pdf(input_test_files) - assert count_page_number(stitched_file) == sum( - count_page_number(filepath) for filepath in input_test_files - ) - - os.remove(stitched_file) - - -def test_stitch_pdf_with_given_desc_folder(): - test_pdf_folder = "tests/unit/helpers/data/pdf/" - test_desc_folder = tempfile.mkdtemp() - - input_test_files = [ - f"{test_pdf_folder}/{filename}" - for filename in ["file1.pdf", "file2.pdf", "file3.pdf"] - ] - - stitched_file = stitch_pdf(input_test_files, test_desc_folder) - - assert stitched_file.startswith(test_desc_folder) - - os.remove(stitched_file) - - -def test_stitch_pdf_raise_error_if_fail_to_perform_stitching(): - test_pdf_folder = "tests/unit/helpers/data/pdf/" - input_test_files = [ - f"{test_pdf_folder}/{filename}" for filename in ["invalid_pdf.pdf", "file1.pdf"] - ] - - with pytest.raises(PyPdfError): - stitch_pdf(input_test_files) - - -def test_stitch_pdf_raise_error_when_input_file_not_found(): - test_file = "non-exist-file.pdf" - - with pytest.raises(FileNotFoundError): - stitch_pdf([test_file]) - - -def create_in_memory_pdf(page_count: int = 1) -> BytesIO: - # Creates a PDF in memory with the received number of pages - writer = PdfWriter() - for _ in range(page_count): - writer.add_blank_page(width=72, height=72) - - stream = BytesIO() - writer.write(stream) - stream.seek(0) - return stream diff --git a/lambdas/tests/unit/services/test_upload_document_reference_service.py b/lambdas/tests/unit/services/test_upload_document_reference_service.py index e030d461fd..d8b6a5e865 100644 --- a/lambdas/tests/unit/services/test_upload_document_reference_service.py +++ b/lambdas/tests/unit/services/test_upload_document_reference_service.py @@ -11,7 +11,7 @@ FinalOrPreliminaryAndNotSuperseded, PreliminaryStatus, ) -from utils.exceptions import DocumentServiceException, FileProcessingException, TransactionConflictException +from utils.exceptions import DocumentServiceException, FileProcessingException from utils.lambda_exceptions import InvalidDocTypeException from lambdas.enums.snomed_codes import SnomedCodes @@ -23,6 +23,7 @@ def mock_document_reference(): doc_ref = DocumentReference.model_construct() doc_ref.id = "test-doc-id" doc_ref.nhs_number = "9000000001" + doc_ref.file_name = "test-file.txt" doc_ref.s3_file_key = "original/test-key" doc_ref.s3_bucket_name = "original-bucket" doc_ref.file_location = "original-location" @@ -31,7 +32,7 @@ def mock_document_reference(): doc_ref.doc_status = "preliminary" doc_ref.version = "1" doc_ref._build_s3_location = Mock( - return_value="s3://test-lg-bucket/9000000001/test-doc-id" + return_value="s3://test-lg-bucket/9000000001/test-doc-id", ) return doc_ref @@ -41,13 +42,13 @@ def mock_virus_scan_service( mocker, ): mock = mocker.patch( - "services.upload_document_reference_service.get_virus_scan_service" + "services.upload_document_reference_service.get_virus_scan_service", ) yield mock @pytest.fixture -def service(set_env, mock_virus_scan_service): +def service(set_env, mock_virus_scan_service, mocker): with patch.multiple( "services.upload_document_reference_service", DocumentService=Mock(), @@ -59,6 +60,11 @@ def service(set_env, mock_virus_scan_service): service.dynamo_service = Mock() service.virus_scan_service = MockVirusScanService() service.s3_service = Mock() + mocker.patch("io.BytesIO", return_value=None) + mocker.patch( + "services.upload_document_reference_service.check_file_locked_or_corrupt", + return_value=False, + ) return service @@ -71,33 +77,39 @@ def test_handle_upload_document_reference_request_with_empty_object_key(service) def test_handle_upload_document_reference_request_with_none_object_key(service): """Test handling of a None object key""" - service.handle_upload_document_reference_request(None, 122) + service.handle_upload_document_reference_request("", 122) service.document_service.fetch_documents_from_table.assert_not_called() def test_handle_upload_document_reference_request_success( - service, mock_document_reference, mocker + service, + mock_document_reference, + mocker, ): - """Test successful handling of the upload document reference request""" object_key = "staging/test-doc-id" object_size = 1111 mock_document_reference2 = Mock(spec=DocumentReference) + mock_document_reference2.file_name = "filename2.txt" mock_document_reference2.id = "another-doc-id" mock_document_reference2.doc_status = "final" mock_document_reference2.version = "1" service.s3_service.copy_across_bucket.return_value = { - "VersionId": "test-version-id" + "VersionId": "test-version-id", } - + mocker.patch("io.BytesIO", return_value=None) + mocker.patch( + "services.upload_document_reference_service.check_file_locked_or_corrupt", + return_value=False, + ) # First call fetches preliminary doc, second call fetches existing final docs to supersede service.document_service.fetch_documents_from_table.side_effect = [ [mock_document_reference], [mock_document_reference2], ] service.virus_scan_service.scan_file = mocker.MagicMock( - return_value=VirusScanResult.CLEAN + return_value=VirusScanResult.CLEAN, ) service.handle_upload_document_reference_request(object_key, object_size) @@ -114,7 +126,7 @@ def test_handle_upload_document_reference_request_with_exception(service): object_key = "staging/test-doc-id" service.document_service.fetch_documents_from_table.side_effect = Exception( - "Test error" + "Test error", ) service.handle_upload_document_reference_request(object_key) @@ -125,7 +137,7 @@ def test_fetch_preliminary_document_reference_success(service, mock_document_ref document_key = "test-doc-id" service.table_name = "dev_LloydGeorgeReferenceMetadata" service.document_service.fetch_documents_from_table.return_value = [ - mock_document_reference + mock_document_reference, ] result = service._fetch_preliminary_document_reference(document_key) @@ -150,7 +162,8 @@ def test_fetch_preliminary_document_reference_no_documents_found(service): def test_fetch_preliminary_document_reference_multiple_documents_warning( - service, mock_document_reference + service, + mock_document_reference, ): """Test handling when multiple documents are found""" document_key = "test-doc-id" @@ -177,23 +190,30 @@ def test_fetch_preliminary_document_reference_exception(service): def test__process_preliminary_document_reference_clean_virus_scan( - service, mock_document_reference, mocker + service, + mock_document_reference, + mocker, ): """Test processing document reference with a clean virus scan""" object_key = "staging/test-doc-id" mocker.patch.object( - service, "_perform_virus_scan", return_value=VirusScanResult.CLEAN + service, + "_perform_virus_scan", + return_value=VirusScanResult.CLEAN, ) mock_delete = mocker.patch.object(service, "delete_file_from_staging_bucket") mock_process_clean = mocker.patch.object(service, "_process_clean_document") mock_finalize_transaction = mocker.patch.object( - service, "_finalize_and_supersede_with_transaction" + service, + "_finalize_and_supersede_with_transaction", ) service._process_preliminary_document_reference( - mock_document_reference, object_key, 1222 + mock_document_reference, + object_key, + 1222, ) mock_process_clean.assert_called_once() @@ -205,26 +225,33 @@ def test__process_preliminary_document_reference_clean_virus_scan( def test__process_preliminary_document_reference_infected_virus_scan( - service, mock_document_reference, mocker + service, + mock_document_reference, + mocker, ): """Test processing document reference with an infected virus scan""" object_key = "staging/test-doc-id" mocker.patch.object( - service, "_perform_virus_scan", return_value=VirusScanResult.INFECTED + service, + "_perform_virus_scan", + return_value=VirusScanResult.INFECTED, ) mock_delete = mocker.patch.object(service, "delete_file_from_staging_bucket") mock_process_clean = mocker.patch.object(service, "_process_clean_document") mock_update_dynamo = mocker.patch.object(service, "_update_dynamo_table") service._process_preliminary_document_reference( - mock_document_reference, object_key, 1222 + mock_document_reference, + object_key, + 1222, ) mock_process_clean.assert_not_called() mock_update_dynamo.assert_called_once() mock_delete.assert_called_once_with(object_key) + def test_perform_virus_scan_returns_clean_hardcoded(service, mock_document_reference): """Test virus scan returns hardcoded CLEAN result""" object_key = "staging/test-doc-id" @@ -233,7 +260,9 @@ def test_perform_virus_scan_returns_clean_hardcoded(service, mock_document_refer def test_perform_virus_scan_exception_returns_infected( - service, mock_document_reference, mocker + service, + mock_document_reference, + mocker, ): """Test virus scan exception handling returns INFECTED for safety""" mock_virus_service = mocker.patch.object(service, "virus_scan_service") @@ -260,7 +289,9 @@ def test_process_clean_document_success(service, mock_document_reference, mocker def test_process_clean_document_exception_restores_original_values( - service, mock_document_reference, mocker + service, + mock_document_reference, + mocker, ): """Test that original values are restored when processing fails""" object_key = "staging/test-doc-id" @@ -269,7 +300,9 @@ def test_process_clean_document_exception_restores_original_values( original_location = "original-location" mocker.patch.object( - service, "copy_files_from_staging_bucket", side_effect=Exception("Copy failed") + service, + "copy_files_from_staging_bucket", + side_effect=Exception("Copy failed"), ) with pytest.raises(FileProcessingException): service._process_clean_document( @@ -306,7 +339,7 @@ def test_copy_files_from_staging_bucket_client_error(service, mock_document_refe source_file_key = "staging/test-doc-id" client_error = ClientError( error_response={ - "Error": {"Code": "NoSuchBucket", "Message": "Bucket does not exist"} + "Error": {"Code": "NoSuchBucket", "Message": "Bucket does not exist"}, }, operation_name="CopyObject", ) @@ -323,7 +356,8 @@ def test_delete_file_from_staging_bucket_success(service): service.delete_file_from_staging_bucket(source_file_key) service.s3_service.delete_object.assert_called_once_with( - MOCK_STAGING_STORE_BUCKET, source_file_key + MOCK_STAGING_STORE_BUCKET, + source_file_key, ) @@ -336,7 +370,8 @@ def test_delete_pdm_file_from_staging_bucket_success(service): service.delete_file_from_staging_bucket(source_file_key) service.s3_service.delete_object.assert_called_once_with( - MOCK_STAGING_STORE_BUCKET, source_file_key + MOCK_STAGING_STORE_BUCKET, + source_file_key, ) @@ -345,7 +380,7 @@ def test_delete_file_from_staging_bucket_client_error(service): source_file_key = "staging/test-doc-id" client_error = ClientError( error_response={ - "Error": {"Code": "NoSuchKey", "Message": "Key does not exist"} + "Error": {"Code": "NoSuchKey", "Message": "Key does not exist"}, }, operation_name="DeleteObject", ) @@ -389,7 +424,10 @@ def test_update_dynamo_table_client_error(service, mock_document_reference): """Test handling of ClientError during DynamoDB update""" client_error = ClientError( error_response={ - "Error": {"Code": "ResourceNotFoundException", "Message": "Table not found"} + "Error": { + "Code": "ResourceNotFoundException", + "Message": "Table not found", + }, }, operation_name="UpdateItem", ) @@ -418,7 +456,7 @@ def test_document_key_extraction_from_object_key_for_lg( # First call returns preliminary doc, second call returns empty list (no existing finals) service.s3_service.copy_across_bucket.return_value = { - "VersionId": "test-version-id" + "VersionId": "test-version-id", } service.document_service.fetch_documents_from_table.side_effect = [ @@ -446,7 +484,8 @@ def test_document_key_extraction_from_object_key_for_lg( def test_finalize_and_supersede_with_transaction_with_existing_finals( - service, mock_document_reference, mocker + service, + mock_document_reference, ): """Test transaction-based finalisation with existing final documents to supersede""" new_doc = mock_document_reference @@ -463,7 +502,7 @@ def test_finalize_and_supersede_with_transaction_with_existing_finals( service.table_name = "dev_LloydGeorgeReferenceMetadata" service.document_service.fetch_documents_from_table.return_value = [ - existing_final_doc + existing_final_doc, ] mock_build_update = Mock(return_value={"Update": "transaction1"}) @@ -511,7 +550,8 @@ def test_finalize_and_supersede_with_transaction_with_existing_finals( def test_finalize_and_supersede_with_transaction_no_existing_docs( - service, mock_document_reference, mocker + service, + mock_document_reference, ): """Test transaction-based finalization when no existing final documents found""" new_doc = mock_document_reference @@ -535,7 +575,8 @@ def test_finalize_and_supersede_with_transaction_no_existing_docs( def test_finalize_and_supersede_with_transaction_multiple_existing( - service, mock_document_reference, mocker + service, + mock_document_reference, ): """Test transaction-based finalization superseding multiple existing final documents""" new_doc = mock_document_reference @@ -571,7 +612,8 @@ def test_finalize_and_supersede_with_transaction_multiple_existing( def test_finalize_and_supersede_with_transaction_skips_same_id( - service, mock_document_reference, mocker + service, + mock_document_reference, ): """Test that transaction skips documents with the same ID""" new_doc = mock_document_reference @@ -599,7 +641,8 @@ def test_finalize_and_supersede_with_transaction_skips_same_id( def test_finalize_and_supersede_with_transaction_handles_transaction_cancelled( - service, mock_document_reference + service, + mock_document_reference, ): new_doc = mock_document_reference @@ -639,25 +682,34 @@ def test_handle_upload_document_reference_request_no_document_found(service): def test_process_preliminary_document_reference_exception_during_processing( - service, mock_document_reference, mocker + service, + mock_document_reference, + mocker, ): """Test that exceptions during processing are properly raised""" object_key = "staging/test-doc-id" mocker.patch.object( - service, "_perform_virus_scan", return_value=VirusScanResult.CLEAN + service, + "_perform_virus_scan", + return_value=VirusScanResult.CLEAN, ) mocker.patch.object( - service, "_process_clean_document", side_effect=Exception("Processing failed") + service, + "_process_clean_document", + side_effect=Exception("Processing failed"), ) with pytest.raises(Exception) as exc_info: service._process_preliminary_document_reference( - mock_document_reference, object_key, 1222 + mock_document_reference, + object_key, + 1222, ) assert "Processing failed" in str(exc_info.value) + def test_get_infrastructure_for_document_key_non_pdm(service): assert service.table_name == "" infra = service._get_infrastructure_for_document_key(object_parts=["1234", "123"]) @@ -690,3 +742,32 @@ def test_get_infra_invalid_doc_type(monkeypatch, service): # Call function and assert the exception is raised with pytest.raises(InvalidDocTypeException): service._get_infrastructure_for_document_key(["fhir_upload", "999999"]) + + +def test_is_file_invalid_calls_correct_functions(service, mocker): + """Test that is_file_invalid calls the right functions in the correct order""" + object_key = "test-folder/test-file.docx" + file_extension = "docx" + file_content = b"fake docx file content" + + mock_stream = Mock() + mock_stream.read.return_value = file_content + service.s3_service.get_object_stream.return_value = mock_stream + mock_bytesio = mocker.patch("services.upload_document_reference_service.io.BytesIO") + mock_file_stream = Mock() + mock_bytesio.return_value = mock_file_stream + mock_check = mocker.patch( + "services.upload_document_reference_service.check_file_locked_or_corrupt", + return_value=True, + ) + + result = service.is_file_invalid(object_key, file_extension) + + assert result is True + service.s3_service.get_object_stream.assert_called_once_with( + service.staging_s3_bucket_name, + object_key, + ) + mock_stream.read.assert_called_once_with() + mock_bytesio.assert_called_once_with(file_content) + mock_check.assert_called_once_with(mock_file_stream, file_extension) diff --git a/lambdas/tests/unit/utils/test_file_utils.py b/lambdas/tests/unit/utils/test_file_utils.py index 3a225f1841..a6dd202289 100644 --- a/lambdas/tests/unit/utils/test_file_utils.py +++ b/lambdas/tests/unit/utils/test_file_utils.py @@ -1,4 +1,11 @@ -from utils.file_utils import convert_csv_dictionary_to_bytes +from io import BytesIO +from unittest.mock import MagicMock, Mock, patch + +import pytest +from utils.file_utils import ( + check_file_locked_or_corrupt, + convert_csv_dictionary_to_bytes, +) def test_convert_csv_dictionary_to_bytes(): @@ -9,10 +16,144 @@ def test_convert_csv_dictionary_to_bytes(): ] result_bytes = convert_csv_dictionary_to_bytes( - headers=headers, csv_dict_data=metadata_csv_data, encoding="utf-8" + headers=headers, + csv_dict_data=metadata_csv_data, + encoding="utf-8", ) result_str = result_bytes.decode("utf-8") expected_output = "id,name,age\r\n1,Alice,30\r\n2,Bob,25\r\n" assert result_str == expected_output + + +@pytest.mark.parametrize( + "file_extension,file_content,expected_result", + [ + ("pdf", b"%PDF-1.4\nsome content", False), + ("zip", b"PK\x03\x04some zip content", False), + ], + ids=["pdf_file", "zip_file"], +) +def test_skipped_file_types(file_extension, file_content, expected_result): + file_stream = BytesIO(file_content) + result = check_file_locked_or_corrupt(file_stream, file_extension) + assert result == expected_result + + +@pytest.mark.parametrize( + "file_extension,is_encrypted,expected_result", + [ + ("docx", False, False), + ("docx", True, True), + ("xlsx", False, False), + ("xlsx", True, True), + ("pptx", False, False), + ("pptx", True, True), + ("doc", False, False), + ("doc", True, True), + ("xls", False, False), + ("xls", True, True), + ("ppt", False, False), + ("ppt", True, True), + ], +) +@patch("utils.file_utils.msoffcrypto.OfficeFile") +def test_office_files(mock_office_file, file_extension, is_encrypted, expected_result): + mock_instance = Mock() + mock_instance.is_encrypted.return_value = is_encrypted + mock_office_file.return_value = mock_instance + + file_stream = BytesIO(b"fake office content") + result = check_file_locked_or_corrupt(file_stream, file_extension) + + assert result == expected_result + mock_office_file.assert_called_once_with(file_stream) + mock_instance.is_encrypted.assert_called_once() + + +@pytest.mark.parametrize( + "file_extension,file_content,expected_result", + [ + ( + "rtf", + b"{\\rtf1\\ansi\\deff0 {\\fonttbl {\\f0 Times New Roman;}}\\f0\\fs60 Hello!}", + False, + ), + ("rtf", b"This is not an RTF file", True), + ("csv", b"name,age,city\nAlice,30,NYC\nBob,25,LA", False), + ("csv", b"\xff\xfe Invalid UTF-8", True), + ("json", b'{"key": "value", "number": 123}', False), + ("txt", b"This is a simple text file.\nWith multiple lines.", False), + ("txt", b"", False), + ("xml", b'data', False), + ], +) +def test_text_based_files(file_extension, file_content, expected_result): + file_stream = BytesIO(file_content) + result = check_file_locked_or_corrupt(file_stream, file_extension) + assert result == expected_result + + +@pytest.mark.parametrize( + "file_extension", + [ + "jpg", + "jpeg", + "png", + "tiff", + "tif", + ], + ids=["jpg", "jpeg", "png", "tiff", "tif"], +) +@patch("utils.file_utils.Image.open") +def test_image_files_valid(mock_image_open, file_extension): + mock_img = MagicMock() + mock_image_open.return_value.__enter__.return_value = mock_img + + file_stream = BytesIO(b"fake image content") + result = check_file_locked_or_corrupt(file_stream, file_extension) + + assert result is False + mock_image_open.assert_called_once_with(file_stream) + mock_img.verify.assert_called_once() + + +@pytest.mark.parametrize( + "file_extension", + ["jpg", "png", "tiff"], +) +@patch("utils.file_utils.Image.open") +def test_image_files_corrupt(mock_image_open, file_extension): + mock_image_open.side_effect = Exception("Corrupt image") + + file_stream = BytesIO(b"corrupt image data") + result = check_file_locked_or_corrupt(file_stream, file_extension) + + assert result is True + + +@pytest.mark.parametrize( + "file_extension", + ["unknown", "mp4", "mp3", "avi", "mov"], + ids=["unknown", "mp4", "mp3", "avi", "mov"], +) +def test_unsupported_file_extensions(file_extension): + file_stream = BytesIO(b"some content") + result = check_file_locked_or_corrupt(file_stream, file_extension) + assert result is False + + +@pytest.mark.parametrize( + "file_extension", + ["docx", "xlsx", "pptx", "doc", "xls"], + ids=["docx", "xlsx", "pptx", "doc", "xls"], +) +@patch("utils.file_utils.msoffcrypto.OfficeFile") +def test_office_file_exception_returns_true(mock_office_file, file_extension): + mock_office_file.side_effect = Exception("Unable to process file") + + file_stream = BytesIO(b"corrupt office content") + result = check_file_locked_or_corrupt(file_stream, file_extension) + + assert result is True diff --git a/lambdas/utils/constants/file_extensions.py b/lambdas/utils/constants/file_extensions.py new file mode 100644 index 0000000000..4e04dfd004 --- /dev/null +++ b/lambdas/utils/constants/file_extensions.py @@ -0,0 +1,3 @@ +TEXT_FILE_EXTENSIONS = ["rtf", "csv", "json", "txt", "xml"] +MEDIA_FILE_EXTENSIONS = ["jpg", "jpeg", "png", "tiff", "tif"] +MICROSOFT_OFFICE_FILE_EXTENSIONS = ["docx", "xlsx", "pptx", "doc", "xls", "ppt"] diff --git a/lambdas/utils/file_utils.py b/lambdas/utils/file_utils.py index ec24c67bc3..90e5d9ed0b 100644 --- a/lambdas/utils/file_utils.py +++ b/lambdas/utils/file_utils.py @@ -1,9 +1,22 @@ import csv from io import BytesIO, TextIOWrapper +import msoffcrypto +from PIL import Image +from utils.audit_logging_setup import LoggingService +from utils.constants.file_extensions import ( + MEDIA_FILE_EXTENSIONS, + MICROSOFT_OFFICE_FILE_EXTENSIONS, + TEXT_FILE_EXTENSIONS, +) + +logger = LoggingService(__name__) + def convert_csv_dictionary_to_bytes( - headers: list[str], csv_dict_data: list[dict], encoding: str = "utf-8" + headers: list[str], + csv_dict_data: list[dict], + encoding: str = "utf-8", ) -> bytes: csv_buffer = BytesIO() csv_text_wrapper = TextIOWrapper(csv_buffer, encoding=encoding, newline="") @@ -20,3 +33,40 @@ def convert_csv_dictionary_to_bytes( csv_buffer.close() return result + + +def check_file_locked_or_corrupt(file_stream, ext): + file_stream.seek(0) + try: + if ext == "pdf" or ext == "zip": + # Skipping PDF check, as this is covered by the antivirus scan + logger.info(f"Skipping check for {ext} files") + return False + + if ext in MICROSOFT_OFFICE_FILE_EXTENSIONS: + office_file = msoffcrypto.OfficeFile(file_stream) + encrypt = office_file.is_encrypted() + return encrypt + + if ext in TEXT_FILE_EXTENSIONS: + sample = file_stream.read(1024) + sample.decode("utf-8") + if ext == "rtf" and not sample.startswith(b"{\\rtf1"): + return True + return False + + if ext in MEDIA_FILE_EXTENSIONS: + with Image.open(file_stream) as img: + img.verify() + return False + + logger.info( + f"File with extension {ext} is not supported for locked/corrupt check, treating as valid.", + ) + return False + + except Exception as e: + logger.error( + f"Error checking file validity for .{ext}: {type(e).__name__} - {str(e)}", + ) + return True diff --git a/poetry.lock b/poetry.lock index c329268964..4bf25323d7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -205,7 +205,7 @@ version = "2.0.0" description = "Foreign Function Interface for Python calling C code." optional = false python-versions = ">=3.9" -groups = ["core-lambda"] +groups = ["core-lambda", "files-lambda"] markers = "platform_python_implementation != \"PyPy\"" files = [ {file = "cffi-2.0.0-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:0cf2d91ecc3fcc0625c2c530fe004f82c110405f101548512cce44322fa8ac44"}, @@ -517,56 +517,74 @@ toml = ["tomli ; python_full_version <= \"3.11.0a6\""] [[package]] name = "cryptography" -version = "44.0.1" +version = "46.0.5" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." optional = false -python-versions = "!=3.9.0,!=3.9.1,>=3.7" -groups = ["core-lambda"] -files = [ - {file = "cryptography-44.0.1-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:bf688f615c29bfe9dfc44312ca470989279f0e94bb9f631f85e3459af8efc009"}, - {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd7c7e2d71d908dc0f8d2027e1604102140d84b155e658c20e8ad1304317691f"}, - {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:887143b9ff6bad2b7570da75a7fe8bbf5f65276365ac259a5d2d5147a73775f2"}, - {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:322eb03ecc62784536bc173f1483e76747aafeb69c8728df48537eb431cd1911"}, - {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:21377472ca4ada2906bc313168c9dc7b1d7ca417b63c1c3011d0c74b7de9ae69"}, - {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:df978682c1504fc93b3209de21aeabf2375cb1571d4e61907b3e7a2540e83026"}, - {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:eb3889330f2a4a148abead555399ec9a32b13b7c8ba969b72d8e500eb7ef84cd"}, - {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:8e6a85a93d0642bd774460a86513c5d9d80b5c002ca9693e63f6e540f1815ed0"}, - {file = "cryptography-44.0.1-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:6f76fdd6fd048576a04c5210d53aa04ca34d2ed63336d4abd306d0cbe298fddf"}, - {file = "cryptography-44.0.1-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:6c8acf6f3d1f47acb2248ec3ea261171a671f3d9428e34ad0357148d492c7864"}, - {file = "cryptography-44.0.1-cp37-abi3-win32.whl", hash = "sha256:24979e9f2040c953a94bf3c6782e67795a4c260734e5264dceea65c8f4bae64a"}, - {file = "cryptography-44.0.1-cp37-abi3-win_amd64.whl", hash = "sha256:fd0ee90072861e276b0ff08bd627abec29e32a53b2be44e41dbcdf87cbee2b00"}, - {file = "cryptography-44.0.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:a2d8a7045e1ab9b9f803f0d9531ead85f90c5f2859e653b61497228b18452008"}, - {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b8272f257cf1cbd3f2e120f14c68bff2b6bdfcc157fafdee84a1b795efd72862"}, - {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1e8d181e90a777b63f3f0caa836844a1182f1f265687fac2115fcf245f5fbec3"}, - {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:436df4f203482f41aad60ed1813811ac4ab102765ecae7a2bbb1dbb66dcff5a7"}, - {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:4f422e8c6a28cf8b7f883eb790695d6d45b0c385a2583073f3cec434cc705e1a"}, - {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:72198e2b5925155497a5a3e8c216c7fb3e64c16ccee11f0e7da272fa93b35c4c"}, - {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:2a46a89ad3e6176223b632056f321bc7de36b9f9b93b2cc1cccf935a3849dc62"}, - {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:53f23339864b617a3dfc2b0ac8d5c432625c80014c25caac9082314e9de56f41"}, - {file = "cryptography-44.0.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:888fcc3fce0c888785a4876ca55f9f43787f4c5c1cc1e2e0da71ad481ff82c5b"}, - {file = "cryptography-44.0.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:00918d859aa4e57db8299607086f793fa7813ae2ff5a4637e318a25ef82730f7"}, - {file = "cryptography-44.0.1-cp39-abi3-win32.whl", hash = "sha256:9b336599e2cb77b1008cb2ac264b290803ec5e8e89d618a5e978ff5eb6f715d9"}, - {file = "cryptography-44.0.1-cp39-abi3-win_amd64.whl", hash = "sha256:e403f7f766ded778ecdb790da786b418a9f2394f36e8cc8b796cc056ab05f44f"}, - {file = "cryptography-44.0.1-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:1f9a92144fa0c877117e9748c74501bea842f93d21ee00b0cf922846d9d0b183"}, - {file = "cryptography-44.0.1-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:610a83540765a8d8ce0f351ce42e26e53e1f774a6efb71eb1b41eb01d01c3d12"}, - {file = "cryptography-44.0.1-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:5fed5cd6102bb4eb843e3315d2bf25fede494509bddadb81e03a859c1bc17b83"}, - {file = "cryptography-44.0.1-pp310-pypy310_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:f4daefc971c2d1f82f03097dc6f216744a6cd2ac0f04c68fb935ea2ba2a0d420"}, - {file = "cryptography-44.0.1-pp310-pypy310_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:94f99f2b943b354a5b6307d7e8d19f5c423a794462bde2bf310c770ba052b1c4"}, - {file = "cryptography-44.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:d9c5b9f698a83c8bd71e0f4d3f9f839ef244798e5ffe96febfa9714717db7af7"}, - {file = "cryptography-44.0.1.tar.gz", hash = "sha256:f51f5705ab27898afda1aaa430f34ad90dc117421057782022edf0600bec5f14"}, +python-versions = "!=3.9.0,!=3.9.1,>=3.8" +groups = ["core-lambda", "files-lambda"] +files = [ + {file = "cryptography-46.0.5-cp311-abi3-macosx_10_9_universal2.whl", hash = "sha256:351695ada9ea9618b3500b490ad54c739860883df6c1f555e088eaf25b1bbaad"}, + {file = "cryptography-46.0.5-cp311-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c18ff11e86df2e28854939acde2d003f7984f721eba450b56a200ad90eeb0e6b"}, + {file = "cryptography-46.0.5-cp311-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4d7e3d356b8cd4ea5aff04f129d5f66ebdc7b6f8eae802b93739ed520c47c79b"}, + {file = "cryptography-46.0.5-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:50bfb6925eff619c9c023b967d5b77a54e04256c4281b0e21336a130cd7fc263"}, + {file = "cryptography-46.0.5-cp311-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:803812e111e75d1aa73690d2facc295eaefd4439be1023fefc4995eaea2af90d"}, + {file = "cryptography-46.0.5-cp311-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3ee190460e2fbe447175cda91b88b84ae8322a104fc27766ad09428754a618ed"}, + {file = "cryptography-46.0.5-cp311-abi3-manylinux_2_31_armv7l.whl", hash = "sha256:f145bba11b878005c496e93e257c1e88f154d278d2638e6450d17e0f31e558d2"}, + {file = "cryptography-46.0.5-cp311-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:e9251e3be159d1020c4030bd2e5f84d6a43fe54b6c19c12f51cde9542a2817b2"}, + {file = "cryptography-46.0.5-cp311-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:47fb8a66058b80e509c47118ef8a75d14c455e81ac369050f20ba0d23e77fee0"}, + {file = "cryptography-46.0.5-cp311-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:4c3341037c136030cb46e4b1e17b7418ea4cbd9dd207e4a6f3b2b24e0d4ac731"}, + {file = "cryptography-46.0.5-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:890bcb4abd5a2d3f852196437129eb3667d62630333aacc13dfd470fad3aaa82"}, + {file = "cryptography-46.0.5-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:80a8d7bfdf38f87ca30a5391c0c9ce4ed2926918e017c29ddf643d0ed2778ea1"}, + {file = "cryptography-46.0.5-cp311-abi3-win32.whl", hash = "sha256:60ee7e19e95104d4c03871d7d7dfb3d22ef8a9b9c6778c94e1c8fcc8365afd48"}, + {file = "cryptography-46.0.5-cp311-abi3-win_amd64.whl", hash = "sha256:38946c54b16c885c72c4f59846be9743d699eee2b69b6988e0a00a01f46a61a4"}, + {file = "cryptography-46.0.5-cp314-cp314t-macosx_10_9_universal2.whl", hash = "sha256:94a76daa32eb78d61339aff7952ea819b1734b46f73646a07decb40e5b3448e2"}, + {file = "cryptography-46.0.5-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5be7bf2fb40769e05739dd0046e7b26f9d4670badc7b032d6ce4db64dddc0678"}, + {file = "cryptography-46.0.5-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fe346b143ff9685e40192a4960938545c699054ba11d4f9029f94751e3f71d87"}, + {file = "cryptography-46.0.5-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:c69fd885df7d089548a42d5ec05be26050ebcd2283d89b3d30676eb32ff87dee"}, + {file = "cryptography-46.0.5-cp314-cp314t-manylinux_2_28_ppc64le.whl", hash = "sha256:8293f3dea7fc929ef7240796ba231413afa7b68ce38fd21da2995549f5961981"}, + {file = "cryptography-46.0.5-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:1abfdb89b41c3be0365328a410baa9df3ff8a9110fb75e7b52e66803ddabc9a9"}, + {file = "cryptography-46.0.5-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:d66e421495fdb797610a08f43b05269e0a5ea7f5e652a89bfd5a7d3c1dee3648"}, + {file = "cryptography-46.0.5-cp314-cp314t-manylinux_2_34_aarch64.whl", hash = "sha256:4e817a8920bfbcff8940ecfd60f23d01836408242b30f1a708d93198393a80b4"}, + {file = "cryptography-46.0.5-cp314-cp314t-manylinux_2_34_ppc64le.whl", hash = "sha256:68f68d13f2e1cb95163fa3b4db4bf9a159a418f5f6e7242564fc75fcae667fd0"}, + {file = "cryptography-46.0.5-cp314-cp314t-manylinux_2_34_x86_64.whl", hash = "sha256:a3d1fae9863299076f05cb8a778c467578262fae09f9dc0ee9b12eb4268ce663"}, + {file = "cryptography-46.0.5-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:c4143987a42a2397f2fc3b4d7e3a7d313fbe684f67ff443999e803dd75a76826"}, + {file = "cryptography-46.0.5-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:7d731d4b107030987fd61a7f8ab512b25b53cef8f233a97379ede116f30eb67d"}, + {file = "cryptography-46.0.5-cp314-cp314t-win32.whl", hash = "sha256:c3bcce8521d785d510b2aad26ae2c966092b7daa8f45dd8f44734a104dc0bc1a"}, + {file = "cryptography-46.0.5-cp314-cp314t-win_amd64.whl", hash = "sha256:4d8ae8659ab18c65ced284993c2265910f6c9e650189d4e3f68445ef82a810e4"}, + {file = "cryptography-46.0.5-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:4108d4c09fbbf2789d0c926eb4152ae1760d5a2d97612b92d508d96c861e4d31"}, + {file = "cryptography-46.0.5-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7d1f30a86d2757199cb2d56e48cce14deddf1f9c95f1ef1b64ee91ea43fe2e18"}, + {file = "cryptography-46.0.5-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:039917b0dc418bb9f6edce8a906572d69e74bd330b0b3fea4f79dab7f8ddd235"}, + {file = "cryptography-46.0.5-cp38-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:ba2a27ff02f48193fc4daeadf8ad2590516fa3d0adeeb34336b96f7fa64c1e3a"}, + {file = "cryptography-46.0.5-cp38-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:61aa400dce22cb001a98014f647dc21cda08f7915ceb95df0c9eaf84b4b6af76"}, + {file = "cryptography-46.0.5-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3ce58ba46e1bc2aac4f7d9290223cead56743fa6ab94a5d53292ffaac6a91614"}, + {file = "cryptography-46.0.5-cp38-abi3-manylinux_2_31_armv7l.whl", hash = "sha256:420d0e909050490d04359e7fdb5ed7e667ca5c3c402b809ae2563d7e66a92229"}, + {file = "cryptography-46.0.5-cp38-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:582f5fcd2afa31622f317f80426a027f30dc792e9c80ffee87b993200ea115f1"}, + {file = "cryptography-46.0.5-cp38-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:bfd56bb4b37ed4f330b82402f6f435845a5f5648edf1ad497da51a8452d5d62d"}, + {file = "cryptography-46.0.5-cp38-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:a3d507bb6a513ca96ba84443226af944b0f7f47dcc9a399d110cd6146481d24c"}, + {file = "cryptography-46.0.5-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:9f16fbdf4da055efb21c22d81b89f155f02ba420558db21288b3d0035bafd5f4"}, + {file = "cryptography-46.0.5-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:ced80795227d70549a411a4ab66e8ce307899fad2220ce5ab2f296e687eacde9"}, + {file = "cryptography-46.0.5-cp38-abi3-win32.whl", hash = "sha256:02f547fce831f5096c9a567fd41bc12ca8f11df260959ecc7c3202555cc47a72"}, + {file = "cryptography-46.0.5-cp38-abi3-win_amd64.whl", hash = "sha256:556e106ee01aa13484ce9b0239bca667be5004efb0aabbed28d353df86445595"}, + {file = "cryptography-46.0.5-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:3b4995dc971c9fb83c25aa44cf45f02ba86f71ee600d81091c2f0cbae116b06c"}, + {file = "cryptography-46.0.5-pp311-pypy311_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:bc84e875994c3b445871ea7181d424588171efec3e185dced958dad9e001950a"}, + {file = "cryptography-46.0.5-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:2ae6971afd6246710480e3f15824ed3029a60fc16991db250034efd0b9fb4356"}, + {file = "cryptography-46.0.5-pp311-pypy311_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:d861ee9e76ace6cf36a6a89b959ec08e7bc2493ee39d07ffe5acb23ef46d27da"}, + {file = "cryptography-46.0.5-pp311-pypy311_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:2b7a67c9cd56372f3249b39699f2ad479f6991e62ea15800973b956f4b73e257"}, + {file = "cryptography-46.0.5-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:8456928655f856c6e1533ff59d5be76578a7157224dbd9ce6872f25055ab9ab7"}, + {file = "cryptography-46.0.5.tar.gz", hash = "sha256:abace499247268e3757271b2f1e244b36b06f8515cf27c4d49468fc9eb16e93d"}, ] [package.dependencies] -cffi = {version = ">=1.12", markers = "platform_python_implementation != \"PyPy\""} +cffi = {version = ">=2.0.0", markers = "python_full_version >= \"3.9.0\" and platform_python_implementation != \"PyPy\""} [package.extras] -docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=3.0.0) ; python_version >= \"3.8\""] +docs = ["sphinx (>=5.3.0)", "sphinx-inline-tabs", "sphinx-rtd-theme (>=3.0.0)"] docstest = ["pyenchant (>=3)", "readme-renderer (>=30.0)", "sphinxcontrib-spelling (>=7.3.1)"] -nox = ["nox (>=2024.4.15)", "nox[uv] (>=2024.3.2) ; python_version >= \"3.8\""] -pep8test = ["check-sdist ; python_version >= \"3.8\"", "click (>=8.0.1)", "mypy (>=1.4)", "ruff (>=0.3.6)"] +nox = ["nox[uv] (>=2024.4.15)"] +pep8test = ["check-sdist", "click (>=8.0.1)", "mypy (>=1.14)", "ruff (>=0.11.11)"] sdist = ["build (>=1.0.0)"] ssh = ["bcrypt (>=3.1.5)"] -test = ["certifi (>=2024)", "cryptography-vectors (==44.0.1)", "pretend (>=0.7)", "pytest (>=7.4.0)", "pytest-benchmark (>=4.0)", "pytest-cov (>=2.10.1)", "pytest-xdist (>=3.5.0)"] +test = ["certifi (>=2024)", "cryptography-vectors (==46.0.5)", "pretend (>=0.7)", "pytest (>=7.4.0)", "pytest-benchmark (>=4.0)", "pytest-cov (>=2.10.1)", "pytest-xdist (>=3.5.0)"] test-randomorder = ["pytest-randomly"] [[package]] @@ -1190,6 +1208,22 @@ files = [ {file = "msgpack-1.1.2.tar.gz", hash = "sha256:3b60763c1373dd60f398488069bcdc703cd08a711477b5d480eecc9f9626f47e"}, ] +[[package]] +name = "msoffcrypto-tool" +version = "6.0.0" +description = "Python tool and library for decrypting and encrypting MS Office files using a password or other keys" +optional = false +python-versions = "<4.0,>=3.10" +groups = ["files-lambda"] +files = [ + {file = "msoffcrypto_tool-6.0.0-py3-none-any.whl", hash = "sha256:46c394ed5d9641e802fc79bf3fb0666a53748b23fa8c4aa634ae9d30d46fe397"}, + {file = "msoffcrypto_tool-6.0.0.tar.gz", hash = "sha256:9a5ebc4c0096b42e5d7ebc2350afdc92dc511061e935ca188468094fdd032bbe"}, +] + +[package.dependencies] +cryptography = ">=39.0" +olefile = ">=0.46" + [[package]] name = "mypy-extensions" version = "1.1.0" @@ -1219,6 +1253,21 @@ rsa = ["cryptography (>=3.0.0)"] signals = ["blinker (>=1.4.0)"] signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"] +[[package]] +name = "olefile" +version = "0.47" +description = "Python package to parse, read and write Microsoft OLE2 files (Structured Storage or Compound Document, Microsoft Office)" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["files-lambda"] +files = [ + {file = "olefile-0.47-py2.py3-none-any.whl", hash = "sha256:543c7da2a7adadf21214938bb79c83ea12b473a4b6ee4ad4bf854e7715e13d1f"}, + {file = "olefile-0.47.zip", hash = "sha256:599383381a0bf3dfbd932ca0ca6515acd174ed48870cbf7fee123d698c192c1c"}, +] + +[package.extras] +tests = ["pytest", "pytest-cov"] + [[package]] name = "openpyxl" version = "3.1.5" @@ -1664,7 +1713,7 @@ version = "2.23" description = "C parser in Python" optional = false python-versions = ">=3.8" -groups = ["core-lambda"] +groups = ["core-lambda", "files-lambda"] markers = "platform_python_implementation != \"PyPy\" and implementation_name != \"PyPy\"" files = [ {file = "pycparser-2.23-py3-none-any.whl", hash = "sha256:e5c6e8d3fbad53479cab09ac03729e0a9faf2bee3db8208a550daf5af81a5934"}, @@ -2450,4 +2499,4 @@ requests = "*" [metadata] lock-version = "2.1" python-versions = "^3.11" -content-hash = "77d0249d2dd6c9fbb02b0e434ade72ac221f4df82e8d98ec49250fe0b7ff74df" +content-hash = "9d5fc7af2841c90590cd173e32abcf400d064cd303e49a5b2ac33964170a0e9b" diff --git a/pyproject.toml b/pyproject.toml index 4abaa1617d..fe89cf8388 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,6 +51,9 @@ pikepdf = "8.4.0" [tool.poetry.group.data_lambda.dependencies] polars = "1.31.0" +[tool.poetry.group.files_lambda.dependencies] +msoffcrypto-tool = "6.0.0" + [tool.poetry.group.reports_lambda.dependencies] openpyxl = "^3.1.5" reportlab = "^4.3.1"