diff --git a/.github/workflows/base-lambdas-reusable-deploy-all.yml b/.github/workflows/base-lambdas-reusable-deploy-all.yml
index 495c36fb46..c2ec863b83 100644
--- a/.github/workflows/base-lambdas-reusable-deploy-all.yml
+++ b/.github/workflows/base-lambdas-reusable-deploy-all.yml
@@ -434,6 +434,20 @@ jobs:
     secrets:
       AWS_ASSUME_ROLE: ${{ secrets.AWS_ASSUME_ROLE }}

+  deploy_s3_data_collection_lambda:
+    name: Deploy S3 data collection lambda
+    uses: ./.github/workflows/base-lambdas-reusable-deploy.yml
+    with:
+      environment: ${{ inputs.environment }}
+      python_version: ${{ inputs.python_version }}
+      build_branch: ${{ inputs.build_branch }}
+      sandbox: ${{ inputs.sandbox }}
+      lambda_handler_name: data_collection_handler
+      lambda_aws_name: ReportS3Content
+      lambda_layer_names: "core_lambda_layer,data_lambda_layer"
+    secrets:
+      AWS_ASSUME_ROLE: ${{ secrets.AWS_ASSUME_ROLE }}
+
   deploy_statistical_report_lambda:
     name: Deploy statistical report lambda
     uses: ./.github/workflows/base-lambdas-reusable-deploy.yml
@@ -838,3 +852,17 @@ jobs:
       lambda_layer_names: "core_lambda_layer"
     secrets:
       AWS_ASSUME_ROLE: ${{ secrets.AWS_ASSUME_ROLE }}
+
+#  deploy_report_s3_content_lambda:
+#    name: Deploy Report S3 Content Lambda
+#    uses: ./.github/workflows/base-lambdas-reusable-deploy.yml
+#    with:
+#      environment: ${{ inputs.environment }}
+#      python_version: ${{ inputs.python_version }}
+#      build_branch: ${{ inputs.build_branch }}
+#      sandbox: ${{ inputs.sandbox }}
+#      lambda_handler_name: report_s3_content_handler
+#      lambda_aws_name: ReportS3Content
+#      lambda_layer_names: "core_lambda_layer"
+#    secrets:
+#      AWS_ASSUME_ROLE: ${{ secrets.AWS_ASSUME_ROLE }}
diff --git a/.github/workflows/full-deploy-to-pre-prod.yml b/.github/workflows/full-deploy-to-pre-prod.yml
index de251f2c25..d121c2be97 100644
--- a/.github/workflows/full-deploy-to-pre-prod.yml
+++ b/.github/workflows/full-deploy-to-pre-prod.yml
@@ -136,6 +136,17 @@ jobs:
     secrets:
       AWS_ASSUME_ROLE: ${{ secrets.AWS_ASSUME_ROLE }}

+  deploy_s3_data_collection:
+    name: Deploy S3 Data Collection
+    needs: [ "tag_and_release" ]
+    uses: ./.github/workflows/s3-base-data-collection.yml
+    with:
+      build_branch: ${{ needs.tag_and_release.outputs.version }}
+      environment: pre-prod
+      sandbox: pre-prod
+    secrets:
+      AWS_ASSUME_ROLE: ${{ secrets.AWS_ASSUME_ROLE }}
+
   run_fhir_api_e2e_tests:
     name: Run FHIR API E2E Tests
     needs: ["deploy_all_lambdas"]
diff --git a/.github/workflows/full-deploy-to-prod.yml b/.github/workflows/full-deploy-to-prod.yml
index b9dae923ee..f2a43132e6 100644
--- a/.github/workflows/full-deploy-to-prod.yml
+++ b/.github/workflows/full-deploy-to-prod.yml
@@ -75,3 +75,13 @@ jobs:
       sandbox: prod
     secrets:
       AWS_ASSUME_ROLE: ${{ secrets.AWS_ASSUME_ROLE }}
+
+  deploy_s3_data_collection:
+    name: Deploy S3 Data Collection
+    uses: ./.github/workflows/s3-base-data-collection.yml
+    with:
+      build_branch: ${{ inputs.tag_version }}
+      environment: prod
+      sandbox: prod
+    secrets:
+      AWS_ASSUME_ROLE: ${{ secrets.AWS_ASSUME_ROLE }}
diff --git a/.github/workflows/full-deploy-to-sandbox.yml b/.github/workflows/full-deploy-to-sandbox.yml
index bf5621cf20..16e15bd9cb 100644
--- a/.github/workflows/full-deploy-to-sandbox.yml
+++ b/.github/workflows/full-deploy-to-sandbox.yml
@@ -166,6 +166,17 @@ jobs:
     secrets:
       AWS_ASSUME_ROLE: ${{ secrets.AWS_ASSUME_ROLE }}

+  deploy_s3_data_collection:
+    name: "Deploy S3 Data Collection (ECS task image)"
+    uses: ./.github/workflows/s3-base-data-collection.yml
+    needs: ["deploy_all_lambdas"]
+    with:
+      build_branch: ${{ inputs.build_branch }}
+      sandbox: ${{
inputs.sandbox }} + environment: ${{ inputs.environment }} + secrets: + AWS_ASSUME_ROLE: ${{ secrets.AWS_ASSUME_ROLE }} + bulk_upload: name: "Run Bulk Upload" if: ${{ inputs.bulk_upload }} diff --git a/.github/workflows/lambdas-dev-to-main-ci.yml b/.github/workflows/lambdas-dev-to-main-ci.yml index f67055b175..f5b38a651e 100644 --- a/.github/workflows/lambdas-dev-to-main-ci.yml +++ b/.github/workflows/lambdas-dev-to-main-ci.yml @@ -80,6 +80,18 @@ jobs: secrets: AWS_ASSUME_ROLE: ${{ secrets.AWS_ASSUME_ROLE }} + deploy_s3_data_collection: + name: Deploy S3 Data Collection + needs: [ "run_tests" ] + uses: ./.github/workflows/s3-base-data-collection.yml + if: github.ref == 'refs/heads/main' + with: + build_branch: ${{ github.event.pull_request.head.ref }} + environment: development + sandbox: ndr-dev + secrets: + AWS_ASSUME_ROLE: ${{ secrets.AWS_ASSUME_ROLE }} + notify-slack: name: Notify Slack on Failure runs-on: ubuntu-latest @@ -91,6 +103,7 @@ jobs: publish_all_lambda_layers, deploy_all_lambdas, deploy_data_collection, + deploy_s3_data_collection, ] if: failure() && github.event_name == 'push' && github.ref == 'refs/heads/main' steps: @@ -160,6 +173,10 @@ jobs: { "type": "mrkdwn", "text": "*deploy_data_collection:* ${{ needs.deploy_data_collection.result == 'success' && ':white_check_mark:' || ':x:' }}" + }, + { + "type": "mrkdwn", + "text": "*deploy_s3_data_collection:* ${{ needs.deploy_s3_data_collection.result == 'success' && ':white_check_mark:' || ':x:' }}" } ] }, diff --git a/.github/workflows/s3-base-data-collection.yml b/.github/workflows/s3-base-data-collection.yml new file mode 100644 index 0000000000..4d53e7e1c2 --- /dev/null +++ b/.github/workflows/s3-base-data-collection.yml @@ -0,0 +1,119 @@ +name: "Z-BASE Deploy S3 Data Collection: Build data collection image" + +run-name: "${{ github.event.inputs.build_branch }} | ${{ github.event.inputs.environment }} | ${{ github.event.inputs.sandbox }}" + +on: + workflow_call: + inputs: + build_branch: + required: true + type: string + environment: + required: true + type: string + sandbox: + required: true + type: string + secrets: + AWS_ASSUME_ROLE: + required: true + +permissions: + pull-requests: write + id-token: write + contents: read + +jobs: + s3_data_collection_build_docker_image: + runs-on: ubuntu-latest + environment: ${{ inputs.environment }} + defaults: + run: + working-directory: lambdas + + steps: + - uses: actions/checkout@v6 + with: + repository: "NHSDigital/national-document-repository" + ref: ${{ inputs.build_branch }} + fetch-depth: "0" + + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v5 + with: + role-to-assume: ${{ secrets.AWS_ASSUME_ROLE }} + role-skip-session-tagging: true + aws-region: ${{ vars.AWS_REGION }} + mask-aws-account-id: true + + - name: Login to Amazon ECR + id: login-ecr + uses: aws-actions/amazon-ecr-login@v2 + + - name: Build, tag, and push image to Amazon ECR + id: build-image + env: + ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }} + ECR_REPOSITORY: ${{ inputs.sandbox }}-s3-data-collection + IMAGE_TAG: latest + IMAGE_TAG_SHA: ${{ github.sha }} + run: | + set -euo pipefail + + docker build \ + -t "$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" \ + -t "$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG_SHA" \ + -f ecs/s3_data_collection/Dockerfile \ + . 
+ + docker push "$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" + docker push "$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG_SHA" + + echo "image=$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG_SHA" >> "$GITHUB_OUTPUT" + + - name: Download current task definition (raw) + id: download-task + run: | + set -euo pipefail + + aws ecs describe-task-definition \ + --task-definition "${{ inputs.sandbox }}-task-s3-data-collection" \ + --query taskDefinition \ + > task-definition-raw.json + + echo "revision=$(jq -r .revision task-definition-raw.json)" >> "$GITHUB_OUTPUT" + + - name: Render task definition with new image + id: task-def + uses: aws-actions/amazon-ecs-render-task-definition@v1 + with: + task-definition: lambdas/task-definition-raw.json + container-name: ${{ inputs.sandbox }}-container-s3-data-collection + image: ${{ steps.build-image.outputs.image }} + + - name: Sanitize task definition JSON for registration + run: | + set -euo pipefail + + jq 'del( + .taskDefinitionArn, + .revision, + .status, + .requiresAttributes, + .compatibilities, + .registeredAt, + .registeredBy + )' \ + "${{ steps.task-def.outputs.task-definition }}" \ + > task-definition.json + + - name: Register new ECS task definition revision + run: | + set -euo pipefail + aws ecs register-task-definition --cli-input-json file://task-definition.json + + - name: De-register previous revision (optional) + run: | + set -euo pipefail + aws ecs deregister-task-definition \ + --task-definition "${{ inputs.sandbox }}-task-s3-data-collection:${{ steps.download-task.outputs.revision }}" diff --git a/.github/workflows/s3-data-collection-deploy-to-sandbox.yml b/.github/workflows/s3-data-collection-deploy-to-sandbox.yml new file mode 100644 index 0000000000..20693494a4 --- /dev/null +++ b/.github/workflows/s3-data-collection-deploy-to-sandbox.yml @@ -0,0 +1,58 @@ +name: "SANDBOX S3 Data Collection - Publish Data Collection Image to ECR" + +run-name: "${{ github.event.inputs.build_branch }} | ${{ github.event.inputs.sandbox }} | ${{ github.event.inputs.environment }}" + +on: + workflow_dispatch: + inputs: + build_branch: + description: "Feature branch to push." + required: true + type: string + default: "main" + sandbox: + description: "Which Sandbox to push to." + required: true + type: string + default: "ndr" + environment: + description: "Which Environment settings to use." + required: true + type: string + default: "development" + workflow_call: + inputs: + build_branch: + description: "Feature branch to push." + required: true + type: string + default: "main" + sandbox: + description: "Which Sandbox to push to." + required: true + type: string + default: "ndr" + environment: + description: "Which Environment settings to use." 
+ required: true + type: string + default: "development" + secrets: + AWS_ASSUME_ROLE: + required: true + +permissions: + pull-requests: write + id-token: write # This is required for requesting the JWT + contents: read # This is required for actions/checkout + +jobs: + push_image: + name: Push Image + uses: ./.github/workflows/s3-base-data-collection.yml + with: + build_branch: ${{ inputs.build_branch }} + environment: ${{ inputs.environment }} + sandbox: ${{ inputs.sandbox }} + secrets: + AWS_ASSUME_ROLE: ${{ secrets.AWS_ASSUME_ROLE }} diff --git a/lambdas/ecs/s3_data_collection/Dockerfile b/lambdas/ecs/s3_data_collection/Dockerfile new file mode 100644 index 0000000000..6c0e5ec097 --- /dev/null +++ b/lambdas/ecs/s3_data_collection/Dockerfile @@ -0,0 +1,15 @@ +FROM python:3.11 + +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +ENV PYTHONPATH=/lambdas + +WORKDIR /lambdas +COPY requirements /lambdas/requirements + +RUN pip install -r requirements/layers/requirements_core_lambda_layer.txt +RUN pip install -r requirements/layers/requirements_data_lambda_layer.txt + +COPY . /lambdas + +ENTRYPOINT ["python", "ecs/s3_data_collection/main.py"] diff --git a/lambdas/ecs/s3_data_collection/__init__.py b/lambdas/ecs/s3_data_collection/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/lambdas/ecs/s3_data_collection/main.py b/lambdas/ecs/s3_data_collection/main.py new file mode 100644 index 0000000000..14ae36f82e --- /dev/null +++ b/lambdas/ecs/s3_data_collection/main.py @@ -0,0 +1,23 @@ +import importlib +import logging +import sys + +from services.reporting.report_s3_content_service import ReportS3ContentService + + +def setup_logging_for_local_script(): + importlib.reload(logging) + logging.basicConfig( + level=logging.INFO, + format="[%(asctime)s] %(levelname)s [%(name)s.%(funcName)s:%(lineno)d] %(message)s", + datefmt="%d/%b/%Y %H:%M:%S", + stream=sys.stdout, + ) + + +if __name__ == "__main__": + setup_logging_for_local_script() + logging.getLogger(__name__).info("Starting Report S3 content ECS task") + + service = ReportS3ContentService() + service.process_s3_content() diff --git a/lambdas/handlers/report_s3_content_handler.py b/lambdas/handlers/report_s3_content_handler.py new file mode 100644 index 0000000000..70eb87dcdc --- /dev/null +++ b/lambdas/handlers/report_s3_content_handler.py @@ -0,0 +1,35 @@ +from services.reporting.report_s3_content_service import ReportS3ContentService +from utils.audit_logging_setup import LoggingService +from utils.decorators.ensure_env_var import ensure_environment_variables +from utils.decorators.handle_lambda_exceptions import handle_lambda_exceptions +from utils.decorators.override_error_check import override_error_check +from utils.decorators.set_audit_arg import set_request_context_for_logging + +logger = LoggingService(__name__) + +@ensure_environment_variables(names=["LLOYD_GEORGE_BUCKET_NAME"]) +@ensure_environment_variables(names=["BULK_STAGING_BUCKET_NAME"]) +@ensure_environment_variables(names=["STATISTICAL_REPORTS_BUCKET"]) +@override_error_check +@handle_lambda_exceptions +@set_request_context_for_logging +def lambda_handler(event, context): + logger.info("Report S3 content lambda invoked") + + service = ReportS3ContentService() + + service.process_s3_content( + ) + + + +# class _EcsContext: +# aws_request_id = "ecs-run" +# function_name = "ReportS3ContentEcsTask" +# invoked_function_arn = "ecs://ReportS3ContentEcsTask" +# log_group_name = "ecs" +# log_stream_name = "ecs" +# +# +# if __name__ == "__main__": +# 
lambda_handler({}, _EcsContext()) diff --git a/lambdas/services/base/s3_service.py b/lambdas/services/base/s3_service.py index 3ee42b4e24..4a9bdb0a3d 100644 --- a/lambdas/services/base/s3_service.py +++ b/lambdas/services/base/s3_service.py @@ -255,3 +255,24 @@ def save_or_create_file(self, source_bucket: str, file_key: str, body: bytes): return self.client.put_object( Bucket=source_bucket, Key=file_key, Body=BytesIO(body) ) + + def list_all_object_versions(self, bucket_name: str): + paginator = self.client.get_paginator("list_object_versions") + versions, delete_markers = [], [] + + for page in paginator.paginate(Bucket=bucket_name): + versions.extend(page.get("Versions", [])) + delete_markers.extend(page.get("DeleteMarkers", [])) + + return versions, delete_markers + + def get_object_tags_versioned(self, bucket: str, key: str, version_id: str | None): + try: + params = {"Bucket": bucket, "Key": key} + if version_id: + params["VersionId"] = version_id + response = self.client.get_object_tagging(**params) + return response.get("TagSet", []) + except ClientError: + return [] + diff --git a/lambdas/services/reporting/csv_report_generator_service.py b/lambdas/services/reporting/csv_report_generator_service.py new file mode 100644 index 0000000000..683ede526e --- /dev/null +++ b/lambdas/services/reporting/csv_report_generator_service.py @@ -0,0 +1,66 @@ +import csv +from io import StringIO +from utils.audit_logging_setup import LoggingService + +logger = LoggingService(__name__) + + +class CsvReportGenerator: + def generate_s3_inventory_csv(self, bucket, versions, delete_markers) -> str: + logger.info(f"Generating S3 inventory CSV for bucket {bucket}") + + output = StringIO() + writer = csv.writer(output) + + writer.writerow( + [ + "bucket", + "key", + "entry_type", + "version_id", + "is_latest", + "last_modified", + "size", + "etag", + "storage_class", + "tags", + ] + ) + + for version in versions: + tags = version.get("Tags", []) + tag_str = ";".join(f"{t['Key']}={t['Value']}" for t in tags) + + writer.writerow( + [ + bucket, + version["Key"], + "VERSION", + version["VersionId"], + version["IsLatest"], + version["LastModified"].isoformat(), + version.get("Size"), + version.get("ETag"), + version.get("StorageClass"), + tag_str, + ] + ) + + for marker in delete_markers: + writer.writerow( + [ + bucket, + marker["Key"], + "DELETE_MARKER", + marker["VersionId"], + marker["IsLatest"], + marker["LastModified"].isoformat(), + None, + None, + None, + "", + ] + ) + + logger.info(f"Finished CSV generation for {bucket}") + return output.getvalue() diff --git a/lambdas/services/reporting/report_s3_content_service.py b/lambdas/services/reporting/report_s3_content_service.py new file mode 100644 index 0000000000..8802e27b18 --- /dev/null +++ b/lambdas/services/reporting/report_s3_content_service.py @@ -0,0 +1,62 @@ +import os +from concurrent.futures import ThreadPoolExecutor, as_completed +from io import BytesIO + +from services.base.s3_service import S3Service +from services.reporting.csv_report_generator_service import CsvReportGenerator +from utils.audit_logging_setup import LoggingService + +logger = LoggingService(__name__) + + +class ReportS3ContentService: + def __init__(self): + self.lloyd_george_bucket = os.getenv("LLOYD_GEORGE_BUCKET_NAME") + self.bulk_staging_store = os.getenv("BULK_STAGING_BUCKET_NAME") + self.statistic_reports_bucket = os.getenv("STATISTICAL_REPORTS_BUCKET") + self.s3_service = S3Service() + self.csv_generator = CsvReportGenerator() + self.max_objects = 4500000 #To avoid 
overloading prod
+
+    def _fetch_tags(self, bucket, version):
+        tags = self.s3_service.get_object_tags_versioned(
+            bucket, version["Key"], version["VersionId"]
+        )
+        version["Tags"] = tags
+        return version
+
+    def process_s3_content(self):
+        for bucket in [self.lloyd_george_bucket, self.bulk_staging_store]:
+            logger.info(f"Listing versions for bucket {bucket}")
+            versions, delete_markers = self.s3_service.list_all_object_versions(bucket)
+            if len(versions) > self.max_objects:
+                logger.warning(
+                    f"Limiting versions from {len(versions)} to {self.max_objects}"
+                )
+                versions = versions[: self.max_objects]
+            # versions = [
+            #     v for v in versions
+            #     if v["IsLatest"]
+            # ]
+            logger.info(f"Fetching tags in parallel for {len(versions)} versions")
+
+            with ThreadPoolExecutor(max_workers=20) as executor:
+                futures = [
+                    executor.submit(self._fetch_tags, bucket, v) for v in versions
+                ]
+                for _ in as_completed(futures):
+                    pass
+
+            logger.info(f"Generating CSV for bucket {bucket}")
+            csv_content = self.csv_generator.generate_s3_inventory_csv(
+                bucket, versions, delete_markers
+            )
+
+            logger.info(f"Uploading report for bucket {bucket}")
+            self.s3_service.upload_file_obj(
+                BytesIO(csv_content.encode("utf-8")),
+                self.statistic_reports_bucket,
+                f"s3-content-report/{bucket}-inventory.csv",
+            )
+
+            logger.info(f"Completed report for {bucket}")
diff --git a/lambdas/tests/unit/handlers/test_report_s3_content_handler.py b/lambdas/tests/unit/handlers/test_report_s3_content_handler.py
new file mode 100644
index 0000000000..75976cb2f4
--- /dev/null
+++ b/lambdas/tests/unit/handlers/test_report_s3_content_handler.py
@@ -0,0 +1,43 @@
+import pytest
+from types import SimpleNamespace
+from handlers.report_s3_content_handler import lambda_handler
+
+
+@pytest.fixture(autouse=True)
+def patch_env_vars(monkeypatch):
+    env_vars = {
+        "LLOYD_GEORGE_BUCKET_NAME": "bucket-a",
+        "STATISTICAL_REPORTS_BUCKET": "bucket-b",
+        "BULK_STAGING_BUCKET_NAME": "bucket-c",
+    }
+    for key, value in env_vars.items():
+        monkeypatch.setenv(key, value)
+
+
+@pytest.fixture
+def lambda_context():
+    return SimpleNamespace(aws_request_id="test-request-id")
+
+
+def test_lambda_handler_invokes_service(mocker, lambda_context):
+    mock_service_cls = mocker.patch(
+        "handlers.report_s3_content_handler.ReportS3ContentService"
+    )
+    mock_service = mock_service_cls.return_value
+
+    lambda_handler({}, lambda_context)
+
+    mock_service_cls.assert_called_once()
+    mock_service.process_s3_content.assert_called_once()
+
+
+def test_lambda_handler_runs_without_event_data(mocker, lambda_context):
+    mock_service_cls = mocker.patch(
+        "handlers.report_s3_content_handler.ReportS3ContentService"
+    )
+    mock_service = mock_service_cls.return_value
+
+    lambda_handler({}, lambda_context)
+
+    mock_service_cls.assert_called_once()
+    mock_service.process_s3_content.assert_called_once()
diff --git a/lambdas/tests/unit/services/reporting/test_csv_report_generator_service.py b/lambdas/tests/unit/services/reporting/test_csv_report_generator_service.py
new file mode 100644
index 0000000000..fba9fd9cf3
--- /dev/null
+++ b/lambdas/tests/unit/services/reporting/test_csv_report_generator_service.py
@@ -0,0 +1,27 @@
+from datetime import datetime, timezone
+from services.reporting.csv_report_generator_service import CsvReportGenerator
+
+
+def test_generate_s3_inventory_csv():
+    generator = CsvReportGenerator()
+
+    versions = [
+        {
+            "Key": "file1.txt",
"file1.txt", + "VersionId": "v1", + "IsLatest": True, + "LastModified": datetime(2024, 1, 1, tzinfo=timezone.utc), + "Size": 123, + "ETag": "etag1", + "StorageClass": "STANDARD", + "Tags": [{"Key": "autodelete", "Value": "true"}], + } + ] + + delete_markers = [] + + csv_output = generator.generate_s3_inventory_csv("bucket-a", versions, delete_markers) + + assert "bucket-a" in csv_output + assert "file1.txt" in csv_output + assert "autodelete=true" in csv_output diff --git a/lambdas/tests/unit/services/reporting/test_report_s3_content_service.py b/lambdas/tests/unit/services/reporting/test_report_s3_content_service.py new file mode 100644 index 0000000000..5c8c8354f3 --- /dev/null +++ b/lambdas/tests/unit/services/reporting/test_report_s3_content_service.py @@ -0,0 +1,34 @@ +from unittest.mock import MagicMock +from services.reporting.report_s3_content_service import ReportS3ContentService + + +def test_process_s3_content(mocker): + service = ReportS3ContentService() + service.lloyd_george_bucket = "bucket-a" + service.bulk_staging_store = "bucket-b" + service.statistic_reports_bucket = "reports-bucket" + service.s3_service = mocker.Mock() + service.csv_generator = mocker.Mock() + + fake_versions = [ + { + "Key": "file1.txt", + "VersionId": "v1", + "IsLatest": True, + "LastModified": mocker.Mock(), + } + ] + fake_delete_markers = [] + + service.s3_service.list_all_object_versions.return_value = ( + fake_versions, + fake_delete_markers, + ) + service.s3_service.get_object_tags_versioned.return_value = [ + {"Key": "autodelete", "Value": "true"} + ] + service.csv_generator.generate_s3_inventory_csv.return_value = "csv-data" + + service.process_s3_content() + + assert service.s3_service.upload_file_obj.call_count == 2