
Commit b26869e

Merge branch 'fix/reprocess-starts-two-executions' into 'develop'

Fix duplicate Step Functions executions on document reprocess

See merge request genaiic-reusable-assets/engagement-artifacts/genaiic-idp-accelerator!341

2 parents: c53bbdb + b6d1dfb

4 files changed: +126 −37 lines

CHANGELOG.md

Lines changed: 6 additions & 2 deletions

```diff
@@ -24,9 +24,13 @@ SPDX-License-Identifier: MIT-0
 
 
 ### Fixed
-- Problem with setting correctly formatted WAF IPv4 CIDR range - #73
-
+- **Problem with setting correctly formatted WAF IPv4 CIDR range** - #73
 
+- **Duplicate Step Functions Executions on Document Reprocess - [GitHub Issue #66](https://github.com/aws-solutions-library-samples/accelerated-intelligent-document-processing-on-aws/issues/66)**
+  - Eliminated duplicate workflow executions when reprocessing large documents (>40MB, 500+ pages)
+  - **Root Cause**: S3 `copy_object` operations were triggering multiple "Object Created" events for large files, causing `queue_sender` to create duplicate document entries and workflow executions
+  - **Solution**: Refactored `reprocess_document_resolver` to directly create fresh Document objects and queue to SQS, completely bypassing S3 event notifications
+  - **Benefits**: Eliminates unnecessary S3 copy operations (cost savings)
 
 ## [0.3.18]
 
```
src/lambda/reprocess_document_resolver/index.py

Lines changed: 105 additions & 33 deletions

```diff
@@ -5,53 +5,125 @@
 import os
 import boto3
 import logging
-from datetime import datetime
+from datetime import datetime, timezone, timedelta
+
+# Import IDP Common modules
+from idp_common.models import Document, Status
+from idp_common.docs_service import create_document_service
 
 logger = logging.getLogger()
 logger.setLevel(os.environ.get('LOG_LEVEL', 'INFO'))
 
+# Initialize AWS clients
+sqs_client = boto3.client('sqs')
 s3_client = boto3.client('s3')
 
-def handler(event, context):
-    logger.info(f"Event: {json.dumps(event)}")
+# Initialize document service (same as queue_sender - defaults to AppSync)
+document_service = create_document_service()
 
+# Environment variables
+queue_url = os.environ.get('QUEUE_URL')
+input_bucket = os.environ.get('INPUT_BUCKET')
+output_bucket = os.environ.get('OUTPUT_BUCKET')
+retentionDays = int(os.environ.get('DATA_RETENTION_IN_DAYS', '365'))
+
+def handler(event, context):
+    logger.info(f"Reprocess resolver invoked with event: {json.dumps(event)}")
+
     try:
-        # Get the input bucket name from the environment variable
-        input_bucket = os.environ.get('INPUT_BUCKET')
+        # Validate environment variables
         if not input_bucket:
             raise Exception("INPUT_BUCKET environment variable is not set")
-
-        # Extract object keys from the arguments
+        if not output_bucket:
+            raise Exception("OUTPUT_BUCKET environment variable is not set")
+        if not queue_url:
+            raise Exception("QUEUE_URL environment variable is not set")
+
+        # Extract arguments from GraphQL event
         args = event.get('arguments', {})
         object_keys = args.get('objectKeys', [])
 
         if not object_keys:
-            return {
-                'statusCode': 400,
-                'body': 'No document keys provided'
-            }
-
-        logger.info(f"Reprocessing documents: {object_keys}")
+            logger.error("objectKeys is required but not provided")
+            return False
 
-        # Copy each object over itself to trigger the S3 event notification
-        for key in object_keys:
-            logger.info(f"Reprocessing document: {key}")
-
-            # Copy the object to itself using the copy_object API
-            s3_client.copy_object(
-                Bucket=input_bucket,
-                CopySource={'Bucket': input_bucket, 'Key': key},
-                Key=key,
-                MetadataDirective='REPLACE',
-                Metadata={
-                    'reprocessed': 'true',
-                    'reprocessed_timestamp': datetime.utcnow().isoformat()
-                }
-            )
-
-            logger.info(f"Successfully reprocessed document: {key}")
-
+        logger.info(f"Reprocessing {len(object_keys)} documents")
+
+        # Process each document
+        success_count = 0
+        for object_key in object_keys:
+            try:
+                reprocess_document(object_key)
+                success_count += 1
+            except Exception as e:
+                logger.error(f"Error reprocessing document {object_key}: {str(e)}", exc_info=True)
+                # Continue with other documents even if one fails
+
+        logger.info(f"Successfully queued {success_count}/{len(object_keys)} documents for reprocessing")
         return True
+
     except Exception as e:
-        logger.error(f"Error reprocessing documents: {str(e)}")
-        raise e
+        logger.error(f"Error in reprocess handler: {str(e)}", exc_info=True)
+        raise e
+
+def reprocess_document(object_key):
+    """
+    Reprocess a document by creating a fresh Document object and queueing it.
+    This exactly mirrors the queue_sender pattern for consistency and avoids
+    S3 copy operations that can trigger duplicate events for large files.
+    """
+    logger.info(f"Reprocessing document: {object_key}")
+
+    # Verify file exists in S3
+    try:
+        s3_client.head_object(Bucket=input_bucket, Key=object_key)
+    except Exception as e:
+        raise ValueError(f"Document {object_key} not found in S3 bucket {input_bucket}: {str(e)}")
+
+    # Create a fresh Document object (same as queue_sender does)
+    current_time = datetime.now(timezone.utc).isoformat()
+
+    document = Document(
+        id=object_key,  # Document ID is the object key
+        input_bucket=input_bucket,
+        input_key=object_key,
+        output_bucket=output_bucket,
+        status=Status.QUEUED,
+        queued_time=current_time,
+        initial_event_time=current_time,
+        pages={},
+        sections=[],
+    )
+
+    logger.info(f"Created fresh document object for reprocessing: {object_key}")
+
+    # Calculate expiry date (same as queue_sender)
+    expires_after = int((datetime.now(timezone.utc) + timedelta(days=retentionDays)).timestamp())
+
+    # Create document in DynamoDB via document service (same as queue_sender - uses AppSync by default)
+    logger.info(f"Creating document via document service: {document.input_key}")
+    created_key = document_service.create_document(document, expires_after=expires_after)
+    logger.info(f"Document created with key: {created_key}")
+
+    # Send serialized document to SQS queue (same as queue_sender)
+    doc_json = document.to_json()
+    message = {
+        'QueueUrl': queue_url,
+        'MessageBody': doc_json,
+        'MessageAttributes': {
+            'EventType': {
+                'StringValue': 'DocumentReprocessed',
+                'DataType': 'String'
+            },
+            'ObjectKey': {
+                'StringValue': object_key,
+                'DataType': 'String'
+            }
+        }
+    }
+    logger.info(f"Sending document to SQS queue: {object_key}")
+    response = sqs_client.send_message(**message)
+    logger.info(f"SQS response: {response}")
+
+    logger.info(f"Successfully reprocessed document: {object_key}")
+    return response.get('MessageId')
```
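
For context, here is a minimal local-invocation sketch (not part of the commit) showing the event shape the resolver expects. The bucket names, queue URL, and object key below are placeholders, and the module-level clients assume valid AWS credentials plus the AppSync settings read by `create_document_service()`:

```python
# Sketch only: set the environment variables the module reads at import time,
# then invoke the handler with a mock AppSync event.
import os

os.environ.setdefault("QUEUE_URL", "https://sqs.us-east-1.amazonaws.com/123456789012/DocumentQueue")  # placeholder
os.environ.setdefault("INPUT_BUCKET", "my-input-bucket")    # placeholder
os.environ.setdefault("OUTPUT_BUCKET", "my-output-bucket")  # placeholder

import index  # the reprocess_document_resolver module, assumed to be on sys.path

# AppSync passes the mutation arguments under "arguments"; the handler only
# reads arguments.objectKeys.
event = {"arguments": {"objectKeys": ["uploads/large-report.pdf"]}}  # hypothetical key

result = index.handler(event, None)
print(result)  # True when every key was queued for reprocessing
```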
Lines changed: 1 addition & 1 deletion

```diff
@@ -1 +1 @@
-boto3>=1.28.0
+./lib/idp_common_pkg[docs_service] # idp_common package with Document model and document service integration
```

template.yaml

Lines changed: 14 additions & 1 deletion

```diff
@@ -5689,12 +5689,25 @@ Resources:
         Variables:
           LOG_LEVEL: !Ref LogLevel
           INPUT_BUCKET: !Ref InputBucket
+          OUTPUT_BUCKET: !Ref OutputBucket
+          QUEUE_URL: !Ref DocumentQueue
+          APPSYNC_API_URL: !GetAtt GraphQLApi.GraphQLUrl
+          DATA_RETENTION_IN_DAYS: !Ref DataRetentionInDays
       LoggingConfig:
         LogGroup: !Ref ReprocessDocumentResolverFunctionLogGroup
       Policies:
-        - S3CrudPolicy:
+        - DynamoDBCrudPolicy:
+            TableName: !Ref TrackingTable
+        - SQSSendMessagePolicy:
+            QueueName: !GetAtt DocumentQueue.QueueName
+        - S3ReadPolicy:
             BucketName: !Ref InputBucket
         - Statement:
+            - Effect: Allow
+              Action:
+                - appsync:GraphQL
+              Resource:
+                - !Sub "${GraphQLApi.Arn}/types/Mutation/*"
            - Effect: Allow
              Action:
                - kms:Encrypt
```
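
After deployment, one way to sanity-check the fix is to confirm that a reprocess starts exactly one new Step Functions execution per document rather than two. A hedged verification sketch, with the state machine ARN as a placeholder for the stack's document-processing workflow:

```python
# Verification sketch (placeholder ARN): list executions started recently and
# confirm only one appears per reprocessed document.
import boto3
from datetime import datetime, timezone, timedelta

sfn = boto3.client("stepfunctions")
state_machine_arn = "arn:aws:states:us-east-1:123456789012:stateMachine:IDPWorkflow"  # placeholder

cutoff = datetime.now(timezone.utc) - timedelta(minutes=10)
executions = sfn.list_executions(stateMachineArn=state_machine_arn, maxResults=100)

recent = [e for e in executions["executions"] if e["startDate"] >= cutoff]
print(f"Executions started in the last 10 minutes: {len(recent)}")
for execution in recent:
    print(execution["name"], execution["status"])
```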
