import os
import boto3
import logging
- from datetime import datetime
+ from datetime import datetime, timezone, timedelta
+
+ # Import IDP Common modules
+ from idp_common.models import Document, Status
+ from idp_common.docs_service import create_document_service

logger = logging.getLogger()
logger.setLevel(os.environ.get('LOG_LEVEL', 'INFO'))

+ # Initialize AWS clients
+ sqs_client = boto3.client('sqs')
s3_client = boto3.client('s3')

- def handler(event, context):
-     logger.info(f"Event: {json.dumps(event)}")
+ # Initialize document service (same as queue_sender - defaults to AppSync)
+ document_service = create_document_service()

+ # Environment variables
+ queue_url = os.environ.get('QUEUE_URL')
+ input_bucket = os.environ.get('INPUT_BUCKET')
+ output_bucket = os.environ.get('OUTPUT_BUCKET')
+ retentionDays = int(os.environ.get('DATA_RETENTION_IN_DAYS', '365'))
+
+ def handler(event, context):
+     logger.info(f"Reprocess resolver invoked with event: {json.dumps(event)}")
+
    try:
-         # Get the input bucket name from the environment variable
-         input_bucket = os.environ.get('INPUT_BUCKET')
+         # Validate environment variables
        if not input_bucket:
            raise Exception("INPUT_BUCKET environment variable is not set")
-
-         # Extract object keys from the arguments
+         if not output_bucket:
+             raise Exception("OUTPUT_BUCKET environment variable is not set")
+         if not queue_url:
+             raise Exception("QUEUE_URL environment variable is not set")
+
+         # Extract arguments from GraphQL event
        args = event.get('arguments', {})
        object_keys = args.get('objectKeys', [])

        if not object_keys:
-             return {
-                 'statusCode': 400,
-                 'body': 'No document keys provided'
-             }
-
-         logger.info(f"Reprocessing documents: {object_keys}")
+             logger.error("objectKeys is required but not provided")
+             return False

-         # Copy each object over itself to trigger the S3 event notification
-         for key in object_keys:
-             logger.info(f"Reprocessing document: {key}")
-
-             # Copy the object to itself using the copy_object API
-             s3_client.copy_object(
-                 Bucket=input_bucket,
-                 CopySource={'Bucket': input_bucket, 'Key': key},
-                 Key=key,
-                 MetadataDirective='REPLACE',
-                 Metadata={
-                     'reprocessed': 'true',
-                     'reprocessed_timestamp': datetime.utcnow().isoformat()
-                 }
-             )
-
-             logger.info(f"Successfully reprocessed document: {key}")
-
+         logger.info(f"Reprocessing {len(object_keys)} documents")
+
+         # Process each document
+         success_count = 0
+         for object_key in object_keys:
+             try:
+                 reprocess_document(object_key)
+                 success_count += 1
+             except Exception as e:
+                 logger.error(f"Error reprocessing document {object_key}: {str(e)}", exc_info=True)
+                 # Continue with other documents even if one fails
+
+         logger.info(f"Successfully queued {success_count}/{len(object_keys)} documents for reprocessing")
        return True
+
    except Exception as e:
-         logger.error(f"Error reprocessing documents: {str(e)}")
-         raise e
+         logger.error(f"Error in reprocess handler: {str(e)}", exc_info=True)
+         raise e
+
+ def reprocess_document(object_key):
+     """
+     Reprocess a document by creating a fresh Document object and queueing it.
+     This exactly mirrors the queue_sender pattern for consistency and avoids
+     S3 copy operations that can trigger duplicate events for large files.
+     """
+     logger.info(f"Reprocessing document: {object_key}")
+
+     # Verify file exists in S3
+     try:
+         s3_client.head_object(Bucket=input_bucket, Key=object_key)
+     except Exception as e:
+         raise ValueError(f"Document {object_key} not found in S3 bucket {input_bucket}: {str(e)}")
+
+     # Create a fresh Document object (same as queue_sender does)
+     current_time = datetime.now(timezone.utc).isoformat()
+
+     document = Document(
+         id=object_key,  # Document ID is the object key
+         input_bucket=input_bucket,
+         input_key=object_key,
+         output_bucket=output_bucket,
+         status=Status.QUEUED,
+         queued_time=current_time,
+         initial_event_time=current_time,
+         pages={},
+         sections=[],
+     )
97+
98+ logger .info (f"Created fresh document object for reprocessing: { object_key } " )
99+
100+ # Calculate expiry date (same as queue_sender)
101+ expires_after = int ((datetime .now (timezone .utc ) + timedelta (days = retentionDays )).timestamp ())
102+
103+ # Create document in DynamoDB via document service (same as queue_sender - uses AppSync by default)
104+ logger .info (f"Creating document via document service: { document .input_key } " )
105+ created_key = document_service .create_document (document , expires_after = expires_after )
106+ logger .info (f"Document created with key: { created_key } " )
107+
108+ # Send serialized document to SQS queue (same as queue_sender)
109+ doc_json = document .to_json ()
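+     # Message attributes tag this as a DocumentReprocessed event and record the object key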
+     message = {
+         'QueueUrl': queue_url,
+         'MessageBody': doc_json,
+         'MessageAttributes': {
+             'EventType': {
+                 'StringValue': 'DocumentReprocessed',
+                 'DataType': 'String'
+             },
+             'ObjectKey': {
+                 'StringValue': object_key,
+                 'DataType': 'String'
+             }
+         }
+     }
+     logger.info(f"Sending document to SQS queue: {object_key}")
+     response = sqs_client.send_message(**message)
+     logger.info(f"SQS response: {response}")
+
+     logger.info(f"Successfully reprocessed document: {object_key}")
+     return response.get('MessageId')