@@ -1034,14 +1034,17 @@ def _get_cached_page_classifications(
10341034 cached_data = response ["Item" ]
10351035 page_classifications = {}
10361036
1037- # Extract page classifications from the cached data
1038- if "page_classifications" in cached_data :
1039- pages_data = json .loads (cached_data ["page_classifications" ])
1040- for page_id , page_data in pages_data .items ():
1037+ # Extract page classifications from separate page attributes
1038+ for attr_name , attr_value in cached_data .items ():
1039+ if attr_name .startswith ("page_" ):
1040+ page_id = attr_name [5 :] # Remove "page_" prefix
1041+
1042+ # Extract page data from DynamoDB item
1043+ page_data = attr_value
10411044 page_classifications [page_id ] = PageClassification (
10421045 page_id = page_id ,
10431046 classification = DocumentClassification (
1044- doc_type = page_data [ "doc_type" ] ,
1047+ doc_type = page_data . get ( "doc_type" , "unclassified" ) ,
10451048 confidence = page_data .get ("confidence" , 1.0 ),
10461049 metadata = page_data .get ("metadata" , {}),
10471050 ),
@@ -1050,8 +1053,9 @@ def _get_cached_page_classifications(
10501053 raw_text_uri = page_data .get ("raw_text_uri" ),
10511054 )
10521055
1056+ if page_classifications :
10531057 logger .info (
1054- f"Retrieved { len (page_classifications )} cached page classifications for document { document .id } "
1058+ f"Retrieved { len (page_classifications )} cached page classifications for document { document .id } (PK: { cache_key } ) "
10551059 )
10561060
10571061 return page_classifications
@@ -1066,7 +1070,7 @@ def _cache_successful_page_classifications(
10661070 self , document : Document , page_classifications : List [PageClassification ]
10671071 ) -> None :
10681072 """
1069- Cache successful page classifications to DynamoDB.
1073+ Cache successful page classifications to DynamoDB as separate attributes .
10701074
10711075 Args:
10721076 document: Document object
@@ -1078,43 +1082,45 @@ def _cache_successful_page_classifications(
10781082 cache_key = self ._get_cache_key (document )
10791083
10801084 try :
1081- # Filter out failed classifications
1082- successful_pages = {}
1085+ # Filter out failed classifications and prepare item structure
1086+ item = {
1087+ "PK" : cache_key ,
1088+ "SK" : "none" ,
1089+ "cached_at" : str (int (time .time ())),
1090+ "document_id" : document .id ,
1091+ "workflow_execution_arn" : document .workflow_execution_arn ,
1092+ "ExpiresAfter" : int (
1093+ (datetime .now (timezone .utc ) + timedelta (days = 1 )).timestamp ()
1094+ ),
1095+ }
1096+
1097+ successful_count = 0
10831098 for page_result in page_classifications :
10841099 # Only cache if there's no error in the metadata
10851100 if "error" not in page_result .classification .metadata :
1086- successful_pages [page_result .page_id ] = {
1101+ # Store each page as a separate attribute with "page_" prefix
1102+ page_attr_name = f"page_{ page_result .page_id } "
1103+ item [page_attr_name ] = {
10871104 "doc_type" : page_result .classification .doc_type ,
10881105 "confidence" : page_result .classification .confidence ,
10891106 "metadata" : page_result .classification .metadata ,
10901107 "image_uri" : page_result .image_uri ,
10911108 "text_uri" : page_result .text_uri ,
10921109 "raw_text_uri" : page_result .raw_text_uri ,
10931110 }
1111+ successful_count += 1
10941112
1095- if not successful_pages :
1113+ if successful_count == 0 :
10961114 logger .debug (
10971115 f"No successful page classifications to cache for document { document .id } "
10981116 )
10991117 return
11001118
1101- # Store in DynamoDB using Table resource
1102- self .cache_table .put_item (
1103- Item = {
1104- "PK" : cache_key ,
1105- "SK" : "none" ,
1106- "page_classifications" : json .dumps (successful_pages ),
1107- "cached_at" : str (int (time .time ())),
1108- "document_id" : document .id ,
1109- "workflow_execution_arn" : document .workflow_execution_arn ,
1110- "ExpiresAfter" : int (
1111- (datetime .now (timezone .utc ) + timedelta (days = 1 )).timestamp ()
1112- ),
1113- }
1114- )
1119+ # Store in DynamoDB using Table resource with separate page attributes
1120+ self .cache_table .put_item (Item = item )
11151121
11161122 logger .info (
1117- f"Cached { len ( successful_pages ) } successful page classifications for document { document .id } "
1123+ f"Cached { successful_count } successful page classifications for document { document .id } (PK: { cache_key } ) "
11181124 )
11191125
11201126 except Exception as e :
@@ -1333,6 +1339,10 @@ def classify_document(self, document: Document) -> Document:
13331339 self ._cache_successful_page_classifications (
13341340 document , successful_results
13351341 )
1342+ else :
1343+ logger .warning ("No successful page classifications to cache" )
1344+ else :
1345+ logger .warning ("No pages to classify, nothing to cache" )
13361346
13371347 error_msg = f"Error classifying document - cached partial results: { str (e )} "
13381348 document = self ._update_document_status (
0 commit comments