Skip to content

Commit 1eee667

Browse files
author
Bob Strahan
committed
Store cached page classifications as separate DynamoDB item attributes rather than a single JSON string. Log the cache PK for easier debugging.
1 parent 2eb316f commit 1eee667

File tree

2 files changed

+37
-26
lines changed

2 files changed

+37
-26
lines changed

lib/idp_common_pkg/idp_common/classification/service.py

Lines changed: 36 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1034,14 +1034,17 @@ def _get_cached_page_classifications(
10341034
cached_data = response["Item"]
10351035
page_classifications = {}
10361036

1037-
# Extract page classifications from the cached data
1038-
if "page_classifications" in cached_data:
1039-
pages_data = json.loads(cached_data["page_classifications"])
1040-
for page_id, page_data in pages_data.items():
1037+
# Extract page classifications from separate page attributes
1038+
for attr_name, attr_value in cached_data.items():
1039+
if attr_name.startswith("page_"):
1040+
page_id = attr_name[5:] # Remove "page_" prefix
1041+
1042+
# Extract page data from DynamoDB item
1043+
page_data = attr_value
10411044
page_classifications[page_id] = PageClassification(
10421045
page_id=page_id,
10431046
classification=DocumentClassification(
1044-
doc_type=page_data["doc_type"],
1047+
doc_type=page_data.get("doc_type", "unclassified"),
10451048
confidence=page_data.get("confidence", 1.0),
10461049
metadata=page_data.get("metadata", {}),
10471050
),
@@ -1050,8 +1053,9 @@ def _get_cached_page_classifications(
10501053
raw_text_uri=page_data.get("raw_text_uri"),
10511054
)
10521055

1056+
if page_classifications:
10531057
logger.info(
1054-
f"Retrieved {len(page_classifications)} cached page classifications for document {document.id}"
1058+
f"Retrieved {len(page_classifications)} cached page classifications for document {document.id} (PK: {cache_key})"
10551059
)
10561060

10571061
return page_classifications
@@ -1066,7 +1070,7 @@ def _cache_successful_page_classifications(
10661070
self, document: Document, page_classifications: List[PageClassification]
10671071
) -> None:
10681072
"""
1069-
Cache successful page classifications to DynamoDB.
1073+
Cache successful page classifications to DynamoDB as separate attributes.
10701074
10711075
Args:
10721076
document: Document object
@@ -1078,43 +1082,45 @@ def _cache_successful_page_classifications(
10781082
cache_key = self._get_cache_key(document)
10791083

10801084
try:
1081-
# Filter out failed classifications
1082-
successful_pages = {}
1085+
# Filter out failed classifications and prepare item structure
1086+
item = {
1087+
"PK": cache_key,
1088+
"SK": "none",
1089+
"cached_at": str(int(time.time())),
1090+
"document_id": document.id,
1091+
"workflow_execution_arn": document.workflow_execution_arn,
1092+
"ExpiresAfter": int(
1093+
(datetime.now(timezone.utc) + timedelta(days=1)).timestamp()
1094+
),
1095+
}
1096+
1097+
successful_count = 0
10831098
for page_result in page_classifications:
10841099
# Only cache if there's no error in the metadata
10851100
if "error" not in page_result.classification.metadata:
1086-
successful_pages[page_result.page_id] = {
1101+
# Store each page as a separate attribute with "page_" prefix
1102+
page_attr_name = f"page_{page_result.page_id}"
1103+
item[page_attr_name] = {
10871104
"doc_type": page_result.classification.doc_type,
10881105
"confidence": page_result.classification.confidence,
10891106
"metadata": page_result.classification.metadata,
10901107
"image_uri": page_result.image_uri,
10911108
"text_uri": page_result.text_uri,
10921109
"raw_text_uri": page_result.raw_text_uri,
10931110
}
1111+
successful_count += 1
10941112

1095-
if not successful_pages:
1113+
if successful_count == 0:
10961114
logger.debug(
10971115
f"No successful page classifications to cache for document {document.id}"
10981116
)
10991117
return
11001118

1101-
# Store in DynamoDB using Table resource
1102-
self.cache_table.put_item(
1103-
Item={
1104-
"PK": cache_key,
1105-
"SK": "none",
1106-
"page_classifications": json.dumps(successful_pages),
1107-
"cached_at": str(int(time.time())),
1108-
"document_id": document.id,
1109-
"workflow_execution_arn": document.workflow_execution_arn,
1110-
"ExpiresAfter": int(
1111-
(datetime.now(timezone.utc) + timedelta(days=1)).timestamp()
1112-
),
1113-
}
1114-
)
1119+
# Store in DynamoDB using Table resource with separate page attributes
1120+
self.cache_table.put_item(Item=item)
11151121

11161122
logger.info(
1117-
f"Cached {len(successful_pages)} successful page classifications for document {document.id}"
1123+
f"Cached {successful_count} successful page classifications for document {document.id} (PK: {cache_key})"
11181124
)
11191125

11201126
except Exception as e:
@@ -1333,6 +1339,10 @@ def classify_document(self, document: Document) -> Document:
13331339
self._cache_successful_page_classifications(
13341340
document, successful_results
13351341
)
1342+
else:
1343+
logger.warning("No successful page classifications to cache")
1344+
else:
1345+
logger.warning("No pages to classify, nothing to cache")
13361346

13371347
error_msg = f"Error classifying document - cached partial results: {str(e)}"
13381348
document = self._update_document_status(

lib/idp_common_pkg/idp_common/models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ class Document:
109109

110110
# Processing metadata
111111
metering: Dict[str, Any] = field(default_factory=dict)
112+
metadata: Dict[str, Any] = field(default_factory=dict)
112113
evaluation_status: Optional[str] = None
113114
evaluation_report_uri: Optional[str] = None
114115
evaluation_result: Any = None # Holds the DocumentEvaluationResult object

0 commit comments

Comments
 (0)