aws-solutions-library-samples
diff --git a/.gitignore b/.gitignore
Lines changed: 1 addition & 4 deletions
diff --git a/config_library/pattern-2/lending-package-sample/config_multimodal_page_boundary.yaml b/config_library/pattern-2/lending-package-sample/config_multimodal_page_boundary.yaml
Lines changed: 1425 additions & 0 deletions
diff --git a/lib/idp_common_pkg/idp_common/classification/service.py b/lib/idp_common_pkg/idp_common/classification/service.py
Lines changed: 61 additions & 7 deletions
diff --git a/lib/idp_common_pkg/tests/unit/classification/test_classification_service.py b/lib/idp_common_pkg/tests/unit/classification/test_classification_service.py
Lines changed: 32 additions & 0 deletions
@@ -17,7 +17,4 @@ __pycache__
 .ruff_cache
 .kiro
 rvl_cdip_*
-
-# IDE specific files
-.idea/
-
+notebooks/examples/data
@@ -46,6 +46,7 @@ class ClassificationService:
     # Classification method options
     MULTIMODAL_PAGE_LEVEL = "multimodalPageLevelClassification"
     TEXTBASED_HOLISTIC = "textbasedHolisticClassification"
+    MULTIMODAL_PAGE_BOUNDARY = "multimodalPageBoundaryClassification"
 
     def __init__(
         self,
@@ -132,6 +133,8 @@ def __init__(
         # Log classification method
         if self.classification_method == self.TEXTBASED_HOLISTIC:
             logger.info("Using textbased holistic packet classification method")
+        elif self.classification_method == self.MULTIMODAL_PAGE_BOUNDARY:
+            logger.info("Using multimodal page boundary classification method")
         else:
             # Default to multimodal page-level classification if value is invalid
             if self.classification_method != self.MULTIMODAL_PAGE_LEVEL:
@@ -678,16 +681,21 @@ def classify_page_bedrock(
                 )
                 if isinstance(classification_data, dict):
                     doc_type = classification_data.get("class", "")
-                    logger.debug(
+                    document_boundary = classification_data.get(
+                        "document_boundary", "continue"
+                    )
+                    logger.info(
                         f"Parsed classification response as {detected_format}: {classification_data}"
                     )
                 else:
                     # If parsing failed, try to extract classification directly from text
                     doc_type = self._extract_class_from_text(classification_text)
+                    document_boundary = "continue"
             except Exception as e:
                 logger.warning(f"Failed to parse structured data from response: {e}")
                 # Try to extract classification directly from text
                 doc_type = self._extract_class_from_text(classification_text)
+                document_boundary = "continue"
 
             # Validate classification against known document types
             if not doc_type:
@@ -710,7 +718,10 @@ def classify_page_bedrock(
                 classification=DocumentClassification(
                     doc_type=doc_type,
                     confidence=1.0,  # Default confidence
-                    metadata={"metering": metering},
+                    metadata={
+                        "metering": metering,
+                        "document_boundary": str(document_boundary).lower(),
+                    },
                 ),
                 image_uri=image_uri,
                 text_uri=text_uri,
@@ -803,7 +814,10 @@ def classify_page_sagemaker(
                     classification=DocumentClassification(
                         doc_type=doc_type,
                         confidence=1.0,  # Default confidence since SageMaker doesn't provide it
-                        metadata={"metering": metering},
+                        metadata={
+                            "metering": metering,
+                            "document_boundary": "continue",
+                        },
                     ),
                     image_uri=image_uri,
                     text_uri=text_uri,
@@ -1199,10 +1213,15 @@ def classify_document(self, document: Document) -> Document:
             )
             return self.holistic_classify_document(document)
 
-        # Default to page-by-page classification
+        # Page-level classification (with or without boundary detection)
         t0 = time.time()
+        method_desc = (
+            "page boundary"
+            if self.classification_method == self.MULTIMODAL_PAGE_BOUNDARY
+            else "page-by-page"
+        )
         logger.info(
-            f"Classifying document with {len(document.pages)} pages using page-by-page method with {self.backend} backend"
+            f"Classifying document with {len(document.pages)} pages using {method_desc} method with {self.backend} backend"
         )
 
         try:
@@ -1230,6 +1249,19 @@ def classify_document(self, document: Document) -> Document:
                         page_id
                     ].confidence = cached_result.classification.confidence
 
+                    # Copy metadata (including boundary information) to the page
+                    if hasattr(document.pages[page_id], "metadata"):
+                        document.pages[
+                            page_id
+                        ].metadata = cached_result.classification.metadata
+                    else:
+                        # If the page doesn't have a metadata attribute, add it
+                        setattr(
+                            document.pages[page_id],
+                            "metadata",
+                            cached_result.classification.metadata,
+                        )
+
                     # Merge cached metering data
                     page_metering = cached_result.classification.metadata.get(
                         "metering", {}
@@ -1278,6 +1310,19 @@ def classify_document(self, document: Document) -> Document:
                                 page_id
                             ].confidence = page_result.classification.confidence
 
+                            # Copy metadata (including boundary information) to the page
+                            if hasattr(document.pages[page_id], "metadata"):
+                                document.pages[
+                                    page_id
+                                ].metadata = page_result.classification.metadata
+                            else:
+                                # If the page doesn't have a metadata attribute, add it
+                                setattr(
+                                    document.pages[page_id],
+                                    "metadata",
+                                    page_result.classification.metadata,
+                                )
+
                             # Merge metering data
                             page_metering = page_result.classification.metadata.get(
                                 "metering", {}
@@ -1360,7 +1405,13 @@ def classify_document(self, document: Document) -> Document:
                 current_pages = [sorted_results[0]]
 
                 for result in sorted_results[1:]:
-                    if result.classification.doc_type == current_type:
+                    boundary = result.classification.metadata.get(
+                        "document_boundary", "continue"
+                    ).lower()
+                    if (
+                        result.classification.doc_type == current_type
+                        and boundary != "start"
+                    ):
                         current_pages.append(result)
                     else:
                         # Create a new section with the current group of pages
@@ -1528,7 +1579,10 @@ def _group_consecutive_pages(
         current_pages = [sorted_results[0]]
 
         for result in sorted_results[1:]:
-            if result.classification.doc_type == current_type:
+            boundary = result.classification.metadata.get(
+                "document_boundary", "continue"
+            ).lower()
+            if result.classification.doc_type == current_type and boundary != "start":
                 current_pages.append(result)
             else:
                 # Create a section with the current group
 
@@ -255,6 +255,7 @@ def test_classify_page_bedrock_success(
         assert result.classification.doc_type == "invoice"
         assert result.classification.confidence == 1.0
         assert result.classification.metadata["metering"] == {"tokens": 100}
+        assert result.classification.metadata["document_boundary"] == "continue"
         assert result.image_uri == "s3://bucket/image.jpg"
         assert result.text_uri == "s3://bucket/text.txt"
 
@@ -801,3 +802,34 @@ def test_holistic_classify_document_multiple_segments(
         assert result.pages["1"].classification == "invoice"
         assert result.pages["2"].classification == "receipt"
         assert result.pages["3"].classification == "receipt"
+
+    def test_group_consecutive_pages_with_boundary(self, service):
+        """A page flagged "start" opens a new section even if its doc type is unchanged."""
+        results = [
+            PageClassification(
+                page_id="1",
+                classification=DocumentClassification(
+                    doc_type="invoice",
+                    metadata={"document_boundary": "start"},
+                ),
+            ),
+            PageClassification(
+                page_id="2",
+                classification=DocumentClassification(
+                    doc_type="invoice",
+                    metadata={"document_boundary": "continue"},  # stays with page 1
+                ),
+            ),
+            PageClassification(
+                page_id="3",
+                classification=DocumentClassification(
+                    doc_type="invoice",
+                    metadata={"document_boundary": "start"},  # same type, new document
+                ),
+            ),
+        ]
+
+        sections = service._group_consecutive_pages(results)
+        assert len(sections) == 2  # all three pages are "invoice"; only the flag splits them
+        assert [p.page_id for p in sections[0].pages] == ["1", "2"]
+        assert [p.page_id for p in sections[1].pages] == ["3"]