@@ -646,148 +646,7 @@ def _get_text_confidence_data(self, page) -> str:
646646 )
647647 return "Text Confidence Data Unavailable"
648648
649- def _convert_bbox_to_geometry (
650- self , bbox_coords : list [float ], page_num : int
651- ) -> dict [str , Any ]:
652- """
653- Convert [x1,y1,x2,y2] coordinates to geometry format.
654-
655- Args:
656- bbox_coords: list of 4 coordinates [x1, y1, x2, y2] in 0-1000 scale
657- page_num: Page number where the bounding box appears
658-
659- Returns:
660- dictionary in geometry format compatible with pattern-1 UI
661- """
662- if len (bbox_coords ) != 4 :
663- raise ValueError (f"Expected 4 coordinates, got { len (bbox_coords )} " )
664-
665- x1 , y1 , x2 , y2 = bbox_coords
666-
667- # Ensure coordinates are in correct order
668- x1 , x2 = min (x1 , x2 ), max (x1 , x2 )
669- y1 , y2 = min (y1 , y2 ), max (y1 , y2 )
670-
671- # Convert from normalized 0-1000 scale to 0-1
672- left = x1 / 1000.0
673- top = y1 / 1000.0
674- width = (x2 - x1 ) / 1000.0
675- height = (y2 - y1 ) / 1000.0
676-
677- return {
678- "boundingBox" : {"top" : top , "left" : left , "width" : width , "height" : height },
679- "page" : page_num ,
680- }
681-
682- def _process_single_assessment_geometry (
683- self , attr_assessment : dict [str , Any ], attr_name : str = ""
684- ) -> dict [str , Any ]:
685- """
686- Process geometry data for a single assessment (with confidence key).
687-
688- Args:
689- attr_assessment: Single assessment dictionary with confidence data
690- attr_name: Name of attribute for logging
691-
692- Returns:
693- Enhanced assessment with geometry converted to proper format
694- """
695- enhanced_attr = attr_assessment .copy ()
696-
697- # Check if this assessment includes bbox data
698- if "bbox" in attr_assessment or "page" in attr_assessment :
699- # Both bbox and page are required for valid geometry
700- if "bbox" in attr_assessment and "page" in attr_assessment :
701- try :
702- bbox_coords = attr_assessment ["bbox" ]
703- page_num = attr_assessment ["page" ]
704-
705- # Validate bbox coordinates
706- if isinstance (bbox_coords , list ) and len (bbox_coords ) == 4 :
707- # Convert to geometry format
708- geometry = self ._convert_bbox_to_geometry (bbox_coords , page_num )
709- enhanced_attr ["geometry" ] = [geometry ]
710-
711- logger .debug (
712- f"Converted bounding box for { attr_name } : { bbox_coords } -> geometry format"
713- )
714- else :
715- logger .warning (
716- f"Invalid bounding box format for { attr_name } : { bbox_coords } "
717- )
718- except Exception as e :
719- logger .warning (
720- f"Failed to process bounding box for { attr_name } : { str (e )} "
721- )
722- raise
723- else :
724- # If only one of bbox/page exists, log a warning about incomplete data
725- if "bbox" in attr_assessment and "page" not in attr_assessment :
726- logger .warning (
727- f"Found bbox without page for { attr_name } - removing incomplete bbox data"
728- )
729- elif "page" in attr_assessment and "bbox" not in attr_assessment :
730- logger .warning (
731- f"Found page without bbox for { attr_name } - removing incomplete page data"
732- )
733-
734- # Always remove raw bbox/page data from output (whether processed or incomplete)
735- enhanced_attr .pop ("bbox" , None )
736- enhanced_attr .pop ("page" , None )
737-
738- return enhanced_attr
739-
740- def _extract_geometry_from_assessment (
741- self , assessment_data : dict [str , Any ]
742- ) -> dict [str , Any ]:
743- """
744- Extract geometry data from assessment response and convert to proper format.
745- Now supports recursive processing of nested group attributes.
746-
747- Args:
748- assessment_data: Dictionary containing assessment results from LLM
749-
750- Returns:
751- Enhanced assessment data with geometry information converted to proper format
752- """
753- enhanced_assessment = {}
754-
755- for attr_name , attr_assessment in assessment_data .items ():
756- if isinstance (attr_assessment , dict ):
757- # Check if this is a direct confidence assessment
758- if "confidence" in attr_assessment :
759- # This is a direct assessment - process its geometry
760- enhanced_assessment [attr_name ] = (
761- self ._process_single_assessment_geometry (
762- attr_assessment , attr_name
763- )
764- )
765- else :
766- # This is a group attribute (no direct confidence) - recursively process nested attributes
767- logger .debug (f"Processing group attribute: { attr_name } " )
768- enhanced_assessment [attr_name ] = (
769- self ._extract_geometry_from_assessment (attr_assessment )
770- )
771-
772- elif isinstance (attr_assessment , list ):
773- # Handle list attributes - process each item recursively
774- enhanced_list = []
775- for i , item_assessment in enumerate (attr_assessment ):
776- if isinstance (item_assessment , dict ):
777- # Recursively process each list item
778- enhanced_item = self ._extract_geometry_from_assessment (
779- item_assessment
780- )
781- enhanced_list .append (enhanced_item )
782- else :
783- # Non-dict items pass through unchanged
784- enhanced_list .append (item_assessment )
785- enhanced_assessment [attr_name ] = enhanced_list
786- else :
787- # Other types pass through unchanged
788- enhanced_assessment [attr_name ] = attr_assessment
789-
790- return enhanced_assessment
649+ # Geometry processing uses shared utilities from geometry_utils module
791650
792651 def process_document_section (self , document : Document , section_id : str ) -> Document :
793652 """
0 commit comments