Skip to content

Segmentation Fault while building PDF #4865

@Tawheed-tariq

Description

@Tawheed-tariq

I am getting segmentation fault when I use the below code to format my pdf:

def apply_translations_to_pdf(self):
        """
        Uniformly perform "redact/white cover + insert translation" operations on PDF
        """
        start_time = time.time()

        normal_font_path = os.path.join(APP_DATA_DIR, 'temp', 'fonts', f"{self.target_language}_subset.ttf")
        bold_font_path =  os.path.join(APP_DATA_DIR, 'temp', 'fonts', f"{self.target_language}_bold_subset.ttf")

        if not check_system_resources():
            print("Resource limit reached, performing aggressive cleanup...")
            gc.collect()
        
        for page_index, blocks in enumerate(self.pages_data):

            if page_index % 5 == 0:
                if not check_system_resources():
                    print("Resource limit reached, performing aggressive cleanup...")
                    gc.collect()
            page = self.doc.load_page(page_index)
            
            # Group text blocks on this page by font type to avoid duplicate font definitions
            normal_blocks = []
            bold_blocks = []
            
            # First cover all areas
            for block in blocks:
                coords = block[1]  # (x0, y0, x1, y1)
                
                # Intelligently calculate expansion ratio based on translated text and original text length ratio
                original_text = block[0]
                translated_text = block[2] if block[2] is not None else original_text
                
                # Calculate expansion factor: maximum limit of 5% expansion
                len_ratio = min(1.05, max(1.01, len(translated_text) / max(1, len(original_text))))
                
                x0, y0, x1, y1 = coords
                width = x1 - x0
                height = y1 - y0
                
                # Only expand to the right, don't change left starting point
                h_expand = (len_ratio - 1) * width
                
                # Apply expansion, only modify x1 value
                x1 = x1 + h_expand
                
                # Reduce coverage area in vertical direction to make it more compact
                # Calculate vertical margin reduction, but keep a minimum margin
                vertical_margin = min(height * 0.1, 3)  # Reduce top and bottom by 10% each, max 3 points
                
                # Apply vertical reduction
                y0 = y0 + vertical_margin
                y1 = y1 - vertical_margin
                
                # Ensure minimum height
                if y1 - y0 < 10:  # Ensure minimum height of 10pt
                    y_center = (coords[1] + coords[3]) / 2  # Use center point of original bbox
                    y0 = y_center - 5
                    y1 = y_center + 5
                
                enlarged_coords = (x0, y0, x1, y1)
                rect = fitz.Rect(*enlarged_coords)

                # First try to cover with Redact
                try:
                    page.add_redact_annot(rect)
                except Exception as e:
                    # If Redact fails, use white rectangle to cover
                    annots = list(page.annots() or [])
                    if annots:
                        page.delete_annot(annots[-1])
                    try:
                        page.draw_rect(rect, color=(1, 1, 1), fill=(1, 1, 1))
                    except Exception as e2:
                        print(f"Error occurred while creating white canvas: {e2}")
                    print(f"Error occurred while applying redaction: {e}")
                
                # Classify text blocks
                if len(block) > 6 and block[6]:  # text_bold
                    bold_blocks.append((block, enlarged_coords))
                else:
                    normal_blocks.append((block, enlarged_coords))
            
            page.apply_redactions(images=fitz.PDF_REDACT_IMAGE_NONE)
            # Process normal font text blocks
            if normal_blocks:
                font_family = f"{self.target_language}_font"
                normal_font_path = normal_font_path.replace('\\', '/')
                font = fitz.Font(fontfile=normal_font_path)
                
                # Ensure font file exists
                if not os.path.exists(normal_font_path):
                    print(f"Warning: Font file does not exist: {normal_font_path}")
                
                # Update font usage count
                self.font_usage_counter["normal"] += len(normal_blocks)
                
                # Only add @font-face definition when using this font for the first time
                if font_family not in self.font_css_cache:
                    css_prefix = f"""
                    @font-face {{
                        font-family: "{font_family}";
                        src: url("{normal_font_path}");
                    }}
                    """
                    self.font_css_cache[font_family] = css_prefix
                    self.font_embed_counter["normal"] += 1
                else:
                    css_prefix = self.font_css_cache[font_family]
                
                # Process each normal font text block
                for block_data in normal_blocks:
                    block, enlarged_coords = block_data
                    # If third element is translation, use it, otherwise use original
                    translated_text = block[2] if block[2] is not None else block[0]
                    print(f"Length of tranaslated text (Normal block) is : {len(translated_text)}")
                    translated_text = filter_missing_glyphs(translated_text, font)
                    if not translated_text.strip():
                        continue
                    angle = block[3] if len(block) > 3 else 0
                    html_color = block[4] if len(block) > 4 else '#000000'
                    text_indent = block[5] if len(block) > 5 else 0
                    text_size = float(block[7]) if len(block) > 7 else 12

                    if not validate_font(normal_font_path):
                        print("Skipping normal font rendering for this block")
                        continue
                    
                    # Create rectangle using enlarged coordinates
                    rect = fitz.Rect(*enlarged_coords)
                    
                    # Combine CSS, add auto-resize and auto-wrap properties
                    css = css_prefix + f"""
                    * {{
                        font-family: "{font_family}";
                        color: {html_color};
                        text-indent: {text_indent}pt;  
                        font-size: {text_size}pt; 
                        line-height: 1.5;
                        word-wrap: break-word;
                        overflow-wrap: break-word;
                        width: 100%;
                        box-sizing: border-box;
                    }}
                    """
                    
                    # Insert text
                    try:
                        page.insert_htmlbox(
                            rect,
                            translated_text,
                            css=css,
                            rotate=angle
                        )
                    except Exception as e:
                        print("HTML insertion failed:")
                        print("Font:", font_family)
                        print("Text:", translated_text[:200])
                        print(e)

                    
                print(f"Completed Normal Block Insertion for {page_index}")


            # Process bold font text blocks
            if bold_blocks:
                print(f"Inserting Bold Block for {page_index}")
                font_family = f"{self.target_language}_bold_font"
                bold_font_path = bold_font_path.replace('\\', '/')
                font = fitz.Font(fontfile=bold_font_path)
                
                # Ensure font file exists
                if not os.path.exists(bold_font_path):
                    print(f"Warning: Font file does not exist: {bold_font_path}")
                
                # Update font usage count
                self.font_usage_counter["bold"] += len(bold_blocks)
                
                # Only add @font-face definition when using this font for the first time
                if font_family not in self.font_css_cache:
                    css_prefix = f"""
                    @font-face {{
                        font-family: "{font_family}";
                        src: url("{bold_font_path}");
                    }}
                    """
                    self.font_css_cache[font_family] = css_prefix
                    self.font_embed_counter["bold"] += 1
                else:
                    css_prefix = self.font_css_cache[font_family]
                
                # Process each bold font text block
                for block_data in bold_blocks:
                    block, enlarged_coords = block_data
                    # If third element is translation, use it, otherwise use original
                    translated_text = block[2] if block[2] is not None else block[0]
                    print(f"Length of tranaslated text (Normal block) is : {len(translated_text)}")
                    translated_text = filter_missing_glyphs(translated_text, font)
                    if not translated_text.strip():
                        continue
                    angle = block[3] if len(block) > 3 else 0
                    html_color = block[4] if len(block) > 4 else '#000000'
                    text_indent = block[5] if len(block) > 5 else 0
                    text_size = float(block[7]) if len(block) > 7 else 12

                    if not validate_font(bold_font_path):
                        print("Skipping bold font rendering for this block")
                        continue
                    
                    # Create rectangle using enlarged coordinates
                    rect = fitz.Rect(*enlarged_coords)
                    
                    # Combine CSS, add auto-resize and auto-wrap properties
                    css = css_prefix + f"""
                    * {{
                        font-family: "{font_family}";
                        color: {html_color};
                        text-indent: {text_indent}pt;  
                        font-size: {text_size}pt;
                        line-height: 1.2;
                        word-wrap: break-word;
                        overflow-wrap: break-word;
                        width: 100%;
                        box-sizing: border-box;
                    }}
                    """
                    
                    # Insert text
                    try:
                        page.insert_htmlbox(
                            rect,
                            translated_text,
                            css=css,
                            rotate=angle
                        )
                    except Exception as e:
                        print("HTML insertion failed:")
                        print("Font:", font_family)
                        print("Text:", translated_text[:200])
                        print(e)
                    

                print(f"Completed Bold Block Insertion for {page_index}")
            
            # Print simple progress every 20 pages
            if page_index % 20 == 0:
                print(f"Processing: {page_index}/{len(self.pages_data)} pages")

please help.

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions