-
Notifications
You must be signed in to change notification settings - Fork 684
Open
Labels
Description
I am getting a segmentation fault when I use the code below to format my PDF:
def apply_translations_to_pdf(self):
    """
    Cover the original text on every page and insert the translated text.

    For each page listed in ``self.pages_data`` the method:

    1. computes a cover rectangle per block (widened to the right by 1-5%,
       trimmed vertically by up to 3pt per side, at least 10pt high),
    2. adds a redaction annotation for that rectangle, falling back to a
       white filled rectangle if that raises, then applies the redactions
       while leaving images untouched,
    3. inserts the translation with ``page.insert_htmlbox`` using the normal
       or the bold subset font, depending on the block's bold flag.

    Block layout as read here: ``block[0]`` original text, ``block[1]`` bbox
    ``(x0, y0, x1, y1)``, ``block[2]`` translation or ``None`` (the original
    text is used then), and optionally ``block[3]`` rotation, ``block[4]``
    CSS colour, ``block[5]`` text indent, ``block[6]`` bold flag and
    ``block[7]`` font size.

    Side effects: pages of ``self.doc`` are modified in place, and
    ``self.font_usage_counter``, ``self.font_embed_counter`` and
    ``self.font_css_cache`` are updated. Progress and errors are printed.
    Returns ``None``.
    """
    def cleanup_if_needed():
        # Run a garbage-collection pass when the resource check fails.
        if not check_system_resources():
            print("Resource limit reached, performing aggressive cleanup...")
            gc.collect()

    lang = self.target_language
    fonts_dir = os.path.join(APP_DATA_DIR, 'temp', 'fonts')
    # Counter key -> (log label, CSS font family, font file, CSS line-height).
    # Paths are normalised to forward slashes once, because they are embedded
    # in a CSS url(...) below.
    variants = {
        "normal": (
            "Normal",
            f"{lang}_font",
            os.path.join(fonts_dir, f"{lang}_subset.ttf").replace('\\', '/'),
            1.5,
        ),
        "bold": (
            "Bold",
            f"{lang}_bold_font",
            os.path.join(fonts_dir, f"{lang}_bold_subset.ttf").replace('\\', '/'),
            1.2,
        ),
    }
    # Font objects are only needed for glyph filtering, so each variant is
    # loaded on first use and then shared by all pages.
    loaded_fonts = {}

    cleanup_if_needed()

    for page_index, blocks in enumerate(self.pages_data):
        if page_index % 5 == 0:
            cleanup_if_needed()

        page = self.doc.load_page(page_index)
        grouped = {"normal": [], "bold": []}

        # Pass 1: cover every block on the page.
        for block in blocks:
            original_text = block[0]
            translated_text = block[2] if block[2] is not None else original_text
            x0, y0, x1, y1 = block[1]

            # Widen to the right only, by the length ratio clamped to 1-5%.
            len_ratio = min(1.05, max(1.01, len(translated_text) / max(1, len(original_text))))
            right = x1 + (len_ratio - 1) * (x1 - x0)

            # Trim 10% of the height (at most 3pt) from top and bottom.
            vertical_margin = min((y1 - y0) * 0.1, 3)
            top = y0 + vertical_margin
            bottom = y1 - vertical_margin
            if bottom - top < 10:
                # Too thin: use a 10pt band centred on the original bbox.
                y_center = (y0 + y1) / 2
                top = y_center - 5
                bottom = y_center + 5

            enlarged_coords = (x0, top, right, bottom)
            rect = fitz.Rect(*enlarged_coords)

            try:
                page.add_redact_annot(rect)
            except Exception as e:
                # NOTE(review): this removes the last annotation on the page,
                # whatever it is. If add_redact_annot raised before creating
                # anything, that may be an earlier block's redaction - confirm.
                annots = list(page.annots() or [])
                if annots:
                    page.delete_annot(annots[-1])
                try:
                    page.draw_rect(rect, color=(1, 1, 1), fill=(1, 1, 1))
                except Exception as e2:
                    print(f"Error occurred while creating white canvas: {e2}")
                print(f"Error occurred while applying redaction: {e}")

            is_bold = len(block) > 6 and block[6]
            grouped["bold" if is_bold else "normal"].append((block, enlarged_coords))

        page.apply_redactions(images=fitz.PDF_REDACT_IMAGE_NONE)

        # Pass 2: insert translations, normal font first, then bold.
        for key, (label, font_family, font_path, line_height) in variants.items():
            entries = grouped[key]
            if not entries:
                continue
            print(f"Inserting {label} Block for {page_index}")

            # Warn before loading the font so the diagnostic is printed even
            # when opening the file fails.
            if not os.path.exists(font_path):
                print(f"Warning: Font file does not exist: {font_path}")
            if key not in loaded_fonts:
                loaded_fonts[key] = fitz.Font(fontfile=font_path)
            font = loaded_fonts[key]

            self.font_usage_counter[key] += len(entries)

            # The @font-face rule is built once per family and then reused.
            if font_family not in self.font_css_cache:
                css_prefix = f"""
                @font-face {{
                    font-family: "{font_family}";
                    src: url("{font_path}");
                }}
                """
                self.font_css_cache[font_family] = css_prefix
                self.font_embed_counter[key] += 1
            else:
                css_prefix = self.font_css_cache[font_family]

            for block, enlarged_coords in entries:
                translated_text = block[2] if block[2] is not None else block[0]
                print(f"Length of tranaslated text ({label} block) is : {len(translated_text)}")
                translated_text = filter_missing_glyphs(translated_text, font)
                if not translated_text.strip():
                    continue

                angle = block[3] if len(block) > 3 else 0
                html_color = block[4] if len(block) > 4 else '#000000'
                text_indent = block[5] if len(block) > 5 else 0
                text_size = float(block[7]) if len(block) > 7 else 12

                # NOTE(review): validate_font is re-run for every block; if it
                # has no side effects it could be called once per font - confirm.
                if not validate_font(font_path):
                    print(f"Skipping {label.lower()} font rendering for this block")
                    continue

                rect = fitz.Rect(*enlarged_coords)
                css = css_prefix + f"""
                * {{
                    font-family: "{font_family}";
                    color: {html_color};
                    text-indent: {text_indent}pt;
                    font-size: {text_size}pt;
                    line-height: {line_height};
                    word-wrap: break-word;
                    overflow-wrap: break-word;
                    width: 100%;
                    box-sizing: border-box;
                }}
                """

                # NOTE(review): the text is handed over as HTML; translations
                # containing '<' or '&' may need escaping - confirm.
                try:
                    page.insert_htmlbox(
                        rect,
                        translated_text,
                        css=css,
                        rotate=angle,
                    )
                except Exception as e:
                    print("HTML insertion failed:")
                    print("Font:", font_family)
                    print("Text:", translated_text[:200])
                    print(e)

            print(f"Completed {label} Block Insertion for {page_index}")

        # Print simple progress every 20 pages.
        if page_index % 20 == 0:
            print(f"Processing: {page_index}/{len(self.pages_data)} pages")