From ffc6b82b787e7506566690e27b8c86f056863bc4 Mon Sep 17 00:00:00 2001 From: "valery.bokov" Date: Thu, 1 Jan 2026 16:38:44 +0100 Subject: [PATCH 1/2] refactor TextToPDF.call method --- .../org/apache/pdfbox/tools/TextToPDF.java | 26 ++++++++----------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/tools/src/main/java/org/apache/pdfbox/tools/TextToPDF.java b/tools/src/main/java/org/apache/pdfbox/tools/TextToPDF.java index 525dd084883..0662f55a983 100644 --- a/tools/src/main/java/org/apache/pdfbox/tools/TextToPDF.java +++ b/tools/src/main/java/org/apache/pdfbox/tools/TextToPDF.java @@ -185,25 +185,21 @@ public Integer call() setTopMargin(margins[2]); setBottomMargin(margins[3]); - boolean hasUtf8BOM = false; - if (charset.equals(StandardCharsets.UTF_8)) + try (InputStream is = new FileInputStream(infile)) { - // check for utf8 BOM - // FileInputStream doesn't support mark/reset - try (InputStream is = new FileInputStream(infile)) + if (charset.equals(StandardCharsets.UTF_8)) { - if (is.read() == 0xEF && is.read() == 0xBB && is.read() == 0xBF) + try { - hasUtf8BOM = true; + // check for utf8 BOM + // FileInputStream doesn't support mark/reset + int b1 = is.read(); + int b2 = is.read(); + int b3 = is.read(); + //todo Here we can perform a check for file format corruption here. + boolean hasUtf8BOM = b1 == 0xEF && b2 == 0xBB && b3 == 0xBF; } - } - } - try (InputStream is = new FileInputStream(infile)) - { - if (hasUtf8BOM) - { - long skipped = is.skip(3); - if (skipped != 3) + catch (IOException x) { throw new IOException("Could not skip 3 bytes, size changed?!"); } From a6356cf0b72b37f71eeab59d1178e9518bd58512 Mon Sep 17 00:00:00 2001 From: "valery.bokov" Date: Mon, 12 Jan 2026 18:31:17 +0100 Subject: [PATCH 2/2] support UTF8 with BOM and no BOM in TextToPDF.call --- .../org/apache/pdfbox/tools/TextToPDF.java | 31 ++++++++++++------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/tools/src/main/java/org/apache/pdfbox/tools/TextToPDF.java b/tools/src/main/java/org/apache/pdfbox/tools/TextToPDF.java index 0662f55a983..80622a94e86 100644 --- a/tools/src/main/java/org/apache/pdfbox/tools/TextToPDF.java +++ b/tools/src/main/java/org/apache/pdfbox/tools/TextToPDF.java @@ -20,7 +20,7 @@ import java.io.File; import java.io.FileInputStream; import java.io.IOException; -import java.io.InputStream; +import java.io.BufferedInputStream; import java.io.InputStreamReader; import java.io.PrintStream; import java.io.Reader; @@ -185,23 +185,30 @@ public Integer call() setTopMargin(margins[2]); setBottomMargin(margins[3]); - try (InputStream is = new FileInputStream(infile)) + try (BufferedInputStream is = new BufferedInputStream(new FileInputStream(infile))) { if (charset.equals(StandardCharsets.UTF_8)) { - try + final int readLimit = 3; + is.mark(readLimit); + + byte[] firstBytes = new byte[readLimit]; + if (is.read(firstBytes) != readLimit) { - // check for utf8 BOM - // FileInputStream doesn't support mark/reset - int b1 = is.read(); - int b2 = is.read(); - int b3 = is.read(); - //todo Here we can perform a check for file format corruption here. - boolean hasUtf8BOM = b1 == 0xEF && b2 == 0xBB && b3 == 0xBF; + throw new IOException("Could not read 3 bytes, size changed?!"); } - catch (IOException x) + + if (firstBytes[0] == (byte) 0xEF && + firstBytes[1] == (byte) 0xBB && + firstBytes[2] == (byte) 0xBF) + { + //UTF-8 with BOM + //3 bytes already read (skipped) + } + else { - throw new IOException("Could not skip 3 bytes, size changed?!"); + //It looks like UTF with no BOM or file was corrupted + is.reset(); } } try (Reader reader = new InputStreamReader(is, charset))