From 8b94778899a8030a60eb490499db6009ba30d09c Mon Sep 17 00:00:00 2001 From: Kakhnovich Raman Date: Fri, 5 Dec 2025 14:01:57 +0300 Subject: [PATCH 1/3] Add 'table-method' option --- .../opendataloader/pdf/cli/CLIOptions.java | 10 ++++- .../org/opendataloader/pdf/api/Config.java | 38 +++++++++++++++++++ .../pdf/processors/DocumentProcessor.java | 4 +- node/opendataloader-pdf/src/cli.ts | 7 +++- node/opendataloader-pdf/src/index.ts | 8 ++++ .../src/opendataloader_pdf/wrapper.py | 11 ++++++ 6 files changed, 75 insertions(+), 3 deletions(-) diff --git a/java/opendataloader-pdf-cli/src/main/java/org/opendataloader/pdf/cli/CLIOptions.java b/java/opendataloader-pdf-cli/src/main/java/org/opendataloader/pdf/cli/CLIOptions.java index 3ce519e7..6787ac30 100644 --- a/java/opendataloader-pdf-cli/src/main/java/org/opendataloader/pdf/cli/CLIOptions.java +++ b/java/opendataloader-pdf-cli/src/main/java/org/opendataloader/pdf/cli/CLIOptions.java @@ -59,6 +59,8 @@ public class CLIOptions { private static final String USE_STRUCT_TREE_LONG_OPTION = "use-struct-tree"; + private static final String TABLE_METHOD_OPTION = "table-method"; + public static Options defineOptions() { Options options = new Options(); Option contentSafetyOff = new Option(null, CONTENT_SAFETY_OFF_LONG_OPTION, true, "Disables one or more content safety filters. Accepts a comma-separated list of filter names. Arguments: all, hidden-text, off-page, tiny, hidden-ocg"); @@ -105,6 +107,9 @@ public static Options defineOptions() { Option useStructTree = new Option(null, USE_STRUCT_TREE_LONG_OPTION, false, "Enable processing structure tree (disabled by default)"); useStructTree.setRequired(false); options.addOption(useStructTree); + Option tableMethod = new Option(null, TABLE_METHOD_OPTION, true, "Enable specified table detection method. Supported values: " + Config.getTableMethodOptions(",")); + tableMethod.setRequired(false); + options.addOption(tableMethod); return options; } @@ -140,6 +145,9 @@ public static Config createConfigFromCommandLine(CommandLine commandLine) { if (commandLine.hasOption(CLIOptions.USE_STRUCT_TREE_LONG_OPTION)) { config.setUseStructTree(true); } + if (commandLine.hasOption(CLIOptions.TABLE_METHOD_OPTION)) { + config.setTableMethod(commandLine.getOptionValue(CLIOptions.TABLE_METHOD_OPTION)); + } if (commandLine.hasOption(CLIOptions.FOLDER_OPTION)) { config.setOutputFolder(commandLine.getOptionValue(CLIOptions.FOLDER_OPTION)); } else { @@ -157,7 +165,7 @@ private static void applyContentSafetyOption(Config config, CommandLine commandL if (!commandLine.hasOption(CONTENT_SAFETY_OFF_LONG_OPTION)) { return; } - + String[] optionValues = commandLine.getOptionValues(CONTENT_SAFETY_OFF_LONG_OPTION); if (optionValues == null || optionValues.length == 0) { diff --git a/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/api/Config.java b/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/api/Config.java index e873af73..c46756ef 100644 --- a/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/api/Config.java +++ b/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/api/Config.java @@ -7,6 +7,9 @@ */ package org.opendataloader.pdf.api; +import java.util.HashSet; +import java.util.Set; + /** * Configuration class for the PDF processing. * Use this class to specify output formats, text processing options, and other settings. @@ -24,8 +27,16 @@ public class Config { private boolean addImageToMarkdown = false; private String replaceInvalidChars = " "; private String outputFolder; + private String tableMethod; private final FilterConfig filterConfig = new FilterConfig(); + public static final String CLUSTER_TABLE_METHOD = "cluster"; + public static Set tableMethodOptions = new HashSet(); + + static { + tableMethodOptions.add(CLUSTER_TABLE_METHOD); + } + /** * Gets the filter config. * @@ -252,4 +263,31 @@ public boolean isUseStructTree() { public void setUseStructTree(boolean useStructTree) { this.useStructTree = useStructTree; } + + /** + * Gets the method of table detection. + * + * @return The specified method. + */ + public String getTableMethod() { + return tableMethod; + } + + /** + * Sets the method of table detection. + * + * @param tableMethod The specified method. + */ + public void setTableMethod(String tableMethod) { + this.tableMethod = tableMethod; + } + + /** + * Gets the list of methods of table detection. + * + * @return The string with methods separated by @param delimiter. + */ + public static String getTableMethodOptions(CharSequence delimiter) { + return String.join(delimiter, tableMethodOptions); + } } diff --git a/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java b/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java index ac0ddaf0..08eb87d0 100644 --- a/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java +++ b/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java @@ -62,7 +62,9 @@ private static List> processDocument(String inputPdfName, Config c StaticContainers.getDocument().getArtifacts(pageNumber), pageNumber, config); contents.add(pageContents); } - new ClusterTableProcessor().processTables(contents); + if (Config.CLUSTER_TABLE_METHOD.equals(config.getTableMethod())) { + new ClusterTableProcessor().processTables(contents); + } for (int pageNumber = 0; pageNumber < StaticContainers.getDocument().getNumberOfPages(); pageNumber++) { List pageContents = TableBorderProcessor.processTableBorders(contents.get(pageNumber), pageNumber); pageContents = pageContents.stream().filter(x -> !(x instanceof LineChunk)).collect(Collectors.toList()); diff --git a/node/opendataloader-pdf/src/cli.ts b/node/opendataloader-pdf/src/cli.ts index 1cb77c3c..3b97692b 100644 --- a/node/opendataloader-pdf/src/cli.ts +++ b/node/opendataloader-pdf/src/cli.ts @@ -11,6 +11,7 @@ interface CliOptions { keepLineBreaks?: boolean; replaceInvalidChars?: string; useStructTree?: boolean; + tableMethod?: string; } const VALID_FORMATS = new Set([ @@ -51,7 +52,8 @@ function createProgram(): Command { .option('--content-safety-off ', 'Disable one or more content safety filters') .option('--keep-line-breaks', 'Preserve line breaks in text output') .option('--replace-invalid-chars ', 'Replacement character for invalid characters') - .option('--use-struct-tree', 'Enable processing structure tree (disabled by default)'); + .option('--use-struct-tree', 'Enable processing structure tree (disabled by default)') + .option('--table-method', 'Enable specified table detection method'); program.configureOutput({ writeErr: (str) => { @@ -92,6 +94,9 @@ function buildConvertOptions(options: CliOptions): ConvertOptions { if (options.useStructTree) { convertOptions.useStructTree = true; } + if (options.tableMethod) { + convertOptions.tableMethod = options.tableMethod; + } return convertOptions; } diff --git a/node/opendataloader-pdf/src/index.ts b/node/opendataloader-pdf/src/index.ts index 36a1e914..27eb1de6 100644 --- a/node/opendataloader-pdf/src/index.ts +++ b/node/opendataloader-pdf/src/index.ts @@ -89,6 +89,7 @@ export interface RunOptions { noJson?: boolean; debug?: boolean; useStructTree?: boolean; + tableMethod?: string; } export function run(inputPath: string, options: RunOptions = {}): Promise { @@ -134,6 +135,9 @@ export function run(inputPath: string, options: RunOptions = {}): Promise { @@ -192,6 +197,9 @@ export function convert(inputPaths: string[], options: ConvertOptions = {}): Pro if (options.useStructTree) { args.push('--use-struct-tree') } + if (options.tableMethod) { + args.push('--table-method') + } return executeJar(args, { streamOutput: !options.quiet, diff --git a/python/opendataloader-pdf/src/opendataloader_pdf/wrapper.py b/python/opendataloader-pdf/src/opendataloader_pdf/wrapper.py index 1aeda5ae..9c365492 100644 --- a/python/opendataloader-pdf/src/opendataloader_pdf/wrapper.py +++ b/python/opendataloader-pdf/src/opendataloader_pdf/wrapper.py @@ -25,6 +25,7 @@ def run( no_json: bool = False, debug: bool = False, use_struct_tree = False, + table_method: str = None, ): """ Runs the opendataloader-pdf with the given arguments. @@ -43,6 +44,7 @@ def run( no_json: If True, disable the JSON output. debug: If True, prints all messages from the CLI to the console during execution. use_struct_tree: If True, enable processing structure tree (disabled by default) + table_method: Specified table detection method. Raises: FileNotFoundError: If the 'java' command is not found or input_path is invalid. @@ -77,6 +79,8 @@ def run( args.append("--no-json") if use_struct_tree: args.append("--use-struct-tree") + if table_method: + args.append(["--table-method", table_method]) # Run the command run_jar(args, quiet=not debug) @@ -92,6 +96,7 @@ def convert( keep_line_breaks: bool = False, replace_invalid_chars: Optional[str] = None, use_struct_tree: bool = False, + table_method: Optional[str] = None, ) -> None: """ Convert PDF(s) into the requested output format(s). @@ -125,6 +130,8 @@ def convert( args.extend(["--replace-invalid-chars", replace_invalid_chars]) if use_struct_tree: args.extend("--use-struct-tree") + if table_method: + args.extend(["--table-method", table_method]) # Run the command run_jar(args, quiet) @@ -251,6 +258,10 @@ def main(argv=None) -> int: action="store_true", help="Enable processing structure tree (disabled by default)", ) + parser.add_argument( + "--table_method", + help="Enable specified table detection method", + ) args = parser.parse_args(argv) try: From 0dd29aa3ccaa9e86ced46dffd1c11ec5fef2b630 Mon Sep 17 00:00:00 2001 From: Kakhnovich Raman Date: Fri, 5 Dec 2025 15:25:39 +0300 Subject: [PATCH 2/3] Update 'table-method' option to take multiple arguments --- .../opendataloader/pdf/cli/CLIOptions.java | 32 ++++++++++++++++--- .../org/opendataloader/pdf/api/Config.java | 13 ++++---- .../pdf/processors/DocumentProcessor.java | 2 +- node/opendataloader-pdf/src/cli.ts | 19 +++++++++-- node/opendataloader-pdf/src/index.ts | 8 ++--- .../src/opendataloader_pdf/wrapper.py | 9 ++++-- 6 files changed, 63 insertions(+), 20 deletions(-) diff --git a/java/opendataloader-pdf-cli/src/main/java/org/opendataloader/pdf/cli/CLIOptions.java b/java/opendataloader-pdf-cli/src/main/java/org/opendataloader/pdf/cli/CLIOptions.java index 6787ac30..924a630b 100644 --- a/java/opendataloader-pdf-cli/src/main/java/org/opendataloader/pdf/cli/CLIOptions.java +++ b/java/opendataloader-pdf-cli/src/main/java/org/opendataloader/pdf/cli/CLIOptions.java @@ -13,7 +13,6 @@ import org.opendataloader.pdf.api.Config; import java.io.File; -import java.util.ArrayList; import java.util.LinkedHashSet; import java.util.Locale; import java.util.Set; @@ -27,6 +26,7 @@ public class CLIOptions { private static final String CONTENT_SAFETY_OFF_TINY_TEXT_ARGUMENT = "tiny"; private static final String CONTENT_SAFETY_OFF_HIDDEN_OCG_ARGUMENT = "hidden-ocg"; private static final String CONTENT_SAFETY_OFF_SUPPORTED_LIST = "all, hidden-text, off-page, tiny, hidden-ocg"; + public static final String CLUSTER_TABLE_METHOD_ARGUMENT = "cluster"; public static final String PASSWORD_OPTION = "p"; private static final String PASSWORD_LONG_OPTION = "password"; @@ -145,9 +145,6 @@ public static Config createConfigFromCommandLine(CommandLine commandLine) { if (commandLine.hasOption(CLIOptions.USE_STRUCT_TREE_LONG_OPTION)) { config.setUseStructTree(true); } - if (commandLine.hasOption(CLIOptions.TABLE_METHOD_OPTION)) { - config.setTableMethod(commandLine.getOptionValue(CLIOptions.TABLE_METHOD_OPTION)); - } if (commandLine.hasOption(CLIOptions.FOLDER_OPTION)) { config.setOutputFolder(commandLine.getOptionValue(CLIOptions.FOLDER_OPTION)); } else { @@ -158,9 +155,36 @@ public static Config createConfigFromCommandLine(CommandLine commandLine) { } applyContentSafetyOption(config, commandLine); applyFormatOption(config, commandLine); + applyTableMethodOption(config, commandLine); return config; } + private static void applyTableMethodOption(Config config, CommandLine commandLine) { + if (!commandLine.hasOption(TABLE_METHOD_OPTION)) { + return; + } + + String[] optionValues = commandLine.getOptionValues(TABLE_METHOD_OPTION); + if (optionValues == null || optionValues.length == 0) { + throw new IllegalArgumentException(String.format("Option --table-method requires at least one value. Supported values: %s", Config.getTableMethodOptions(","))); + } + + Set values = parseOptionValues(optionValues); + if (values.isEmpty()) { + throw new IllegalArgumentException(String.format("Option --table-method requires at least one value. Supported values: %s", Config.getTableMethodOptions(","))); + } + + for (String value : values) { + switch (value) { + case CLUSTER_TABLE_METHOD_ARGUMENT: + config.setClusterTableMethod(true); + break; + default: + throw new IllegalArgumentException(String.format("Unsupported value '%s'. Supported values: %s", value, Config.getTableMethodOptions(","))); + } + } + } + private static void applyContentSafetyOption(Config config, CommandLine commandLine) { if (!commandLine.hasOption(CONTENT_SAFETY_OFF_LONG_OPTION)) { return; diff --git a/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/api/Config.java b/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/api/Config.java index c46756ef..c2bb0aa7 100644 --- a/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/api/Config.java +++ b/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/api/Config.java @@ -8,6 +8,7 @@ package org.opendataloader.pdf.api; import java.util.HashSet; +import java.util.List; import java.util.Set; /** @@ -27,7 +28,7 @@ public class Config { private boolean addImageToMarkdown = false; private String replaceInvalidChars = " "; private String outputFolder; - private String tableMethod; + private boolean isClusterTableMethod = false; private final FilterConfig filterConfig = new FilterConfig(); public static final String CLUSTER_TABLE_METHOD = "cluster"; @@ -269,17 +270,17 @@ public void setUseStructTree(boolean useStructTree) { * * @return The specified method. */ - public String getTableMethod() { - return tableMethod; + public boolean isClusterTableMethod() { + return isClusterTableMethod; } /** * Sets the method of table detection. * - * @param tableMethod The specified method. + * @param isClusterTableMethod The specified method. */ - public void setTableMethod(String tableMethod) { - this.tableMethod = tableMethod; + public void setClusterTableMethod(boolean isClusterTableMethod) { + this.isClusterTableMethod = isClusterTableMethod; } /** diff --git a/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java b/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java index 08eb87d0..d310cac5 100644 --- a/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java +++ b/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java @@ -62,7 +62,7 @@ private static List> processDocument(String inputPdfName, Config c StaticContainers.getDocument().getArtifacts(pageNumber), pageNumber, config); contents.add(pageContents); } - if (Config.CLUSTER_TABLE_METHOD.equals(config.getTableMethod())) { + if (config.isClusterTableMethod()) { new ClusterTableProcessor().processTables(contents); } for (int pageNumber = 0; pageNumber < StaticContainers.getDocument().getNumberOfPages(); pageNumber++) { diff --git a/node/opendataloader-pdf/src/cli.ts b/node/opendataloader-pdf/src/cli.ts index 3b97692b..6d12cb6f 100644 --- a/node/opendataloader-pdf/src/cli.ts +++ b/node/opendataloader-pdf/src/cli.ts @@ -11,7 +11,7 @@ interface CliOptions { keepLineBreaks?: boolean; replaceInvalidChars?: string; useStructTree?: boolean; - tableMethod?: string; + tableMethod?: string[]; } const VALID_FORMATS = new Set([ @@ -32,6 +32,10 @@ const VALID_CONTENT_SAFETY_MODES = new Set([ 'hidden-ocg', ]); +const VALID_TABLE_METHODES = new Set([ + 'cluster', +]); + function createProgram(): Command { const program = new Command(); @@ -53,7 +57,7 @@ function createProgram(): Command { .option('--keep-line-breaks', 'Preserve line breaks in text output') .option('--replace-invalid-chars ', 'Replacement character for invalid characters') .option('--use-struct-tree', 'Enable processing structure tree (disabled by default)') - .option('--table-method', 'Enable specified table detection method'); + .option('--table-method ', 'Enable specified table detection method'); program.configureOutput({ writeErr: (str) => { @@ -94,7 +98,7 @@ function buildConvertOptions(options: CliOptions): ConvertOptions { if (options.useStructTree) { convertOptions.useStructTree = true; } - if (options.tableMethod) { + if (options.tableMethod && options.tableMethod.length) { convertOptions.tableMethod = options.tableMethod; } @@ -144,6 +148,15 @@ async function main(): Promise { } } } + if (cliOptions.tableMethod) { + for (const value of cliOptions.tableMethod) { + if (!VALID_TABLE_METHODES.has(value)) { + console.error(`Invalid table method '${value}'. See '--help' for allowed values.`); + console.error("Use '--help' to see available options."); + return 1; + } + } + } const convertOptions = buildConvertOptions(cliOptions); diff --git a/node/opendataloader-pdf/src/index.ts b/node/opendataloader-pdf/src/index.ts index 27eb1de6..77c82dd4 100644 --- a/node/opendataloader-pdf/src/index.ts +++ b/node/opendataloader-pdf/src/index.ts @@ -136,7 +136,7 @@ export function run(inputPath: string, options: RunOptions = {}): Promise { @@ -197,8 +197,8 @@ export function convert(inputPaths: string[], options: ConvertOptions = {}): Pro if (options.useStructTree) { args.push('--use-struct-tree') } - if (options.tableMethod) { - args.push('--table-method') + if (options.tableMethod && options.tableMethod.length > 0) { + args.push('--table-method', ...options.tableMethod) } return executeJar(args, { diff --git a/python/opendataloader-pdf/src/opendataloader_pdf/wrapper.py b/python/opendataloader-pdf/src/opendataloader_pdf/wrapper.py index 9c365492..fd599ce9 100644 --- a/python/opendataloader-pdf/src/opendataloader_pdf/wrapper.py +++ b/python/opendataloader-pdf/src/opendataloader_pdf/wrapper.py @@ -96,7 +96,7 @@ def convert( keep_line_breaks: bool = False, replace_invalid_chars: Optional[str] = None, use_struct_tree: bool = False, - table_method: Optional[str] = None, + table_method: Optional[List[str]] = None, ) -> None: """ Convert PDF(s) into the requested output format(s). @@ -111,6 +111,7 @@ def convert( keep_line_breaks: Preserve line breaks in text output replace_invalid_chars: Replacement character for invalid/unrecognized characters use_struct_tree: Enable processing structure tree (disabled by default) + table_method: Specified table detection method. """ args: List[str] = [] args.extend(input_path) @@ -131,7 +132,7 @@ def convert( if use_struct_tree: args.extend("--use-struct-tree") if table_method: - args.extend(["--table-method", table_method]) + args.extend(["--table-method", *table_method]) # Run the command run_jar(args, quiet) @@ -260,6 +261,10 @@ def main(argv=None) -> int: ) parser.add_argument( "--table_method", + nargs="+", + choices=[ + "cluster", + ], help="Enable specified table detection method", ) args = parser.parse_args(argv) From b20327b499c2cb748862f45b2ae91b6e20726627 Mon Sep 17 00:00:00 2001 From: Kakhnovich Raman Date: Fri, 5 Dec 2025 19:04:12 +0300 Subject: [PATCH 3/3] Update description of 'table-method' option --- .../src/main/java/org/opendataloader/pdf/cli/CLIOptions.java | 5 ++--- python/opendataloader-pdf/src/opendataloader_pdf/wrapper.py | 5 +---- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/java/opendataloader-pdf-cli/src/main/java/org/opendataloader/pdf/cli/CLIOptions.java b/java/opendataloader-pdf-cli/src/main/java/org/opendataloader/pdf/cli/CLIOptions.java index 924a630b..595354ea 100644 --- a/java/opendataloader-pdf-cli/src/main/java/org/opendataloader/pdf/cli/CLIOptions.java +++ b/java/opendataloader-pdf-cli/src/main/java/org/opendataloader/pdf/cli/CLIOptions.java @@ -26,7 +26,6 @@ public class CLIOptions { private static final String CONTENT_SAFETY_OFF_TINY_TEXT_ARGUMENT = "tiny"; private static final String CONTENT_SAFETY_OFF_HIDDEN_OCG_ARGUMENT = "hidden-ocg"; private static final String CONTENT_SAFETY_OFF_SUPPORTED_LIST = "all, hidden-text, off-page, tiny, hidden-ocg"; - public static final String CLUSTER_TABLE_METHOD_ARGUMENT = "cluster"; public static final String PASSWORD_OPTION = "p"; private static final String PASSWORD_LONG_OPTION = "password"; @@ -107,7 +106,7 @@ public static Options defineOptions() { Option useStructTree = new Option(null, USE_STRUCT_TREE_LONG_OPTION, false, "Enable processing structure tree (disabled by default)"); useStructTree.setRequired(false); options.addOption(useStructTree); - Option tableMethod = new Option(null, TABLE_METHOD_OPTION, true, "Enable specified table detection method. Supported values: " + Config.getTableMethodOptions(",")); + Option tableMethod = new Option(null, TABLE_METHOD_OPTION, true, "Enable specified table detection method. Accepts a comma-separated list of methods. Supported values: " + Config.getTableMethodOptions(",")); tableMethod.setRequired(false); options.addOption(tableMethod); return options; @@ -176,7 +175,7 @@ private static void applyTableMethodOption(Config config, CommandLine commandLin for (String value : values) { switch (value) { - case CLUSTER_TABLE_METHOD_ARGUMENT: + case Config.CLUSTER_TABLE_METHOD: config.setClusterTableMethod(true); break; default: diff --git a/python/opendataloader-pdf/src/opendataloader_pdf/wrapper.py b/python/opendataloader-pdf/src/opendataloader_pdf/wrapper.py index fd599ce9..1df9776e 100644 --- a/python/opendataloader-pdf/src/opendataloader_pdf/wrapper.py +++ b/python/opendataloader-pdf/src/opendataloader_pdf/wrapper.py @@ -262,10 +262,7 @@ def main(argv=None) -> int: parser.add_argument( "--table_method", nargs="+", - choices=[ - "cluster", - ], - help="Enable specified table detection method", + help="Enable specified table detection method. Accepts a comma-separated list of methods.", ) args = parser.parse_args(argv)