diff --git a/java/opendataloader-pdf-cli/src/main/java/org/opendataloader/pdf/cli/CLIOptions.java b/java/opendataloader-pdf-cli/src/main/java/org/opendataloader/pdf/cli/CLIOptions.java index 3ce519e7..595354ea 100644 --- a/java/opendataloader-pdf-cli/src/main/java/org/opendataloader/pdf/cli/CLIOptions.java +++ b/java/opendataloader-pdf-cli/src/main/java/org/opendataloader/pdf/cli/CLIOptions.java @@ -13,7 +13,6 @@ import org.opendataloader.pdf.api.Config; import java.io.File; -import java.util.ArrayList; import java.util.LinkedHashSet; import java.util.Locale; import java.util.Set; @@ -59,6 +58,8 @@ public class CLIOptions { private static final String USE_STRUCT_TREE_LONG_OPTION = "use-struct-tree"; + private static final String TABLE_METHOD_OPTION = "table-method"; + public static Options defineOptions() { Options options = new Options(); Option contentSafetyOff = new Option(null, CONTENT_SAFETY_OFF_LONG_OPTION, true, "Disables one or more content safety filters. Accepts a comma-separated list of filter names. Arguments: all, hidden-text, off-page, tiny, hidden-ocg"); @@ -105,6 +106,9 @@ public static Options defineOptions() { Option useStructTree = new Option(null, USE_STRUCT_TREE_LONG_OPTION, false, "Enable processing structure tree (disabled by default)"); useStructTree.setRequired(false); options.addOption(useStructTree); + Option tableMethod = new Option(null, TABLE_METHOD_OPTION, true, "Enable specified table detection method. Accepts a comma-separated list of methods. 
Supported values: " + Config.getTableMethodOptions(",")); + tableMethod.setRequired(false); + options.addOption(tableMethod); return options; } @@ -150,14 +154,41 @@ public static Config createConfigFromCommandLine(CommandLine commandLine) { } applyContentSafetyOption(config, commandLine); applyFormatOption(config, commandLine); + applyTableMethodOption(config, commandLine); return config; } + private static void applyTableMethodOption(Config config, CommandLine commandLine) { + if (!commandLine.hasOption(TABLE_METHOD_OPTION)) { + return; + } + + String[] optionValues = commandLine.getOptionValues(TABLE_METHOD_OPTION); + if (optionValues == null || optionValues.length == 0) { + throw new IllegalArgumentException(String.format("Option --table-method requires at least one value. Supported values: %s", Config.getTableMethodOptions(","))); + } + + Set<String> values = parseOptionValues(optionValues); + if (values.isEmpty()) { + throw new IllegalArgumentException(String.format("Option --table-method requires at least one value. Supported values: %s", Config.getTableMethodOptions(","))); + } + + for (String value : values) { + switch (value) { + case Config.CLUSTER_TABLE_METHOD: + config.setClusterTableMethod(true); + break; + default: + throw new IllegalArgumentException(String.format("Unsupported value '%s'. 
Supported values: %s", value, Config.getTableMethodOptions(","))); + } + } + } + private static void applyContentSafetyOption(Config config, CommandLine commandLine) { if (!commandLine.hasOption(CONTENT_SAFETY_OFF_LONG_OPTION)) { return; } - + String[] optionValues = commandLine.getOptionValues(CONTENT_SAFETY_OFF_LONG_OPTION); if (optionValues == null || optionValues.length == 0) { diff --git a/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/api/Config.java b/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/api/Config.java index e873af73..c2bb0aa7 100644 --- a/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/api/Config.java +++ b/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/api/Config.java @@ -7,6 +7,10 @@ */ package org.opendataloader.pdf.api; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + /** * Configuration class for the PDF processing. * Use this class to specify output formats, text processing options, and other settings. @@ -24,8 +28,16 @@ public class Config { private boolean addImageToMarkdown = false; private String replaceInvalidChars = " "; private String outputFolder; + private boolean isClusterTableMethod = false; private final FilterConfig filterConfig = new FilterConfig(); + public static final String CLUSTER_TABLE_METHOD = "cluster"; + public static final Set<String> tableMethodOptions = new HashSet<>(); + + static { + tableMethodOptions.add(CLUSTER_TABLE_METHOD); + } + /** * Gets the filter config. * @@ -252,4 +264,31 @@ public boolean isUseStructTree() { public void setUseStructTree(boolean useStructTree) { this.useStructTree = useStructTree; } + + /** + * Checks whether the cluster table detection method is enabled. + * + * @return {@code true} if the cluster method is enabled, {@code false} otherwise. + */ + public boolean isClusterTableMethod() { + return isClusterTableMethod; + } + + /** + * Enables or disables the cluster table detection method. + * + * @param isClusterTableMethod {@code true} to enable the cluster method. 
+ */ + public void setClusterTableMethod(boolean isClusterTableMethod) { + this.isClusterTableMethod = isClusterTableMethod; + } + + /** + * Gets the supported table detection methods as a single string. + * @param delimiter The separator placed between method names. + * @return The supported method names joined by {@code delimiter}. + */ + public static String getTableMethodOptions(CharSequence delimiter) { + return String.join(delimiter, tableMethodOptions); + } } diff --git a/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java b/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java index ac0ddaf0..d310cac5 100644 --- a/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java +++ b/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/DocumentProcessor.java @@ -62,7 +62,9 @@ private static List> processDocument(String inputPdfName, Config c StaticContainers.getDocument().getArtifacts(pageNumber), pageNumber, config); contents.add(pageContents); } - new ClusterTableProcessor().processTables(contents); + if (config.isClusterTableMethod()) { + new ClusterTableProcessor().processTables(contents); + } for (int pageNumber = 0; pageNumber < StaticContainers.getDocument().getNumberOfPages(); pageNumber++) { List pageContents = TableBorderProcessor.processTableBorders(contents.get(pageNumber), pageNumber); pageContents = pageContents.stream().filter(x -> !(x instanceof LineChunk)).collect(Collectors.toList()); diff --git a/node/opendataloader-pdf/src/cli.ts b/node/opendataloader-pdf/src/cli.ts index 1cb77c3c..6d12cb6f 100644 --- a/node/opendataloader-pdf/src/cli.ts +++ b/node/opendataloader-pdf/src/cli.ts @@ -11,6 +11,7 @@ interface CliOptions { keepLineBreaks?: boolean; replaceInvalidChars?: string; useStructTree?: boolean; + tableMethod?: string[]; } const VALID_FORMATS = new Set([ @@ -31,6 +32,10 @@ const VALID_CONTENT_SAFETY_MODES = new Set([ 'hidden-ocg', ]); +const VALID_TABLE_METHODS 
= new Set([ + 'cluster', +]); + function createProgram(): Command { const program = new Command(); @@ -51,7 +56,8 @@ function createProgram(): Command { .option('--content-safety-off ', 'Disable one or more content safety filters') .option('--keep-line-breaks', 'Preserve line breaks in text output') .option('--replace-invalid-chars ', 'Replacement character for invalid characters') - .option('--use-struct-tree', 'Enable processing structure tree (disabled by default)'); + .option('--use-struct-tree', 'Enable processing structure tree (disabled by default)') + .option('--table-method ', 'Enable specified table detection method'); program.configureOutput({ writeErr: (str) => { @@ -92,6 +98,9 @@ function buildConvertOptions(options: CliOptions): ConvertOptions { if (options.useStructTree) { convertOptions.useStructTree = true; } + if (options.tableMethod && options.tableMethod.length) { + convertOptions.tableMethod = options.tableMethod; + } return convertOptions; } @@ -139,6 +148,15 @@ async function main(): Promise { } } } + if (cliOptions.tableMethod) { + for (const value of cliOptions.tableMethod) { + if (!VALID_TABLE_METHODS.has(value)) { + console.error(`Invalid table method '${value}'. 
See '--help' for allowed values.`); + console.error("Use '--help' to see available options."); + return 1; + } + } + } const convertOptions = buildConvertOptions(cliOptions); diff --git a/node/opendataloader-pdf/src/index.ts b/node/opendataloader-pdf/src/index.ts index 36a1e914..77c82dd4 100644 --- a/node/opendataloader-pdf/src/index.ts +++ b/node/opendataloader-pdf/src/index.ts @@ -89,6 +89,7 @@ export interface RunOptions { noJson?: boolean; debug?: boolean; useStructTree?: boolean; + tableMethod?: string; } export function run(inputPath: string, options: RunOptions = {}): Promise { @@ -134,6 +135,9 @@ export function run(inputPath: string, options: RunOptions = {}): Promise { @@ -192,6 +197,9 @@ export function convert(inputPaths: string[], options: ConvertOptions = {}): Pro if (options.useStructTree) { args.push('--use-struct-tree') } + if (options.tableMethod && options.tableMethod.length > 0) { + args.push('--table-method', options.tableMethod.join(',')) + } return executeJar(args, { streamOutput: !options.quiet, diff --git a/python/opendataloader-pdf/src/opendataloader_pdf/wrapper.py b/python/opendataloader-pdf/src/opendataloader_pdf/wrapper.py index 1aeda5ae..1df9776e 100644 --- a/python/opendataloader-pdf/src/opendataloader_pdf/wrapper.py +++ b/python/opendataloader-pdf/src/opendataloader_pdf/wrapper.py @@ -25,6 +25,7 @@ def run( no_json: bool = False, debug: bool = False, use_struct_tree = False, + table_method: Optional[str] = None, ): """ Runs the opendataloader-pdf with the given arguments. @@ -43,6 +44,7 @@ def run( no_json: If True, disable the JSON output. debug: If True, prints all messages from the CLI to the console during execution. use_struct_tree: If True, enable processing structure tree (disabled by default) + table_method: Table detection method to enable (for example "cluster"). Raises: FileNotFoundError: If the 'java' command is not found or input_path is invalid. 
@@ -77,6 +79,8 @@ def run( args.append("--no-json") if use_struct_tree: args.append("--use-struct-tree") + if table_method: + args.extend(["--table-method", table_method]) # Run the command run_jar(args, quiet=not debug) @@ -92,6 +96,7 @@ def convert( keep_line_breaks: bool = False, replace_invalid_chars: Optional[str] = None, use_struct_tree: bool = False, + table_method: Optional[List[str]] = None, ) -> None: """ Convert PDF(s) into the requested output format(s). @@ -106,6 +111,7 @@ def convert( keep_line_breaks: Preserve line breaks in text output replace_invalid_chars: Replacement character for invalid/unrecognized characters use_struct_tree: Enable processing structure tree (disabled by default) + table_method: Table detection methods to enable (for example ["cluster"]); passed to the CLI as one comma-separated value. """ args: List[str] = [] args.extend(input_path) @@ -125,6 +131,8 @@ def convert( args.extend(["--replace-invalid-chars", replace_invalid_chars]) if use_struct_tree: - args.extend("--use-struct-tree") + args.append("--use-struct-tree") + if table_method: + args.extend(["--table-method", ",".join(table_method)]) # Run the command run_jar(args, quiet) @@ -251,6 +259,11 @@ def main(argv=None) -> int: action="store_true", help="Enable processing structure tree (disabled by default)", ) + parser.add_argument( + "--table-method", "--table_method", + nargs="+", + help="Enable specified table detection method. Accepts one or more methods.", + ) args = parser.parse_args(argv) try: