Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
import org.opendataloader.pdf.api.Config;

import java.io.File;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.Locale;
import java.util.Set;
Expand Down Expand Up @@ -59,6 +58,8 @@ public class CLIOptions {

private static final String USE_STRUCT_TREE_LONG_OPTION = "use-struct-tree";

private static final String TABLE_METHOD_OPTION = "table-method";

public static Options defineOptions() {
Options options = new Options();
Option contentSafetyOff = new Option(null, CONTENT_SAFETY_OFF_LONG_OPTION, true, "Disables one or more content safety filters. Accepts a comma-separated list of filter names. Arguments: all, hidden-text, off-page, tiny, hidden-ocg");
Expand Down Expand Up @@ -105,6 +106,9 @@ public static Options defineOptions() {
Option useStructTree = new Option(null, USE_STRUCT_TREE_LONG_OPTION, false, "Enable processing structure tree (disabled by default)");
useStructTree.setRequired(false);
options.addOption(useStructTree);
Option tableMethod = new Option(null, TABLE_METHOD_OPTION, true, "Enable specified table detection method. Accepts a comma-separated list of methods. Supported values: " + Config.getTableMethodOptions(","));
tableMethod.setRequired(false);
options.addOption(tableMethod);
return options;
}

Expand Down Expand Up @@ -150,14 +154,41 @@ public static Config createConfigFromCommandLine(CommandLine commandLine) {
}
applyContentSafetyOption(config, commandLine);
applyFormatOption(config, commandLine);
applyTableMethodOption(config, commandLine);
return config;
}

/**
 * Reads the {@code --table-method} option from the parsed command line and enables
 * each requested table detection method on the given configuration.
 * Leaves the configuration untouched when the option was not supplied.
 *
 * @param config      the configuration to update
 * @param commandLine the parsed command line
 * @throws IllegalArgumentException if the option carries no values or names an unsupported method
 */
private static void applyTableMethodOption(Config config, CommandLine commandLine) {
    if (commandLine.hasOption(TABLE_METHOD_OPTION)) {
        final String[] rawValues = commandLine.getOptionValues(TABLE_METHOD_OPTION);
        final boolean hasRawValues = rawValues != null && rawValues.length > 0;
        if (!hasRawValues) {
            throw new IllegalArgumentException(String.format("Option --table-method requires at least one value. Supported values: %s", Config.getTableMethodOptions(",")));
        }

        // The raw values may still collapse to nothing once parsed, so check again.
        final Set<String> methods = parseOptionValues(rawValues);
        if (methods.isEmpty()) {
            throw new IllegalArgumentException(String.format("Option --table-method requires at least one value. Supported values: %s", Config.getTableMethodOptions(",")));
        }

        for (String method : methods) {
            if (method.equals(Config.CLUSTER_TABLE_METHOD)) {
                config.setClusterTableMethod(true);
            } else {
                throw new IllegalArgumentException(String.format("Unsupported value '%s'. Supported values: %s", method, Config.getTableMethodOptions(",")));
            }
        }
    }
}

private static void applyContentSafetyOption(Config config, CommandLine commandLine) {
if (!commandLine.hasOption(CONTENT_SAFETY_OFF_LONG_OPTION)) {
return;
}


String[] optionValues = commandLine.getOptionValues(CONTENT_SAFETY_OFF_LONG_OPTION);
if (optionValues == null || optionValues.length == 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@
*/
package org.opendataloader.pdf.api;

import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
* Configuration class for the PDF processing.
* Use this class to specify output formats, text processing options, and other settings.
Expand All @@ -24,8 +28,16 @@ public class Config {
private boolean addImageToMarkdown = false;
private String replaceInvalidChars = " ";
private String outputFolder;
private boolean isClusterTableMethod = false;
private final FilterConfig filterConfig = new FilterConfig();

/** Name of the cluster table detection method, matched against the values of the table-method CLI option. */
public static final String CLUSTER_TABLE_METHOD = "cluster";
/**
* Names of the table detection methods, joined into text by {@code getTableMethodOptions}.
* NOTE(review): this set is public, mutable and non-final, so any caller can alter or replace it.
*/
public static Set<String> tableMethodOptions = new HashSet<String>();

// Registers the known table detection methods. Static initializers run in textual order,
// so this block must stay below the field initializer above.
static {
tableMethodOptions.add(CLUSTER_TABLE_METHOD);
}

/**
* Gets the filter config.
*
Expand Down Expand Up @@ -252,4 +264,31 @@ public boolean isUseStructTree() {
public void setUseStructTree(boolean useStructTree) {
this.useStructTree = useStructTree;
}

/**
* Indicates whether the cluster table detection method is enabled.
*
* @return {@code true} if the cluster table detection method is enabled, {@code false} otherwise.
*/
public boolean isClusterTableMethod() {
return isClusterTableMethod;
}

/**
* Enables or disables the cluster table detection method.
*
* @param isClusterTableMethod {@code true} to enable the cluster table detection method, {@code false} to disable it.
*/
public void setClusterTableMethod(boolean isClusterTableMethod) {
this.isClusterTableMethod = isClusterTableMethod;
}

/**
* Gets the table detection method names joined into a single string.
*
* @param delimiter The separator placed between consecutive method names.
* @return The method names joined with {@code delimiter}, in the iteration order of the backing set.
*/
public static String getTableMethodOptions(CharSequence delimiter) {
return String.join(delimiter, tableMethodOptions);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,9 @@ private static List<List<IObject>> processDocument(String inputPdfName, Config c
StaticContainers.getDocument().getArtifacts(pageNumber), pageNumber, config);
contents.add(pageContents);
}
new ClusterTableProcessor().processTables(contents);
if (config.isClusterTableMethod()) {
new ClusterTableProcessor().processTables(contents);
}
for (int pageNumber = 0; pageNumber < StaticContainers.getDocument().getNumberOfPages(); pageNumber++) {
List<IObject> pageContents = TableBorderProcessor.processTableBorders(contents.get(pageNumber), pageNumber);
pageContents = pageContents.stream().filter(x -> !(x instanceof LineChunk)).collect(Collectors.toList());
Expand Down
20 changes: 19 additions & 1 deletion node/opendataloader-pdf/src/cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ interface CliOptions {
keepLineBreaks?: boolean;
replaceInvalidChars?: string;
useStructTree?: boolean;
tableMethod?: string[];
}

const VALID_FORMATS = new Set([
Expand All @@ -31,6 +32,10 @@ const VALID_CONTENT_SAFETY_MODES = new Set([
'hidden-ocg',
]);

// Table detection methods accepted by --table-method; 'cluster' matches the value of the Java constant Config.CLUSTER_TABLE_METHOD.
// NOTE(review): "METHODES" is a misspelling of "METHODS"; a rename must also update the lookup in main().
const VALID_TABLE_METHODES = new Set([
'cluster',
]);

function createProgram(): Command {
const program = new Command();

Expand All @@ -51,7 +56,8 @@ function createProgram(): Command {
.option('--content-safety-off <mode...>', 'Disable one or more content safety filters')
.option('--keep-line-breaks', 'Preserve line breaks in text output')
.option('--replace-invalid-chars <c>', 'Replacement character for invalid characters')
.option('--use-struct-tree', 'Enable processing structure tree (disabled by default)');
.option('--use-struct-tree', 'Enable processing structure tree (disabled by default)')
.option('--table-method <method...>', 'Enable specified table detection method');

program.configureOutput({
writeErr: (str) => {
Expand Down Expand Up @@ -92,6 +98,9 @@ function buildConvertOptions(options: CliOptions): ConvertOptions {
if (options.useStructTree) {
convertOptions.useStructTree = true;
}
if (options.tableMethod && options.tableMethod.length) {
convertOptions.tableMethod = options.tableMethod;
}

return convertOptions;
}
Expand Down Expand Up @@ -139,6 +148,15 @@ async function main(): Promise<number> {
}
}
}
if (cliOptions.tableMethod) {
for (const value of cliOptions.tableMethod) {
if (!VALID_TABLE_METHODES.has(value)) {
console.error(`Invalid table method '${value}'. See '--help' for allowed values.`);
console.error("Use '--help' to see available options.");
return 1;
}
}
}

const convertOptions = buildConvertOptions(cliOptions);

Expand Down
8 changes: 8 additions & 0 deletions node/opendataloader-pdf/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ export interface RunOptions {
noJson?: boolean;
debug?: boolean;
useStructTree?: boolean;
tableMethod?: string;
}

export function run(inputPath: string, options: RunOptions = {}): Promise<string> {
Expand Down Expand Up @@ -134,6 +135,9 @@ export function run(inputPath: string, options: RunOptions = {}): Promise<string
if (options.useStructTree) {
args.push('--use-struct-tree')
}
if (options.tableMethod) {
args.push('--table-method', options.tableMethod)
}

args.push(inputPath);
executeJar(args, {
Expand All @@ -154,6 +158,7 @@ export interface ConvertOptions {
keepLineBreaks?: boolean;
replaceInvalidChars?: string;
useStructTree?: boolean;
tableMethod?: string[];
}

export function convert(inputPaths: string[], options: ConvertOptions = {}): Promise<string> {
Expand Down Expand Up @@ -192,6 +197,9 @@ export function convert(inputPaths: string[], options: ConvertOptions = {}): Pro
if (options.useStructTree) {
args.push('--use-struct-tree')
}
if (options.tableMethod && options.tableMethod.length > 0) {
args.push('--table-method', ...options.tableMethod)
}

return executeJar(args, {
streamOutput: !options.quiet,
Expand Down
13 changes: 13 additions & 0 deletions python/opendataloader-pdf/src/opendataloader_pdf/wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ def run(
no_json: bool = False,
debug: bool = False,
use_struct_tree = False,
table_method: str = None,
):
"""
Runs the opendataloader-pdf with the given arguments.
Expand All @@ -43,6 +44,7 @@ def run(
no_json: If True, disable the JSON output.
debug: If True, prints all messages from the CLI to the console during execution.
use_struct_tree: If True, enable processing structure tree (disabled by default)
table_method: Specified table detection method.

Raises:
FileNotFoundError: If the 'java' command is not found or input_path is invalid.
Expand Down Expand Up @@ -77,6 +79,8 @@ def run(
args.append("--no-json")
if use_struct_tree:
args.append("--use-struct-tree")
if table_method:
    # extend, not append: append would insert a nested list into the flat
    # list of string arguments passed to the CLI.
    args.extend(["--table-method", table_method])

# Run the command
run_jar(args, quiet=not debug)
Expand All @@ -92,6 +96,7 @@ def convert(
keep_line_breaks: bool = False,
replace_invalid_chars: Optional[str] = None,
use_struct_tree: bool = False,
table_method: Optional[List[str]] = None,
) -> None:
"""
Convert PDF(s) into the requested output format(s).
Expand All @@ -106,6 +111,7 @@ def convert(
keep_line_breaks: Preserve line breaks in text output
replace_invalid_chars: Replacement character for invalid/unrecognized characters
use_struct_tree: Enable processing structure tree (disabled by default)
table_method: Specified table detection method.
"""
args: List[str] = []
args.extend(input_path)
Expand All @@ -125,6 +131,8 @@ def convert(
args.extend(["--replace-invalid-chars", replace_invalid_chars])
if use_struct_tree:
    # append, not extend: extending with a str adds every character
    # ("-", "-", "u", ...) as a separate argument.
    args.append("--use-struct-tree")
if table_method:
args.extend(["--table-method", *table_method])

# Run the command
run_jar(args, quiet)
Expand Down Expand Up @@ -251,6 +259,11 @@ def main(argv=None) -> int:
action="store_true",
help="Enable processing structure tree (disabled by default)",
)
# The Java and Node CLIs spell this flag "--table-method"; accept that spelling
# and keep "--table_method" as an alias so existing invocations keep working.
parser.add_argument(
    "--table-method",
    "--table_method",
    dest="table_method",
    nargs="+",
    help="Enable specified table detection method. Accepts a comma-separated list of methods.",
)
args = parser.parse_args(argv)

try:
Expand Down