Skip to content

Commit 1638114

Browse files
committed
docs: cli args only allow input of a single string separated by ","
1 parent 0700585 commit 1638114

File tree

3 files changed

+44
-42
lines changed

3 files changed

+44
-42
lines changed

README.md

Lines changed: 14 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -81,14 +81,14 @@ import opendataloader_pdf
8181
opendataloader_pdf.convert(
8282
input_path=["path/to/document.pdf", "path/to/folder"],
8383
output_dir="path/to/output",
84-
format=["json", "html", "pdf", "markdown"]
84+
format="json,html,pdf,markdown"
8585
)
8686
```
8787

8888
To run it via the CLI, use the following command in the terminal:
8989

9090
```bash
91-
opendataloader-pdf path/to/document.pdf path/to/folder -o path/to/output -f json html pdf markdown
91+
opendataloader-pdf path/to/document.pdf path/to/folder -o path/to/output -f json,html,pdf,markdown
9292
```
9393

9494
### Function: convert()
@@ -100,9 +100,9 @@ The main function to process PDFs.
100100
| `input_path` | `List[str]` | ✅ Yes || One or more PDF file paths or directories to process. |
101101
| `output_dir` | `Optional[str]` | No | input folder | Directory where outputs are written. |
102102
| `password` | `Optional[str]` | No | `None` | Password used for encrypted PDFs. |
103-
| `format` | `Optional[List[str]]` | No | `None` | Output formats to generate (e.g. `"json"`, `"html"`, `"pdf"`, `"text"`, `"markdown"`, `"markdown-with-html"`, `"markdown-with-images"`). |
103+
| `format` | `Optional[Union[str, List[str]]]` | No | `None` | Output formats to generate, given as a comma-separated string or a list of strings. (json, text, html, pdf, markdown, markdown-with-html, markdown-with-images) |
104104
| `quiet` | `bool` | No | `False` | Suppresses CLI logging output when `True`. |
105-
| `content_safety_off` | `Optional[List[str]]` | No | `None` | List of content safety filters to disable (e.g. `"all"`, `"hidden-text"`, `"off-page"`, `"tiny"`, `"hidden-ocg"`). |
105+
| `content_safety_off` | `Optional[Union[str, List[str]]]` | No | `None` | Content safety filters to disable, given as a comma-separated string or a list of strings. (all, hidden-text, off-page, tiny, hidden-ocg) |
106106
| `keep_line_breaks` | `bool` | No | `False` | Preserves line breaks in text output when `True`. |
107107
| `replace_invalid_chars` | `Optional[str]` | No | `None` | Replacement character for invalid or unrecognized characters (e.g., �, `\u0000`). |
108108
| `use_struct_tree` | `bool` | No | `False` | Enable processing structure tree (disabled by default). |
@@ -138,7 +138,7 @@ async function main() {
138138
try {
139139
await convert(['path/to/document.pdf', 'path/to/folder'], {
140140
outputDir: 'path/to/output',
141-
format: ['json', 'html', 'pdf', 'markdown'],
141+
format: 'json,html,pdf,markdown',
142142
});
143143
console.log('convert() complete');
144144
} catch (error) {
@@ -159,9 +159,9 @@ Multi-input helper matching the Python wrapper.
159159
| `inputPaths` | `string[]` || One or more file paths or directories to process. |
160160
| `options.outputDir` | `string` | `undefined` | Directory where outputs are written. |
161161
| `options.password` | `string` | `undefined` | Password for encrypted PDFs. |
162-
| `options.format` | `string[]` | `undefined` | Output formats (any combination of `json`, `text`, `html`, `pdf`, `markdown`, `markdown-with-html`, `markdown-with-images`). |
162+
| `options.format` | `string \| string[]` | `undefined` | Output formats to generate, given as a comma-separated string or an array of strings. (json, text, html, pdf, markdown, markdown-with-html, markdown-with-images) |
163163
| `options.quiet` | `boolean` | `false` | Suppress CLI logging output and prevent streaming. |
164-
| `options.contentSafetyOff` | `string[]` | `undefined` | Disable one or more content safety filters (`all`, `hidden-text`, `off-page`, `tiny`, `hidden-ocg`). |
164+
| `options.contentSafetyOff` | `string \| string[]` | `undefined` | Content safety filters to disable, given as a comma-separated string or an array of strings. (all, hidden-text, off-page, tiny, hidden-ocg) |
165165
| `options.keepLineBreaks` | `boolean` | `false` | Preserve line breaks in text output. |
166166
| `options.replaceInvalidChars` | `string` | `undefined` | Replacement character for invalid or unrecognized characters. |
167167
| `options.useStructTree` | `boolean` | `false` | Enable processing structure tree (disabled by default). |
@@ -173,29 +173,17 @@ Deprecated.
173173
### CLI
174174

175175
```bash
176-
npx @opendataloader/pdf path/to/document.pdf path/to/folder -o path/to/output -f json html pdf markdown
177-
```
178-
179-
Or install globally:
180-
181-
```bash
182-
npm install -g @opendataloader/pdf
183-
```
184-
185-
Then run:
186-
187-
```bash
188-
opendataloader-pdf path/to/document.pdf path/to/folder -o path/to/output -f json html pdf markdown
176+
npx @opendataloader/pdf path/to/document.pdf path/to/folder -o path/to/output -f json,html,pdf,markdown
189177
```
190178

191179
#### Available options
192180

193181
```
194182
-o, --output-dir <path> Directory where outputs are written
195183
-p, --password <password> Password for encrypted PDFs
196-
-f, --format <values> Comma-separated output formats to generate (json, text, html, pdf, markdown, markdown-with-html, markdown-with-images)
184+
-f, --format <values> Comma-separated output formats to generate. (json, text, html, pdf, markdown, markdown-with-html, markdown-with-images)
197185
-q, --quiet Suppress CLI logging output
198-
--content-safety-off <modes> Disable one or more content safety filters (all, hidden-text, off-page, tiny, hidden-ocg)
186+
--content-safety-off <modes> Comma-separated content safety filters to disable. (all, hidden-text, off-page, tiny, hidden-ocg)
199187
--keep-line-breaks Preserve line breaks in text output
200188
--replace-invalid-chars <c> Replacement character for invalid or unrecognized characters
201189
-h, --help Show usage information
@@ -222,7 +210,7 @@ Check for the latest version on [Maven Central](https://search.maven.org/artifac
222210
<dependency>
223211
<groupId>org.opendataloader</groupId>
224212
<artifactId>opendataloader-pdf-core</artifactId>
225-
<version>1.1.2</version>
213+
<version>1.3.0</version>
226214
</dependency>
227215
</dependencies>
228216

@@ -297,7 +285,7 @@ curl -L -o 1901.03003.pdf https://arxiv.org/pdf/1901.03003
297285
Run opendataloader-pdf in a Docker container
298286

299287
```
300-
docker run --rm -v "$PWD":/work ghcr.io/opendataloader-project/opendataloader-pdf-cli:latest /work/1901.03003.pdf --markdown --html --pdf
288+
docker run --rm -v "$PWD":/work ghcr.io/opendataloader-project/opendataloader-pdf-cli:latest /work/1901.03003.pdf -f json,html,pdf,markdown
301289
```
302290

303291
<br/>
@@ -343,9 +331,9 @@ The images are extracted from PDF as individual files and stored in a subfolder
343331
Options:
344332
-o,--output-dir <arg> Specifies the output directory for generated files
345333
-p,--password <arg> Specifies the password for an encrypted PDF
346-
-f,--format <arg> Comma-separated list of output formats to generate (json, text, html, pdf, markdown, markdown-with-html, markdown-with-images). Default: json
334+
-f,--format <arg> Comma-separated output formats to generate. (json, text, html, pdf, markdown, markdown-with-html, markdown-with-images)
347335
-q,--quiet Suppresses console logging output
348-
--content-safety-off <arg> Disables one or more content safety filters. Accepts a comma-separated list of filter names. Arguments: all, hidden-text, off-page, tiny, hidden-ocg
336+
--content-safety-off <arg> Comma-separated content safety filters to disable. (all, hidden-text, off-page, tiny, hidden-ocg)
349337
--keep-line-breaks Preserves original line breaks in the extracted text
350338
--replace-invalid-chars <arg> Replaces invalid or unrecognized characters (e.g., �, \u0000) with the specified character
351339
--use-struct-tree Enables processing structure tree (disabled by default)

node/opendataloader-pdf/src/index.ts

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -148,9 +148,9 @@ export function run(inputPath: string, options: RunOptions = {}): Promise<string
148148
export interface ConvertOptions {
149149
outputDir?: string;
150150
password?: string;
151-
format?: string;
151+
format?: string | string[];
152152
quiet?: boolean;
153-
contentSafetyOff?: string;
153+
contentSafetyOff?: string | string[];
154154
keepLineBreaks?: boolean;
155155
replaceInvalidChars?: string;
156156
useStructTree?: boolean;
@@ -175,13 +175,21 @@ export function convert(inputPaths: string[], options: ConvertOptions = {}): Pro
175175
args.push('--password', options.password);
176176
}
177177
if (options.format) {
178-
args.push('--format', options.format);
178+
if (Array.isArray(options.format)) {
179+
args.push('--format', options.format.join(','));
180+
} else {
181+
args.push('--format', options.format);
182+
}
179183
}
180184
if (options.quiet) {
181185
args.push('--quiet');
182186
}
183187
if (options.contentSafetyOff) {
184-
args.push('--content-safety-off', options.contentSafetyOff);
188+
if (Array.isArray(options.contentSafetyOff)) {
189+
args.push('--content-safety-off', options.contentSafetyOff.join(','));
190+
} else {
191+
args.push('--content-safety-off', options.contentSafetyOff);
192+
}
185193
}
186194
if (options.keepLineBreaks) {
187195
args.push('--keep-line-breaks');

python/opendataloader-pdf/src/opendataloader_pdf/wrapper.py

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import importlib.resources as resources
55
import locale
66
from pathlib import Path
7-
from typing import List, Optional
7+
from typing import List, Optional, Union
88

99
# The consistent name of the JAR file bundled with the package
1010
_JAR_NAME = "opendataloader-pdf-cli.jar"
@@ -86,9 +86,9 @@ def convert(
8686
input_path: List[str],
8787
output_dir: Optional[str] = None,
8888
password: Optional[str] = None,
89-
format: Optional[str] = None,
89+
format: Optional[Union[str, List[str]]] = None,
9090
quiet: bool = False,
91-
content_safety_off: Optional[str] = None,
91+
content_safety_off: Optional[Union[str, List[str]]] = None,
9292
keep_line_breaks: bool = False,
9393
replace_invalid_chars: Optional[str] = None,
9494
use_struct_tree: bool = False,
@@ -100,9 +100,9 @@ def convert(
100100
input_path: One or more input PDF file paths or directories
101101
output_dir: Directory where outputs are written
102102
password: Password for encrypted PDFs
103-
format: List of output formats (e.g., ["json", "html"])
103+
format: Comma-separated output formats to generate (json, text, html, pdf, markdown, markdown-with-html, markdown-with-images)
104104
quiet: Suppress CLI logging output
105-
content_safety_off: List of content safety filters to disable
105+
content_safety_off: Disable one or more content safety filters (all, hidden-text, off-page, tiny, hidden-ocg)
106106
keep_line_breaks: Preserve line breaks in text output
107107
replace_invalid_chars: Replacement character for invalid/unrecognized characters
108108
use_struct_tree: Enable processing structure tree (disabled by default)
@@ -114,17 +114,23 @@ def convert(
114114
if password:
115115
args.extend(["--password", password])
116116
if format:
117-
args.extend(["--format", format])
117+
if isinstance(format, list):
118+
args.extend(["--format", ",".join(format)])
119+
else:
120+
args.extend(["--format", format])
118121
if quiet:
119-
args.append("--quiet")
122+
args.extend(["--quiet"])
120123
if content_safety_off:
121-
args.extend(["--content-safety-off", content_safety_off])
124+
if isinstance(content_safety_off, list):
125+
args.extend(["--content-safety-off", ",".join(content_safety_off)])
126+
else:
127+
args.extend(["--content-safety-off", content_safety_off])
122128
if keep_line_breaks:
123-
args.append("--keep-line-breaks")
129+
args.extend(["--keep-line-breaks"])
124130
if replace_invalid_chars:
125131
args.extend(["--replace-invalid-chars", replace_invalid_chars])
126132
if use_struct_tree:
127-
args.extend("--use-struct-tree")
133+
args.extend(["--use-struct-tree"])
128134

129135
# Run the command
130136
run_jar(args, quiet)
@@ -207,7 +213,7 @@ def main(argv=None) -> int:
207213
parser.add_argument(
208214
"-f",
209215
"--format",
210-
help="Comma-separated output format(s) to generate.",
216+
help="Comma-separated output formats to generate. (json, text, html, pdf, markdown, markdown-with-html, markdown-with-images)",
211217
)
212218
parser.add_argument(
213219
"-q",
@@ -217,7 +223,7 @@ def main(argv=None) -> int:
217223
)
218224
parser.add_argument(
219225
"--content-safety-off",
220-
help="Comma-separated content safety filters to disable.",
226+
help="Comma-separated content safety filters to disable. (all, hidden-text, off-page, tiny, hidden-ocg)",
221227
)
222228
parser.add_argument(
223229
"--keep-line-breaks",

0 commit comments

Comments
 (0)