From cd6111ffac3221731ba2315ddd3a4f6ea30e6df6 Mon Sep 17 00:00:00 2001 From: tallison Date: Wed, 17 Dec 2025 13:18:55 -0500 Subject: [PATCH 1/5] TIKA-4582 -- checkpoint --- ...AbstractRecursiveParserWrapperHandler.java | 6 - .../tika/sax/BasicContentHandlerFactory.java | 101 +++++++++++- .../tika/sax/ContentHandlerFactory.java | 22 ++- .../example/PickBestTextEncodingParser.java | 5 - .../tika/example/PipesForkParserExample.java | 6 +- .../pipes/kafka/tests/TikaPipesKafkaTest.java | 4 +- .../opensearch/tests/OpenSearchTest.java | 18 +-- .../pipes/s3/tests/S3PipeIntegrationTest.java | 3 +- .../solr/tests/TikaPipesSolrTestBase.java | 8 +- .../apache/tika/async/cli/TikaAsyncCLI.java | 7 +- .../tika/async/cli/AsyncProcessorTest.java | 15 -- .../apache/tika/pipes/api/HandlerConfig.java | 149 ------------------ .../org/apache/tika/pipes/api/ParseMode.java | 67 ++++++++ .../PipesIteratorBaseConfig.java | 37 ----- .../pipesiterator/PipesIteratorConfig.java | 21 --- .../apache/tika/pipes/core/PipesConfig.java | 59 +++++++ .../tika/pipes/core/server/ParseHandler.java | 56 ++++--- .../tika/pipes/core/server/PipesWorker.java | 8 +- .../serialization/JsonFetchEmitTupleTest.java | 34 ++-- .../tika/pipes/fork/PipesForkParser.java | 11 +- .../pipes/fork/PipesForkParserConfig.java | 48 ++++-- .../tika/pipes/fork/PipesForkParserTest.java | 20 +-- .../pipesiterator/PipesIteratorBase.java | 10 ++ .../pipesiterator/PipesIteratorConfig.java | 61 +++++++ .../iterator/azblob/AZBlobPipesIterator.java | 11 +- .../azblob/AZBlobPipesIteratorConfig.java | 24 ++- .../pipes/iterator/csv/CSVPipesIterator.java | 29 ++-- .../iterator/csv/CSVPipesIteratorConfig.java | 24 ++- .../iterator/fs/FileSystemPipesIterator.java | 12 +- .../fs/FileSystemPipesIteratorConfig.java | 23 ++- .../pipes/iterator/gcs/GCSPipesIterator.java | 13 +- .../iterator/gcs/GCSPipesIteratorConfig.java | 24 ++- .../iterator/jdbc/JDBCPipesIterator.java | 33 ++-- .../jdbc/JDBCPipesIteratorConfig.java | 24 ++- 
.../json/JsonPipesIteratorConfig.java | 24 ++- .../iterator/kafka/KafkaPipesIterator.java | 11 +- .../kafka/KafkaPipesIteratorConfig.java | 24 ++- .../pipes/iterator/s3/S3PipesIterator.java | 13 +- .../iterator/s3/S3PipesIteratorConfig.java | 24 ++- .../iterator/solr/SolrPipesIterator.java | 12 +- .../solr/SolrPipesIteratorConfig.java | 24 ++- .../tika/config/loader/ComponentRegistry.java | 2 - .../tika/config/loader/ConfigLoader.java | 16 +- .../apache/tika/config/loader/TikaLoader.java | 4 +- .../resource/RecursiveMetadataResource.java | 30 ++-- .../server/core/resource/TikaResource.java | 4 +- .../tika/server/core/TikaPipesTest.java | 8 +- .../core/TikaServerAsyncIntegrationTest.java | 9 +- .../core/TikaServerPipesIntegrationTest.java | 13 +- .../RecursiveMetadataResourceTest.java | 4 +- .../tika/server/standard/TikaPipesTest.java | 10 +- 51 files changed, 619 insertions(+), 606 deletions(-) delete mode 100644 tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/HandlerConfig.java create mode 100644 tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/ParseMode.java delete mode 100644 tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/pipesiterator/PipesIteratorBaseConfig.java delete mode 100644 tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/pipesiterator/PipesIteratorConfig.java create mode 100644 tika-pipes/tika-pipes-iterator-commons/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIteratorConfig.java diff --git a/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java b/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java index 850ceb4147c..d607c28142e 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java @@ -16,9 +16,7 @@ */ package org.apache.tika.sax; -import java.io.OutputStream; import 
java.io.Serializable; -import java.nio.charset.Charset; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; @@ -59,10 +57,6 @@ public ContentHandler getNewContentHandler() { return contentHandlerFactory.getNewContentHandler(); } - public ContentHandler getNewContentHandler(OutputStream os, Charset charset) { - return contentHandlerFactory.getNewContentHandler(os, charset); - } - /** * This is called before parsing each embedded document. Override this * for custom behavior. Make sure to call this in your custom classes diff --git a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java index 361b7817c72..2f4ca1d35e9 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java +++ b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java @@ -29,16 +29,25 @@ import org.apache.tika.parser.ParseContext; /** - * Basic factory for creating common types of ContentHandlers + * Basic factory for creating common types of ContentHandlers. + *

+ * Implements {@link StreamingContentHandlerFactory} to support both in-memory + * content extraction and streaming output to an OutputStream. */ -public class BasicContentHandlerFactory implements ContentHandlerFactory, WriteLimiter { +public class BasicContentHandlerFactory implements StreamingContentHandlerFactory, WriteLimiter { - private final HANDLER_TYPE type; - private final int writeLimit; + private HANDLER_TYPE type = HANDLER_TYPE.TEXT; + private int writeLimit = -1; + private boolean throwOnWriteLimitReached = true; + private int maxEmbeddedResources = -1; + private transient ParseContext parseContext; - private final boolean throwOnWriteLimitReached; - - private final ParseContext parseContext; + /** + * No-arg constructor for bean-style configuration (e.g., Jackson deserialization). + * Creates a factory with TEXT handler type, unlimited write, and throwOnWriteLimitReached=true. + */ + public BasicContentHandlerFactory() { + } /** * Create a BasicContentHandlerFactory with {@link #throwOnWriteLimitReached} is true @@ -70,7 +79,29 @@ public BasicContentHandlerFactory(HANDLER_TYPE type, int writeLimit, throw new IllegalArgumentException("parse context must not be null if " + "throwOnWriteLimitReached is false"); } + } + /** + * Full constructor with all parameters including maxEmbeddedResources. 
+ * + * @param type basic type of handler + * @param writeLimit maximum number of characters to store; -1 for unlimited + * @param throwOnWriteLimitReached whether to throw when write limit is reached + * @param maxEmbeddedResources maximum number of embedded resources to process; -1 for unlimited + * @param parseContext to store warnings if throwOnWriteLimitReached is false + */ + public BasicContentHandlerFactory(HANDLER_TYPE type, int writeLimit, + boolean throwOnWriteLimitReached, int maxEmbeddedResources, + ParseContext parseContext) { + this.type = type; + this.writeLimit = writeLimit; + this.throwOnWriteLimitReached = throwOnWriteLimitReached; + this.maxEmbeddedResources = maxEmbeddedResources; + this.parseContext = parseContext; + if (throwOnWriteLimitReached == false && parseContext == null) { + throw new IllegalArgumentException("parse context must not be null if " + + "throwOnWriteLimitReached is false"); + } } /** @@ -191,6 +222,22 @@ public HANDLER_TYPE getType() { return type; } + /** + * Sets the handler type. + * @param type the handler type + */ + public void setType(HANDLER_TYPE type) { + this.type = type; + } + + /** + * Sets the handler type from a string. + * @param type the handler type name (text, html, xml, body, ignore) + */ + public void setType(String type) { + this.type = parseHandlerType(type, HANDLER_TYPE.TEXT); + } + /** * Common handler types for content. */ @@ -203,8 +250,48 @@ public int getWriteLimit() { return writeLimit; } + /** + * Sets the write limit. + * @param writeLimit max characters to extract; -1 for unlimited + */ + public void setWriteLimit(int writeLimit) { + this.writeLimit = writeLimit; + } + @Override public boolean isThrowOnWriteLimitReached() { return throwOnWriteLimitReached; } + + /** + * Sets whether to throw an exception when write limit is reached. 
+ * @param throwOnWriteLimitReached true to throw, false to silently stop + */ + public void setThrowOnWriteLimitReached(boolean throwOnWriteLimitReached) { + this.throwOnWriteLimitReached = throwOnWriteLimitReached; + } + + /** + * Gets the maximum number of embedded resources to process. + * @return max embedded resources; -1 for unlimited + */ + public int getMaxEmbeddedResources() { + return maxEmbeddedResources; + } + + /** + * Sets the maximum number of embedded resources to process. + * @param maxEmbeddedResources max embedded resources; -1 for unlimited + */ + public void setMaxEmbeddedResources(int maxEmbeddedResources) { + this.maxEmbeddedResources = maxEmbeddedResources; + } + + /** + * Sets the parse context for storing warnings when throwOnWriteLimitReached is false. + * @param parseContext the parse context + */ + public void setParseContext(ParseContext parseContext) { + this.parseContext = parseContext; + } } diff --git a/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java index dc2f3384fcf..1022e3f4ca3 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java +++ b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java @@ -16,19 +16,27 @@ */ package org.apache.tika.sax; - -import java.io.OutputStream; import java.io.Serializable; -import java.nio.charset.Charset; import org.xml.sax.ContentHandler; /** - * Interface to allow easier injection of code for getting a new ContentHandler + * Factory interface for creating ContentHandler instances. + *

+ * This is the base interface used by tika-pipes, RecursiveParserWrapper, and other + * components that need to create content handlers for in-memory content extraction. + *

+ * For streaming output to an OutputStream, see {@link StreamingContentHandlerFactory}. + * + * @see StreamingContentHandlerFactory + * @see BasicContentHandlerFactory */ public interface ContentHandlerFactory extends Serializable { - ContentHandler getNewContentHandler(); - - ContentHandler getNewContentHandler(OutputStream os, Charset charset); + /** + * Creates a new ContentHandler for extracting content. + * + * @return a new ContentHandler instance + */ + ContentHandler getNewContentHandler(); } diff --git a/tika-example/src/main/java/org/apache/tika/example/PickBestTextEncodingParser.java b/tika-example/src/main/java/org/apache/tika/example/PickBestTextEncodingParser.java index 4796401ebf8..4afc9122903 100644 --- a/tika-example/src/main/java/org/apache/tika/example/PickBestTextEncodingParser.java +++ b/tika-example/src/main/java/org/apache/tika/example/PickBestTextEncodingParser.java @@ -164,11 +164,6 @@ public ContentHandler getNewContentHandler() { } return handler; } - - @Override - public ContentHandler getNewContentHandler(OutputStream os, Charset charset) { - return getNewContentHandler(); - } } protected class CharsetTester { diff --git a/tika-example/src/main/java/org/apache/tika/example/PipesForkParserExample.java b/tika-example/src/main/java/org/apache/tika/example/PipesForkParserExample.java index e4439b801f6..4b69d10afa0 100644 --- a/tika-example/src/main/java/org/apache/tika/example/PipesForkParserExample.java +++ b/tika-example/src/main/java/org/apache/tika/example/PipesForkParserExample.java @@ -26,7 +26,7 @@ import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; -import org.apache.tika.pipes.api.HandlerConfig; +import org.apache.tika.pipes.api.ParseMode; import org.apache.tika.pipes.core.PipesException; import org.apache.tika.pipes.fork.PipesForkParser; import org.apache.tika.pipes.fork.PipesForkParserConfig; @@ -277,7 +277,7 @@ public void 
parseWithMetadata(Path filePath) public void parseEmbeddedDocumentsRmeta(Path filePath) throws IOException, InterruptedException, TikaException, PipesException { PipesForkParserConfig config = new PipesForkParserConfig() - .setParseMode(HandlerConfig.PARSE_MODE.RMETA); + .setParseMode(ParseMode.RMETA); try (PipesForkParser parser = new PipesForkParser(config); TikaInputStream tis = TikaInputStream.get(filePath)) { @@ -334,7 +334,7 @@ public void parseEmbeddedDocumentsRmeta(Path filePath) public void parseEmbeddedDocumentsConcatenate(Path filePath) throws IOException, InterruptedException, TikaException, PipesException { PipesForkParserConfig config = new PipesForkParserConfig() - .setParseMode(HandlerConfig.PARSE_MODE.CONCATENATE); + .setParseMode(ParseMode.CONCATENATE); try (PipesForkParser parser = new PipesForkParser(config); TikaInputStream tis = TikaInputStream.get(filePath)) { diff --git a/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/java/org/apache/tika/pipes/kafka/tests/TikaPipesKafkaTest.java b/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/java/org/apache/tika/pipes/kafka/tests/TikaPipesKafkaTest.java index e1b32ceb259..cdfb7391b99 100644 --- a/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/java/org/apache/tika/pipes/kafka/tests/TikaPipesKafkaTest.java +++ b/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/java/org/apache/tika/pipes/kafka/tests/TikaPipesKafkaTest.java @@ -64,7 +64,7 @@ import org.apache.tika.cli.TikaCLI; import org.apache.tika.config.JsonConfigHelper; -import org.apache.tika.pipes.api.HandlerConfig; +import org.apache.tika.pipes.api.ParseMode; import org.apache.tika.utils.SystemUtils; /** @@ -220,7 +220,7 @@ private Path getTikaConfig(Path pipesDirectory, Path testFileFolderPath) throws replacements.put("EMITTER_TOPIC", EMITTER_TOPIC); replacements.put("BOOTSTRAP_SERVERS", kafka.getBootstrapServers()); replacements.put("FETCHER_BASE_PATH", testFileFolderPath); 
- replacements.put("PARSE_MODE", HandlerConfig.PARSE_MODE.RMETA.name()); + replacements.put("PARSE_MODE", ParseMode.RMETA.name()); replacements.put("LOG4J_JVM_ARG", "-Dlog4j.configurationFile=" + log4jPropFile.toAbsolutePath()); JsonConfigHelper.writeConfigFromResource("/kafka/plugins-template.json", diff --git a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpenSearchTest.java b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpenSearchTest.java index 4a4b75f8a30..65a72ab2f73 100644 --- a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpenSearchTest.java +++ b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpenSearchTest.java @@ -53,7 +53,7 @@ import org.apache.tika.exception.TikaConfigException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; -import org.apache.tika.pipes.api.HandlerConfig; +import org.apache.tika.pipes.api.ParseMode; import org.apache.tika.pipes.api.emitter.Emitter; import org.apache.tika.pipes.core.emitter.EmitterManager; import org.apache.tika.pipes.emitter.opensearch.HttpClientConfig; @@ -96,7 +96,7 @@ public void testPluginsConfig(@TempDir Path pipesDirectory) throws Exception { Path pluginsConfg = getPluginsConfig( pipesDirectory, OpenSearchEmitterConfig.AttachmentStrategy.PARENT_CHILD, OpenSearchEmitterConfig.UpdateStrategy.OVERWRITE, - HandlerConfig.PARSE_MODE.RMETA, "https://opensearch", Paths.get("testDocs")); + ParseMode.RMETA, "https://opensearch", Paths.get("testDocs")); // PipesReporter reporter = ReporterManager.load(pluginsConfg); // System.out.println(reporter); // PipesIterator pipesIterator = PipesIteratorManager.load(pluginsConfg); @@ -113,7 +113,7 @@ public void testBasicFSToOpenSearch(@TempDir Path pipesDirectory, 
@TempDir Path sendMappings(client, endpoint, TEST_INDEX, "opensearch-mappings.json"); runPipes(client, OpenSearchEmitterConfig.AttachmentStrategy.SEPARATE_DOCUMENTS, - OpenSearchEmitterConfig.UpdateStrategy.UPSERT, HandlerConfig.PARSE_MODE.CONCATENATE, endpoint, + OpenSearchEmitterConfig.UpdateStrategy.UPSERT, ParseMode.CONCATENATE, endpoint, pipesDirectory, testDocDirectory); String query = "{ \"track_total_hits\": true, \"query\": { \"match\": { \"content\": { " + @@ -182,7 +182,7 @@ public void testParentChildFSToOpenSearch(@TempDir Path pipesDirectory, @TempDir runPipes(client, OpenSearchEmitterConfig.AttachmentStrategy.PARENT_CHILD, OpenSearchEmitterConfig.UpdateStrategy.OVERWRITE, - HandlerConfig.PARSE_MODE.RMETA, endpoint, pipesDirectory, testDocDirectory); + ParseMode.RMETA, endpoint, pipesDirectory, testDocDirectory); String query = "{ \"track_total_hits\": true, \"from\":0, \"size\": 10000, \"query\": { \"match\": { \"content\": { " + "\"query\": \"happiness\" } } } }"; @@ -250,7 +250,7 @@ public void testSeparateDocsFSToOpenSearch(@TempDir Path pipesDirectory, @TempDi runPipes(client, OpenSearchEmitterConfig.AttachmentStrategy.SEPARATE_DOCUMENTS, OpenSearchEmitterConfig.UpdateStrategy.OVERWRITE, - HandlerConfig.PARSE_MODE.RMETA, endpoint, + ParseMode.RMETA, endpoint, pipesDirectory, testDocDirectory); String query = "{ \"track_total_hits\": true, \"query\": { \"match\": { \"content\": { " + @@ -316,7 +316,7 @@ public void testUpsertSeparateDocsFSToOpenSearch(@TempDir Path pipesDirectory, @ runPipes(client, OpenSearchEmitterConfig.AttachmentStrategy.SEPARATE_DOCUMENTS, OpenSearchEmitterConfig.UpdateStrategy.UPSERT, - HandlerConfig.PARSE_MODE.RMETA, endpoint, pipesDirectory, testDocDirectory); + ParseMode.RMETA, endpoint, pipesDirectory, testDocDirectory); String query = "{ \"track_total_hits\": true, \"query\": { \"match\": { \"content\": { " + "\"query\": \"happiness\" } } } }"; @@ -376,7 +376,7 @@ public void testUpsert(@TempDir Path pipesDirectory, 
@TempDir Path testDocDirect String endpoint = CONTAINER.getHttpHostAddress() + "/" + TEST_INDEX; sendMappings(client, endpoint, TEST_INDEX, "opensearch-mappings.json"); Path pluginsConfigFile = getPluginsConfig(pipesDirectory, OpenSearchEmitterConfig.AttachmentStrategy.SEPARATE_DOCUMENTS, - OpenSearchEmitterConfig.UpdateStrategy.UPSERT, HandlerConfig.PARSE_MODE.RMETA, + OpenSearchEmitterConfig.UpdateStrategy.UPSERT, ParseMode.RMETA, endpoint, testDocDirectory); TikaJsonConfig tikaJsonConfig = TikaJsonConfig.load(pluginsConfigFile); @@ -448,7 +448,7 @@ protected void sendMappings(OpensearchTestClient client, String endpoint, String private void runPipes(OpensearchTestClient client, OpenSearchEmitterConfig.AttachmentStrategy attachmentStrategy, OpenSearchEmitterConfig.UpdateStrategy updateStrategy, - HandlerConfig.PARSE_MODE parseMode, String endpoint, Path pipesDirectory, Path testDocDirectory) throws Exception { + ParseMode parseMode, String endpoint, Path pipesDirectory, Path testDocDirectory) throws Exception { Path pluginsConfig = getPluginsConfig(pipesDirectory, attachmentStrategy, updateStrategy, parseMode, endpoint, testDocDirectory); @@ -464,7 +464,7 @@ private void runPipes(OpensearchTestClient client, OpenSearchEmitterConfig.Attac @NotNull private Path getPluginsConfig(Path pipesDirectory, OpenSearchEmitterConfig.AttachmentStrategy attachmentStrategy, OpenSearchEmitterConfig.UpdateStrategy updateStrategy, - HandlerConfig.PARSE_MODE parseMode, String endpoint, Path testDocDirectory) throws IOException { + ParseMode parseMode, String endpoint, Path testDocDirectory) throws IOException { Path tikaConfig = pipesDirectory.resolve("plugins-config.json"); Path log4jPropFile = pipesDirectory.resolve("log4j2.xml"); diff --git a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/S3PipeIntegrationTest.java 
b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/S3PipeIntegrationTest.java index 92b3c6b2479..888396343fa 100644 --- a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/S3PipeIntegrationTest.java +++ b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/S3PipeIntegrationTest.java @@ -55,6 +55,7 @@ import org.apache.tika.cli.TikaCLI; import org.apache.tika.config.JsonConfigHelper; +import org.apache.tika.pipes.api.ParseMode; @TestInstance(TestInstance.Lifecycle.PER_CLASS) @Testcontainers(disabledWithoutDocker = true) @@ -140,7 +141,7 @@ void s3PipelineIteratorS3FetcherAndS3Emitter() throws Exception { // Create plugins config JSON Map replacements = new HashMap<>(); replacements.put("LOG4J_JVM_ARG", "-Dlog4j.configurationFile=" + log4jPropFile.toAbsolutePath()); - replacements.put("PARSE_MODE", org.apache.tika.pipes.api.HandlerConfig.PARSE_MODE.RMETA.name()); + replacements.put("PARSE_MODE", ParseMode.RMETA.name()); replacements.put("PIPE_ITERATOR_BUCKET", FETCH_BUCKET); replacements.put("EMIT_BUCKET", EMIT_BUCKET); replacements.put("FETCH_BUCKET", FETCH_BUCKET); diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java index fb195df8562..0fea4b0cd07 100644 --- a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java +++ b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java @@ -48,7 +48,7 @@ import org.apache.tika.cli.TikaCLI; import org.apache.tika.config.JsonConfigHelper; -import org.apache.tika.pipes.api.HandlerConfig; +import 
org.apache.tika.pipes.api.ParseMode; import org.apache.tika.pipes.emitter.solr.SolrEmitterConfig; import org.apache.tika.utils.SystemUtils; @@ -210,7 +210,7 @@ protected void runTikaAsyncSolrPipeIteratorFileFetcherSolrEmitter(Path pipesDire Path tikaConfigFile = getTikaConfig(pipesDirectory, SolrEmitterConfig.UpdateStrategy.ADD, SolrEmitterConfig.AttachmentStrategy.PARENT_CHILD, - HandlerConfig.PARSE_MODE.RMETA); + ParseMode.RMETA); TikaCLI.main(new String[]{"-a", "-c", tikaConfigFile.toAbsolutePath().toString()}); @@ -244,7 +244,7 @@ protected void runTikaAsyncSolrPipeIteratorFileFetcherSolrEmitter(Path pipesDire tikaConfigFile = getTikaConfig(pipesDirectory, SolrEmitterConfig.UpdateStrategy.UPDATE_MUST_EXIST, SolrEmitterConfig.AttachmentStrategy.PARENT_CHILD, - HandlerConfig.PARSE_MODE.RMETA); + ParseMode.RMETA); TikaCLI.main(new String[]{"-a", "-c", tikaConfigFile.toAbsolutePath().toString()}); @@ -263,7 +263,7 @@ protected void runTikaAsyncSolrPipeIteratorFileFetcherSolrEmitter(Path pipesDire private Path getTikaConfig(Path pipesDirectory, SolrEmitterConfig.UpdateStrategy updateStrategy, SolrEmitterConfig.AttachmentStrategy attachmentStrategy, - HandlerConfig.PARSE_MODE parseMode) throws IOException { + ParseMode parseMode) throws IOException { Path tikaConfig = pipesDirectory.resolve("plugins-config.json"); Path log4jPropFile = pipesDirectory.resolve("log4j2.xml"); diff --git a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java index 6576c904ea2..15586c526cf 100644 --- a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java +++ b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java @@ -37,7 +37,6 @@ import org.apache.tika.exception.TikaConfigException; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; -import 
org.apache.tika.pipes.api.HandlerConfig; import org.apache.tika.pipes.api.emitter.EmitKey; import org.apache.tika.pipes.api.fetcher.FetchKey; import org.apache.tika.pipes.api.pipesiterator.PipesIterator; @@ -47,6 +46,7 @@ import org.apache.tika.plugins.ExtensionConfig; import org.apache.tika.plugins.TikaPluginManager; import org.apache.tika.sax.BasicContentHandlerFactory; +import org.apache.tika.sax.ContentHandlerFactory; import org.apache.tika.utils.StringUtils; public class TikaAsyncCLI { @@ -290,9 +290,8 @@ private static void configureHandler(FetchEmitTuple t, SimpleAsyncConfig asyncCo if (asyncConfig.getHandlerType() == BasicContentHandlerFactory.HANDLER_TYPE.TEXT) { return; } - HandlerConfig handlerConfig = new HandlerConfig(asyncConfig.getHandlerType(), HandlerConfig.PARSE_MODE.RMETA, - -1, -1, false); - t.getParseContext().set(HandlerConfig.class, handlerConfig); + ContentHandlerFactory factory = new BasicContentHandlerFactory(asyncConfig.getHandlerType(), -1); + t.getParseContext().set(ContentHandlerFactory.class, factory); } private static void configureExtractBytes(FetchEmitTuple t, SimpleAsyncConfig asyncConfig) { diff --git a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java index 4bd181699e6..6d26b6dd0fa 100644 --- a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java +++ b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java @@ -17,7 +17,6 @@ package org.apache.tika.async.cli; -import static org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig.DEFAULT_HANDLER_CONFIG; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -44,7 +43,6 @@ import org.apache.tika.metadata.TikaCoreProperties; import 
org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; -import org.apache.tika.pipes.api.HandlerConfig; import org.apache.tika.pipes.api.emitter.EmitKey; import org.apache.tika.pipes.api.fetcher.FetchKey; import org.apache.tika.pipes.api.pipesiterator.PipesIterator; @@ -112,8 +110,6 @@ public void setUp() throws Exception { @Test public void testRecursiveUnpacking() throws Exception { -// TikaAsyncCLI cli = new TikaAsyncCLI(); - // cli.main(new String[]{ configDir.resolve("tika-config.xml").toAbsolutePath().toString()}); AsyncProcessor processor = AsyncProcessor.load(configDir.resolve("tika-config.json")); EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig = new EmbeddedDocumentBytesConfig(true); @@ -122,7 +118,6 @@ public void testRecursiveUnpacking() throws Exception { embeddedDocumentBytesConfig.setSuffixStrategy(EmbeddedDocumentBytesConfig.SUFFIX_STRATEGY.NONE); embeddedDocumentBytesConfig.setEmbeddedIdPrefix("-"); ParseContext parseContext = new ParseContext(); - parseContext.set(HandlerConfig.class, DEFAULT_HANDLER_CONFIG); parseContext.set(EmbeddedDocumentBytesConfig.class, embeddedDocumentBytesConfig); FetchEmitTuple t = new FetchEmitTuple("myId-1", new FetchKey("fsf", "mock.xml"), @@ -133,7 +128,6 @@ public void testRecursiveUnpacking() throws Exception { for (int i = 0; i < 10; i++) { processor.offer(PipesIterator.COMPLETED_SEMAPHORE, 1000); } - //TODO clean this up while (processor.checkActive()) { Thread.sleep(100); } @@ -161,14 +155,9 @@ public void testRecursiveUnpacking() throws Exception { @Test public void testStopsOnApplicationError() throws Exception { - // Test that AsyncProcessor stops processing when an application error occurs - // (TIKA-4570) AsyncProcessor processor = AsyncProcessor.load(configDir.resolve("tika-config.json")); - // Create a tuple with a non-existent fetcher - this will cause FETCHER_NOT_FOUND - // which is a TASK_EXCEPTION but will stop processing in CLI mode (default) ParseContext 
parseContext = new ParseContext(); - parseContext.set(HandlerConfig.class, DEFAULT_HANDLER_CONFIG); FetchEmitTuple badTuple = new FetchEmitTuple( "bad-tuple-1", new FetchKey("non-existent-fetcher", "some-file.txt"), @@ -177,10 +166,8 @@ public void testStopsOnApplicationError() throws Exception { parseContext, FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT); - // Offer the bad tuple processor.offer(badTuple, 1000); - // Wait for the error to be detected int maxWaitMs = 30000; int waited = 0; while (!processor.hasApplicationError() && waited < maxWaitMs) { @@ -188,11 +175,9 @@ public void testStopsOnApplicationError() throws Exception { waited += 100; } - // Verify that the application error was detected assertTrue(processor.hasApplicationError(), "AsyncProcessor should detect application error from bad fetcher"); - // Verify that subsequent offers throw PipesException FetchEmitTuple anotherTuple = new FetchEmitTuple( "another-tuple", new FetchKey("fsf", "mock.xml"), diff --git a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/HandlerConfig.java b/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/HandlerConfig.java deleted file mode 100644 index b336f1a4fcc..00000000000 --- a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/HandlerConfig.java +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.pipes.api; - -import java.io.Serializable; -import java.util.Locale; -import java.util.Objects; - -import org.apache.tika.sax.BasicContentHandlerFactory; - -/** - * Configuration for content handler behavior during parsing. - */ -public class HandlerConfig implements Serializable { - - /** - * {@link PARSE_MODE#RMETA} "recursive metadata" is the same as the -J option - * in tika-app and the /rmeta endpoint in tika-server. Each embedded file is represented as - * its own metadata object. - * - * {@link PARSE_MODE#CONCATENATE} is similar - * to the legacy tika-app behavior and the /tika endpoint (accept: application/json) in - * tika-server. This concatenates the - * contents of embedded files and returns a single metadata object for the file no - * matter how many embedded objects there are; this option throws away metadata from - * embedded objects and silently skips exceptions in embedded objects. - */ - public enum PARSE_MODE { - RMETA, - CONCATENATE; - - public static PARSE_MODE parseMode(String modeString) { - for (PARSE_MODE m : PARSE_MODE.values()) { - if (m.name().equalsIgnoreCase(modeString)) { - return m; - } - } - StringBuilder sb = new StringBuilder(); - int i = 0; - for (PARSE_MODE m : PARSE_MODE.values()) { - if (i++ > 0) { - sb.append(", "); - } - sb.append(m.name().toLowerCase(Locale.US)); - } - throw new IllegalArgumentException("mode must be one of: (" + sb + - "). 
I regret I do not understand: " + modeString); - } - } - BasicContentHandlerFactory.HANDLER_TYPE type = BasicContentHandlerFactory.HANDLER_TYPE.TEXT; - PARSE_MODE parseMode = PARSE_MODE.RMETA; - int writeLimit = -1; - int maxEmbeddedResources = -1; - boolean throwOnWriteLimitReached = true; - - public HandlerConfig() { - - } - - public HandlerConfig(BasicContentHandlerFactory.HANDLER_TYPE type, PARSE_MODE parseMode, int writeLimit, int maxEmbeddedResources, boolean throwOnWriteLimitReached) { - this.type = type; - this.parseMode = parseMode; - this.writeLimit = writeLimit; - this.maxEmbeddedResources = maxEmbeddedResources; - this.throwOnWriteLimitReached = throwOnWriteLimitReached; - } - - public BasicContentHandlerFactory.HANDLER_TYPE getType() { - return type; - } - - public void setType(BasicContentHandlerFactory.HANDLER_TYPE type) { - this.type = type; - } - - public void setType(String typeString) { - this.type = BasicContentHandlerFactory.HANDLER_TYPE.valueOf(typeString); - } - - public PARSE_MODE getParseMode() { - return parseMode; - } - - public void setParseMode(PARSE_MODE parseMode) { - this.parseMode = parseMode; - } - - public void setParseMode(String parseMode) { - this.parseMode = PARSE_MODE.valueOf(parseMode); - } - - public int getWriteLimit() { - return writeLimit; - } - - public void setWriteLimit(int writeLimit) { - this.writeLimit = writeLimit; - } - - public int getMaxEmbeddedResources() { - return maxEmbeddedResources; - } - - public void setMaxEmbeddedResources(int maxEmbeddedResources) { - this.maxEmbeddedResources = maxEmbeddedResources; - } - - public boolean isThrowOnWriteLimitReached() { - return throwOnWriteLimitReached; - } - - public void setThrowOnWriteLimitReached(boolean throwOnWriteLimitReached) { - this.throwOnWriteLimitReached = throwOnWriteLimitReached; - } - - @Override - public final boolean equals(Object o) { - if (!(o instanceof HandlerConfig that)) { - return false; - } - - return writeLimit == that.writeLimit && 
maxEmbeddedResources == that.maxEmbeddedResources && throwOnWriteLimitReached == that.throwOnWriteLimitReached && - type == that.type && parseMode == that.parseMode; - } - - @Override - public int hashCode() { - int result = Objects.hashCode(type); - result = 31 * result + Objects.hashCode(parseMode); - result = 31 * result + writeLimit; - result = 31 * result + maxEmbeddedResources; - result = 31 * result + Boolean.hashCode(throwOnWriteLimitReached); - return result; - } -} diff --git a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/ParseMode.java b/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/ParseMode.java new file mode 100644 index 00000000000..edd82729dad --- /dev/null +++ b/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/ParseMode.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.api; + +import java.util.Locale; + +/** + * Controls how embedded documents are handled during parsing. + *

+ * This can be set as a default in PipesConfig (loaded from tika-config.json) + * or overridden per-file via ParseContext. + */ +public enum ParseMode { + + /** + * Each embedded file gets its own metadata object in a list. + *

+ * This is equivalent to the -J option in tika-app and the /rmeta endpoint + * in tika-server. The result is a list of metadata objects, one for each + * document (container + all embedded documents). + */ + RMETA, + + /** + * Concatenates content from all embedded files into a single document. + *

+ * This is equivalent to the legacy tika-app behavior and the /tika endpoint + * in tika-server. The result is a single metadata object with concatenated + * content from all documents. + */ + CONCATENATE; + + /** + * Parses a string to a ParseMode enum value. + * + * @param modeString the string to parse (case-insensitive) + * @return the corresponding ParseMode + * @throws IllegalArgumentException if the string doesn't match any mode + */ + public static ParseMode parse(String modeString) { + if (modeString == null) { + throw new IllegalArgumentException("Parse mode cannot be null"); + } + String normalized = modeString.toUpperCase(Locale.ROOT).trim(); + try { + return ParseMode.valueOf(normalized); + } catch (IllegalArgumentException e) { + throw new IllegalArgumentException( + "Invalid parse mode: '" + modeString + "'. " + + "Must be one of: RMETA, CONCATENATE"); + } + } +} diff --git a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/pipesiterator/PipesIteratorBaseConfig.java b/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/pipesiterator/PipesIteratorBaseConfig.java deleted file mode 100644 index 021d62e400a..00000000000 --- a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/pipesiterator/PipesIteratorBaseConfig.java +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.pipes.api.pipesiterator; - -import org.apache.tika.pipes.api.FetchEmitTuple; -import org.apache.tika.pipes.api.HandlerConfig; -import org.apache.tika.sax.BasicContentHandlerFactory; - - -public record PipesIteratorBaseConfig(String fetcherId, String emitterId, HandlerConfig handlerConfig, - FetchEmitTuple.ON_PARSE_EXCEPTION onParseException, long maxWaitMs, int queueSize) { - - public static final HandlerConfig DEFAULT_HANDLER_CONFIG = new HandlerConfig(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, HandlerConfig.PARSE_MODE.RMETA, - -1, -1, true); - private static final FetchEmitTuple.ON_PARSE_EXCEPTION DEFAULT_ON_PARSE_EXCEPTION = FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT; - private static final long DEFAULT_MAX_WAIT_MS = 600_000; - private static final int DEFAULT_QUEUE_SIZE = 10000; - - public PipesIteratorBaseConfig(String fetcherId, String emitterId) { - this(fetcherId, emitterId, DEFAULT_HANDLER_CONFIG, DEFAULT_ON_PARSE_EXCEPTION, DEFAULT_MAX_WAIT_MS, DEFAULT_QUEUE_SIZE); - } - -} diff --git a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/pipesiterator/PipesIteratorConfig.java b/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/pipesiterator/PipesIteratorConfig.java deleted file mode 100644 index 09a9ab4abb3..00000000000 --- a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/pipesiterator/PipesIteratorConfig.java +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.pipes.api.pipesiterator; - -public interface PipesIteratorConfig { - PipesIteratorBaseConfig getBaseConfig(); -} diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PipesConfig.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PipesConfig.java index c70f16b1dc7..f5c2622ece7 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PipesConfig.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PipesConfig.java @@ -21,6 +21,8 @@ import org.apache.tika.config.loader.TikaJsonConfig; import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.pipes.api.FetchEmitTuple; +import org.apache.tika.pipes.api.ParseMode; public class PipesConfig { @@ -85,6 +87,17 @@ public class PipesConfig { */ private boolean stopOnlyOnFatal = false; + /** + * Default parse mode for how embedded documents are handled. + * Can be overridden per-file via ParseContext. + */ + private ParseMode parseMode = ParseMode.RMETA; + + /** + * Default behavior when a parse exception occurs. 
+ */ + private FetchEmitTuple.ON_PARSE_EXCEPTION onParseException = FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT; + private ArrayList forkedJvmArgs = new ArrayList<>(); private String javaPath = "java"; @@ -348,4 +361,50 @@ public boolean isStopOnlyOnFatal() { public void setStopOnlyOnFatal(boolean stopOnlyOnFatal) { this.stopOnlyOnFatal = stopOnlyOnFatal; } + + /** + * Gets the default parse mode for how embedded documents are handled. + * + * @return the default parse mode + */ + public ParseMode getParseMode() { + return parseMode; + } + + /** + * Sets the default parse mode for how embedded documents are handled. + * This can be overridden per-file via ParseContext. + * + * @param parseMode the parse mode (RMETA or CONCATENATE) + */ + public void setParseMode(ParseMode parseMode) { + this.parseMode = parseMode; + } + + /** + * Sets the default parse mode from a string. + * + * @param parseMode the parse mode name (rmeta or concatenate) + */ + public void setParseMode(String parseMode) { + this.parseMode = ParseMode.parse(parseMode); + } + + /** + * Gets the default behavior when a parse exception occurs. + * + * @return the parse exception behavior + */ + public FetchEmitTuple.ON_PARSE_EXCEPTION getOnParseException() { + return onParseException; + } + + /** + * Sets the default behavior when a parse exception occurs. 
+ * + * @param onParseException the parse exception behavior + */ + public void setOnParseException(FetchEmitTuple.ON_PARSE_EXCEPTION onParseException) { + this.onParseException = onParseException; + } } diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java index 7e670c63a4f..5ad637bd13f 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java @@ -44,7 +44,7 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.RecursiveParserWrapper; import org.apache.tika.pipes.api.FetchEmitTuple; -import org.apache.tika.pipes.api.HandlerConfig; +import org.apache.tika.pipes.api.ParseMode; import org.apache.tika.pipes.core.extractor.EmbeddedDocumentBytesConfig; import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.ContentHandlerFactory; @@ -79,12 +79,13 @@ PipesWorker.ParseDataOrPipesResult parseWithStream(FetchEmitTuple fetchEmitTuple List metadataList; //this adds the EmbeddedDocumentByteStore to the parsecontext - HandlerConfig handlerConfig = parseContext.get(HandlerConfig.class); - if (handlerConfig.getParseMode() == HandlerConfig.PARSE_MODE.RMETA) { + ParseMode parseMode = getParseMode(parseContext); + ContentHandlerFactory contentHandlerFactory = getContentHandlerFactory(parseContext); + if (parseMode == ParseMode.RMETA) { metadataList = - parseRecursive(fetchEmitTuple, handlerConfig, stream, metadata, parseContext); + parseRecursive(fetchEmitTuple, contentHandlerFactory, stream, metadata, parseContext); } else { - metadataList = parseConcatenated(fetchEmitTuple, handlerConfig, stream, metadata, + metadataList = parseConcatenated(fetchEmitTuple, contentHandlerFactory, stream, metadata, parseContext); } @@ -92,6 +93,24 @@ 
PipesWorker.ParseDataOrPipesResult parseWithStream(FetchEmitTuple fetchEmitTuple parseContext.get(EmbeddedDocumentBytesHandler.class)), null); } + private ParseMode getParseMode(ParseContext parseContext) { + ParseMode mode = parseContext.get(ParseMode.class); + if (mode != null) { + return mode; + } + // Default to RMETA mode + return ParseMode.RMETA; + } + + private ContentHandlerFactory getContentHandlerFactory(ParseContext parseContext) { + ContentHandlerFactory factory = parseContext.get(ContentHandlerFactory.class); + if (factory != null) { + return factory; + } + // Default to BasicContentHandlerFactory with TEXT handler, unlimited write + return new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1); + } + private void _preParse(FetchEmitTuple t, TikaInputStream tis, Metadata metadata, @@ -133,14 +152,16 @@ private Metadata preParse(FetchEmitTuple t, TikaInputStream tis, Metadata metada } public List parseRecursive(FetchEmitTuple fetchEmitTuple, - HandlerConfig handlerConfig, TikaInputStream stream, + ContentHandlerFactory contentHandlerFactory, TikaInputStream stream, Metadata metadata, ParseContext parseContext) throws InterruptedException { //Intentionally do not add the metadata filter here! 
//We need to let stacktraces percolate + int maxEmbeddedResources = -1; + if (contentHandlerFactory instanceof BasicContentHandlerFactory) { + maxEmbeddedResources = ((BasicContentHandlerFactory) contentHandlerFactory).getMaxEmbeddedResources(); + } RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( - new BasicContentHandlerFactory(handlerConfig.getType(), - handlerConfig.getWriteLimit(), handlerConfig.isThrowOnWriteLimitReached(), - parseContext), handlerConfig.getMaxEmbeddedResources()); + contentHandlerFactory, maxEmbeddedResources); long start = System.currentTimeMillis(); @@ -168,25 +189,24 @@ public List parseRecursive(FetchEmitTuple fetchEmitTuple, } public List parseConcatenated(FetchEmitTuple fetchEmitTuple, - HandlerConfig handlerConfig, TikaInputStream stream, + ContentHandlerFactory contentHandlerFactory, TikaInputStream stream, Metadata metadata, ParseContext parseContext) { - ContentHandlerFactory contentHandlerFactory = - new BasicContentHandlerFactory(handlerConfig.getType(), - handlerConfig.getWriteLimit(), handlerConfig.isThrowOnWriteLimitReached(), - parseContext); - ContentHandler handler = contentHandlerFactory.getNewContentHandler(); + int maxEmbedded = -1; + if (contentHandlerFactory instanceof BasicContentHandlerFactory) { + maxEmbedded = ((BasicContentHandlerFactory) contentHandlerFactory).getMaxEmbeddedResources(); + } + final int finalMaxEmbedded = maxEmbedded; parseContext.set(DocumentSelector.class, new DocumentSelector() { - final int maxEmbedded = handlerConfig.getMaxEmbeddedResources(); int embedded = 0; @Override public boolean select(Metadata metadata) { - if (maxEmbedded < 0) { + if (finalMaxEmbedded < 0) { return true; } - return embedded++ < maxEmbedded; + return embedded++ < finalMaxEmbedded; } }); diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java index 
8d15c92a0a0..b7793881274 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java @@ -16,8 +16,6 @@ */ package org.apache.tika.pipes.core.server; -import static org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig.DEFAULT_HANDLER_CONFIG; - import java.io.Closeable; import java.io.IOException; import java.time.Duration; @@ -41,7 +39,6 @@ import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; -import org.apache.tika.pipes.api.HandlerConfig; import org.apache.tika.pipes.api.PipesResult; import org.apache.tika.pipes.core.PipesResults; import org.apache.tika.pipes.core.emitter.EmitterManager; @@ -149,9 +146,8 @@ protected ParseDataOrPipesResult parseFromTuple() throws TikaException, Interrup private ParseContext setupParseContext(FetchEmitTuple fetchEmitTuple) throws TikaException, IOException { ParseContext parseContext = fetchEmitTuple.getParseContext(); - if (parseContext.get(HandlerConfig.class) == null) { - parseContext.set(HandlerConfig.class, DEFAULT_HANDLER_CONFIG); - } + // ContentHandlerFactory and ParseMode are retrieved from ParseContext in ParseHandler. + // They are set in ParseContext from PipesConfig loaded via TikaLoader at startup. EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig = parseContext.get(EmbeddedDocumentBytesConfig.class); if (embeddedDocumentBytesConfig == null) { //make sure there's one here -- or do we make this default in fetchemit tuple? 
diff --git a/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTupleTest.java b/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTupleTest.java index 4168d37a6f2..1650e7d00ad 100644 --- a/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTupleTest.java +++ b/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTupleTest.java @@ -27,10 +27,11 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; -import org.apache.tika.pipes.api.HandlerConfig; +import org.apache.tika.pipes.api.ParseMode; import org.apache.tika.pipes.api.emitter.EmitKey; import org.apache.tika.pipes.api.fetcher.FetchKey; import org.apache.tika.sax.BasicContentHandlerFactory; +import org.apache.tika.sax.ContentHandlerFactory; public class JsonFetchEmitTupleTest { @@ -45,8 +46,11 @@ public void testBasic() throws Exception { ParseContext parseContext = new ParseContext(); - HandlerConfig h = new HandlerConfig(BasicContentHandlerFactory.HANDLER_TYPE.XML, HandlerConfig.PARSE_MODE.CONCATENATE, 10000, 10, true); - parseContext.set(HandlerConfig.class, h); + // Set ContentHandlerFactory and ParseMode in ParseContext + ContentHandlerFactory factory = new BasicContentHandlerFactory( + BasicContentHandlerFactory.HANDLER_TYPE.XML, 10000); + parseContext.set(ContentHandlerFactory.class, factory); + parseContext.set(ParseMode.class, ParseMode.CONCATENATE); FetchEmitTuple t = new FetchEmitTuple("my_id", new FetchKey("my_fetcher", "fetchKey1"), new EmitKey("my_emitter", "emitKey1"), m, parseContext, FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP); @@ -66,12 +70,10 @@ public void testFetchRange() throws Exception { m.add("m2", "v3"); m.add("m3", "v4"); - /** - * TODO -- add this to the ParseContext - * new HandlerConfig(BasicContentHandlerFactory.HANDLER_TYPE.XML, - 
* HandlerConfig.PARSE_MODE.CONCATENATE, - * 10000,10, true), - */ + // TODO -- add this to the ParseContext: + // parseContext.set(ContentHandlerFactory.class, new BasicContentHandlerFactory( + // BasicContentHandlerFactory.HANDLER_TYPE.XML, 10000)); + // parseContext.set(ParseMode.class, ParseMode.CONCATENATE); FetchEmitTuple t = new FetchEmitTuple("my_id", new FetchKey("my_fetcher", "fetchKey1", 10, 1000), new EmitKey("my_emitter", "emitKey1"), m, new ParseContext(), FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP); StringWriter writer = new StringWriter(); @@ -83,14 +85,12 @@ public void testFetchRange() throws Exception { @Test public void testBytes() throws Exception { - /** - * TODO -- add these to the ParseContext - EmbeddedDocumentBytesConfig bytesConfig = new EmbeddedDocumentBytesConfig(true); - bytesConfig.setEmitter("emitter"); - * new HandlerConfig(BasicContentHandlerFactory.HANDLER_TYPE.XML, - * HandlerConfig.PARSE_MODE.CONCATENATE, - * 10000,10, true) - */ + // TODO -- add these to the ParseContext: + // EmbeddedDocumentBytesConfig bytesConfig = new EmbeddedDocumentBytesConfig(true); + // bytesConfig.setEmitter("emitter"); + // parseContext.set(ContentHandlerFactory.class, new BasicContentHandlerFactory( + // BasicContentHandlerFactory.HANDLER_TYPE.XML, 10000)); + // parseContext.set(ParseMode.class, ParseMode.CONCATENATE); FetchEmitTuple t = new FetchEmitTuple("my_id", new FetchKey("my_fetcher", "fetchKey1", 10, 1000), new EmitKey("my_emitter", "emitKey1"), new Metadata(), new ParseContext(), FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP); StringWriter writer = new StringWriter(); diff --git a/tika-pipes/tika-pipes-fork-parser/src/main/java/org/apache/tika/pipes/fork/PipesForkParser.java b/tika-pipes/tika-pipes-fork-parser/src/main/java/org/apache/tika/pipes/fork/PipesForkParser.java index 0420596d58c..cfb9251e30c 100644 --- a/tika-pipes/tika-pipes-fork-parser/src/main/java/org/apache/tika/pipes/fork/PipesForkParser.java +++ 
b/tika-pipes/tika-pipes-fork-parser/src/main/java/org/apache/tika/pipes/fork/PipesForkParser.java @@ -33,13 +33,14 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; -import org.apache.tika.pipes.api.HandlerConfig; +import org.apache.tika.pipes.api.ParseMode; import org.apache.tika.pipes.api.PipesResult; import org.apache.tika.pipes.api.emitter.EmitKey; import org.apache.tika.pipes.api.fetcher.FetchKey; import org.apache.tika.pipes.core.PipesConfig; import org.apache.tika.pipes.core.PipesException; import org.apache.tika.pipes.core.PipesParser; +import org.apache.tika.sax.ContentHandlerFactory; /** * A ForkParser implementation backed by {@link PipesParser}. @@ -86,7 +87,8 @@ * Example usage: *

  * PipesForkParserConfig config = new PipesForkParserConfig();
- * config.setHandlerConfig(new HandlerConfig(HANDLER_TYPE.TEXT, PARSE_MODE.RMETA, -1, -1, true));
+ * config.setHandlerType(HANDLER_TYPE.TEXT);
+ * config.setParseMode(ParseMode.RMETA);
  *
  * try (PipesForkParser parser = new PipesForkParser(config)) {
  *     // Parse from a file
@@ -204,8 +206,9 @@ public PipesForkResult parse(TikaInputStream tis, Metadata metadata, ParseContex
         FetchKey fetchKey = new FetchKey(config.getFetcherName(), absolutePath);
         EmitKey emitKey = new EmitKey("", id); // Empty emitter name since we're using PASSBACK_ALL
 
-        // Add handler config to parse context so server knows how to handle content
-        parseContext.set(HandlerConfig.class, config.getHandlerConfig());
+        // Add content handler factory and parse mode to parse context
+        parseContext.set(ContentHandlerFactory.class, config.getContentHandlerFactory());
+        parseContext.set(ParseMode.class, config.getParseMode());
 
         FetchEmitTuple tuple = new FetchEmitTuple(id, fetchKey, emitKey, metadata, parseContext);
 
diff --git a/tika-pipes/tika-pipes-fork-parser/src/main/java/org/apache/tika/pipes/fork/PipesForkParserConfig.java b/tika-pipes/tika-pipes-fork-parser/src/main/java/org/apache/tika/pipes/fork/PipesForkParserConfig.java
index 8ffa0b555f1..467a2189730 100644
--- a/tika-pipes/tika-pipes-fork-parser/src/main/java/org/apache/tika/pipes/fork/PipesForkParserConfig.java
+++ b/tika-pipes/tika-pipes-fork-parser/src/main/java/org/apache/tika/pipes/fork/PipesForkParserConfig.java
@@ -20,9 +20,10 @@
 import java.util.ArrayList;
 import java.util.List;
 
-import org.apache.tika.pipes.api.HandlerConfig;
+import org.apache.tika.pipes.api.ParseMode;
 import org.apache.tika.pipes.core.PipesConfig;
 import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.ContentHandlerFactory;
 
 /**
  * Configuration for {@link PipesForkParser}.
@@ -33,13 +34,15 @@
 public class PipesForkParserConfig {
 
     private final PipesConfig pipesConfig;
-    private HandlerConfig handlerConfig;
+    private ContentHandlerFactory contentHandlerFactory;
+    private ParseMode parseMode = ParseMode.RMETA;
     private String fetcherName = PipesForkParser.DEFAULT_FETCHER_NAME;
     private Path pluginsDir;
 
     public PipesForkParserConfig() {
         this.pipesConfig = new PipesConfig();
-        this.handlerConfig = new HandlerConfig();
+        this.contentHandlerFactory = new BasicContentHandlerFactory(
+                BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1);
         // Default to single client for simple fork parser use case
         this.pipesConfig.setNumClients(1);
     }
@@ -54,25 +57,34 @@ public PipesConfig getPipesConfig() {
     }
 
     /**
-     * Get the handler configuration that specifies how content should be handled.
+     * Get the content handler factory that specifies how content should be handled.
      *
-     * @return the handler configuration
+     * @return the content handler factory
      */
-    public HandlerConfig getHandlerConfig() {
-        return handlerConfig;
+    public ContentHandlerFactory getContentHandlerFactory() {
+        return contentHandlerFactory;
     }
 
     /**
-     * Set the handler configuration.
+     * Set the content handler factory.
      *
-     * @param handlerConfig the handler configuration
+     * @param contentHandlerFactory the content handler factory
      * @return this config for chaining
      */
-    public PipesForkParserConfig setHandlerConfig(HandlerConfig handlerConfig) {
-        this.handlerConfig = handlerConfig;
+    public PipesForkParserConfig setContentHandlerFactory(ContentHandlerFactory contentHandlerFactory) {
+        this.contentHandlerFactory = contentHandlerFactory;
         return this;
     }
 
+    /**
+     * Get the parse mode.
+     *
+     * @return the parse mode
+     */
+    public ParseMode getParseMode() {
+        return parseMode;
+    }
+
     /**
      * Set the handler type (TEXT, HTML, XML, etc.).
      *
@@ -80,7 +92,7 @@ public PipesForkParserConfig setHandlerConfig(HandlerConfig handlerConfig) {
      * @return this config for chaining
      */
     public PipesForkParserConfig setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE type) {
-        this.handlerConfig.setType(type);
+        this.contentHandlerFactory = new BasicContentHandlerFactory(type, -1);
         return this;
     }
 
@@ -90,8 +102,8 @@ public PipesForkParserConfig setHandlerType(BasicContentHandlerFactory.HANDLER_T
      * @param parseMode the parse mode
      * @return this config for chaining
      */
-    public PipesForkParserConfig setParseMode(HandlerConfig.PARSE_MODE parseMode) {
-        this.handlerConfig.setParseMode(parseMode);
+    public PipesForkParserConfig setParseMode(ParseMode parseMode) {
+        this.parseMode = parseMode;
         return this;
     }
 
@@ -102,7 +114,9 @@ public PipesForkParserConfig setParseMode(HandlerConfig.PARSE_MODE parseMode) {
      * @return this config for chaining
      */
     public PipesForkParserConfig setWriteLimit(int writeLimit) {
-        this.handlerConfig.setWriteLimit(writeLimit);
+        if (contentHandlerFactory instanceof BasicContentHandlerFactory bcf) {
+            this.contentHandlerFactory = new BasicContentHandlerFactory(bcf.getType(), writeLimit);
+        }
         return this;
     }
 
@@ -113,7 +127,9 @@ public PipesForkParserConfig setWriteLimit(int writeLimit) {
      * @return this config for chaining
      */
     public PipesForkParserConfig setMaxEmbeddedResources(int maxEmbeddedResources) {
-        this.handlerConfig.setMaxEmbeddedResources(maxEmbeddedResources);
+        if (contentHandlerFactory instanceof BasicContentHandlerFactory bcf) {
+            bcf.setMaxEmbeddedResources(maxEmbeddedResources);
+        }
         return this;
     }
 
diff --git a/tika-pipes/tika-pipes-fork-parser/src/test/java/org/apache/tika/pipes/fork/PipesForkParserTest.java b/tika-pipes/tika-pipes-fork-parser/src/test/java/org/apache/tika/pipes/fork/PipesForkParserTest.java
index 30fc322dcef..34e56552b33 100644
--- a/tika-pipes/tika-pipes-fork-parser/src/test/java/org/apache/tika/pipes/fork/PipesForkParserTest.java
+++ b/tika-pipes/tika-pipes-fork-parser/src/test/java/org/apache/tika/pipes/fork/PipesForkParserTest.java
@@ -38,7 +38,7 @@
 
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.pipes.api.HandlerConfig;
+import org.apache.tika.pipes.api.ParseMode;
 import org.apache.tika.pipes.api.PipesResult;
 import org.apache.tika.sax.BasicContentHandlerFactory;
 
@@ -80,7 +80,7 @@ public void testParseTextFile() throws Exception {
         PipesForkParserConfig config = new PipesForkParserConfig()
                 .setPluginsDir(PLUGINS_DIR)
                 .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
-                .setParseMode(HandlerConfig.PARSE_MODE.RMETA)
+                .setParseMode(ParseMode.RMETA)
                 .setTimeoutMillis(60000)
                 .addJvmArg("-Xmx256m");
 
@@ -114,7 +114,7 @@ public void testParseWithMetadata() throws Exception {
         PipesForkParserConfig config = new PipesForkParserConfig()
                 .setPluginsDir(PLUGINS_DIR)
                 .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
-                .setParseMode(HandlerConfig.PARSE_MODE.RMETA)
+                .setParseMode(ParseMode.RMETA)
                 .setTimeoutMillis(60000);
 
         try (PipesForkParser parser = new PipesForkParser(config);
@@ -144,7 +144,7 @@ public void testParseMultipleFiles() throws Exception {
         PipesForkParserConfig config = new PipesForkParserConfig()
                 .setPluginsDir(PLUGINS_DIR)
                 .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
-                .setParseMode(HandlerConfig.PARSE_MODE.RMETA)
+                .setParseMode(ParseMode.RMETA)
                 .setTimeoutMillis(60000);
 
         try (PipesForkParser parser = new PipesForkParser(config)) {
@@ -171,7 +171,7 @@ public void testConcatenateMode() throws Exception {
         PipesForkParserConfig config = new PipesForkParserConfig()
                 .setPluginsDir(PLUGINS_DIR)
                 .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
-                .setParseMode(HandlerConfig.PARSE_MODE.CONCATENATE)
+                .setParseMode(ParseMode.CONCATENATE)
                 .setTimeoutMillis(60000);
 
         try (PipesForkParser parser = new PipesForkParser(config);
@@ -204,7 +204,7 @@ public void testRmetaModeWithEmbedded() throws Exception {
         PipesForkParserConfig config = new PipesForkParserConfig()
                 .setPluginsDir(PLUGINS_DIR)
                 .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
-                .setParseMode(HandlerConfig.PARSE_MODE.RMETA)
+                .setParseMode(ParseMode.RMETA)
                 .setTimeoutMillis(60000);
 
         try (PipesForkParser parser = new PipesForkParser(config);
@@ -232,7 +232,7 @@ public void testDefaultConfigMatchesExplicitRmeta() throws Exception {
         PipesForkParserConfig explicitConfig = new PipesForkParserConfig()
                 .setPluginsDir(PLUGINS_DIR)
                 .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
-                .setParseMode(HandlerConfig.PARSE_MODE.RMETA)
+                .setParseMode(ParseMode.RMETA)
                 .setTimeoutMillis(60000);
 
         int explicitMetadataCount;
@@ -268,7 +268,7 @@ public void testTextVsXhtmlHandlerType() throws Exception {
         PipesForkParserConfig textConfig = new PipesForkParserConfig()
                 .setPluginsDir(PLUGINS_DIR)
                 .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
-                .setParseMode(HandlerConfig.PARSE_MODE.RMETA)
+                .setParseMode(ParseMode.RMETA)
                 .setTimeoutMillis(60000);
 
         String textContent;
@@ -288,7 +288,7 @@ public void testTextVsXhtmlHandlerType() throws Exception {
         PipesForkParserConfig xmlConfig = new PipesForkParserConfig()
                 .setPluginsDir(PLUGINS_DIR)
                 .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.XML)
-                .setParseMode(HandlerConfig.PARSE_MODE.RMETA)
+                .setParseMode(ParseMode.RMETA)
                 .setTimeoutMillis(60000);
 
         String xmlContent;
@@ -322,7 +322,7 @@ public void testWriteLimit() throws Exception {
         PipesForkParserConfig config = new PipesForkParserConfig()
                 .setPluginsDir(PLUGINS_DIR)
                 .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
-                .setParseMode(HandlerConfig.PARSE_MODE.RMETA)
+                .setParseMode(ParseMode.RMETA)
                 .setWriteLimit(100)  // Limit to 100 characters
                 .setTimeoutMillis(60000);
 
diff --git a/tika-pipes/tika-pipes-iterator-commons/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIteratorBase.java b/tika-pipes/tika-pipes-iterator-commons/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIteratorBase.java
index 4fd11352da1..8a4622dcb8e 100644
--- a/tika-pipes/tika-pipes-iterator-commons/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIteratorBase.java
+++ b/tika-pipes/tika-pipes-iterator-commons/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIteratorBase.java
@@ -54,6 +54,16 @@ public abstract class PipesIteratorBase extends AbstractTikaExtension implements
     private int added = 0;
     private FutureTask futureTask;
 
+    /**
+     * The fetcher ID to use for fetching documents.
+     */
+    private String fetcherId;
+
+    /**
+     * The emitter ID to use for emitting results.
+     */
+    private String emitterId;
+
     public PipesIteratorBase(ExtensionConfig pluginConfig) {
         super(pluginConfig);
     }
diff --git a/tika-pipes/tika-pipes-iterator-commons/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIteratorConfig.java b/tika-pipes/tika-pipes-iterator-commons/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIteratorConfig.java
new file mode 100644
index 00000000000..e8356a64a86
--- /dev/null
+++ b/tika-pipes/tika-pipes-iterator-commons/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIteratorConfig.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.pipesiterator;
+
+import java.util.Objects;
+
+/**
+ * Abstract base class for pipes iterator configurations.
+ * Provides the common fetcherId and emitterId fields that all iterators need.
+ * <p>

+ * ContentHandlerFactory, ParseMode, and other parsing settings should be loaded + * from tika-config.json via TikaLoader and set in PipesConfig. + */ +public abstract class PipesIteratorConfig { + + private String fetcherId; + private String emitterId; + + public String getFetcherId() { + return fetcherId; + } + + public void setFetcherId(String fetcherId) { + this.fetcherId = fetcherId; + } + + public String getEmitterId() { + return emitterId; + } + + public void setEmitterId(String emitterId) { + this.emitterId = emitterId; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof PipesIteratorConfig that)) return false; + return Objects.equals(fetcherId, that.fetcherId) && + Objects.equals(emitterId, that.emitterId); + } + + @Override + public int hashCode() { + return Objects.hash(fetcherId, emitterId); + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/main/java/org/apache/tika/pipes/iterator/azblob/AZBlobPipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/main/java/org/apache/tika/pipes/iterator/azblob/AZBlobPipesIterator.java index 8d56f2e87d7..855059914c6 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/main/java/org/apache/tika/pipes/iterator/azblob/AZBlobPipesIterator.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/main/java/org/apache/tika/pipes/iterator/azblob/AZBlobPipesIterator.java @@ -36,10 +36,8 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; -import org.apache.tika.pipes.api.HandlerConfig; import org.apache.tika.pipes.api.emitter.EmitKey; import org.apache.tika.pipes.api.fetcher.FetchKey; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig; import org.apache.tika.pipes.pipesiterator.PipesIteratorBase; import org.apache.tika.plugins.ExtensionConfig; import org.apache.tika.utils.StringUtils; @@ -81,10 +79,8 @@ 
private void checkConfig(AZBlobPipesIteratorConfig config) throws TikaConfigExce @Override protected void enqueue() throws InterruptedException, IOException, TimeoutException { - PipesIteratorBaseConfig baseConfig = config.getBaseConfig(); - String fetcherId = baseConfig.fetcherId(); - String emitterId = baseConfig.emitterId(); - HandlerConfig handlerConfig = baseConfig.handlerConfig(); + String fetcherId = config.getFetcherId(); + String emitterId = config.getEmitterId(); long start = System.currentTimeMillis(); int count = 0; @@ -125,10 +121,9 @@ protected void enqueue() throws InterruptedException, IOException, TimeoutExcept } //TODO -- extract metadata about content length etc from properties ParseContext parseContext = new ParseContext(); - parseContext.set(HandlerConfig.class, handlerConfig); tryToAdd(new FetchEmitTuple(blob.getName(), new FetchKey(fetcherId, blob.getName()), new EmitKey(emitterId, blob.getName()), new Metadata(), parseContext, - baseConfig.onParseException())); + FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT)); count++; } long elapsed = System.currentTimeMillis() - start; diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/main/java/org/apache/tika/pipes/iterator/azblob/AZBlobPipesIteratorConfig.java b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/main/java/org/apache/tika/pipes/iterator/azblob/AZBlobPipesIteratorConfig.java index 068ff346044..ef3d78a49a8 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/main/java/org/apache/tika/pipes/iterator/azblob/AZBlobPipesIteratorConfig.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/main/java/org/apache/tika/pipes/iterator/azblob/AZBlobPipesIteratorConfig.java @@ -22,10 +22,9 @@ import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.tika.exception.TikaConfigException; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorConfig; +import 
org.apache.tika.pipes.pipesiterator.PipesIteratorConfig; -public class AZBlobPipesIteratorConfig implements PipesIteratorConfig { +public class AZBlobPipesIteratorConfig extends PipesIteratorConfig { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); @@ -45,7 +44,6 @@ public static AZBlobPipesIteratorConfig load(final String json) private String container; private String prefix = ""; private long timeoutMillis = 360000; - private PipesIteratorBaseConfig baseConfig = null; public String getSasToken() { return sasToken; @@ -68,32 +66,28 @@ public long getTimeoutMillis() { } @Override - public PipesIteratorBaseConfig getBaseConfig() { - return baseConfig; - } - - @Override - public final boolean equals(Object o) { + public boolean equals(Object o) { if (!(o instanceof AZBlobPipesIteratorConfig that)) { return false; } - + if (!super.equals(o)) { + return false; + } return timeoutMillis == that.timeoutMillis && Objects.equals(sasToken, that.sasToken) && Objects.equals(endpoint, that.endpoint) && Objects.equals(container, that.container) && - Objects.equals(prefix, that.prefix) && - Objects.equals(baseConfig, that.baseConfig); + Objects.equals(prefix, that.prefix); } @Override public int hashCode() { - int result = Objects.hashCode(sasToken); + int result = super.hashCode(); + result = 31 * result + Objects.hashCode(sasToken); result = 31 * result + Objects.hashCode(endpoint); result = 31 * result + Objects.hashCode(container); result = 31 * result + Objects.hashCode(prefix); result = 31 * result + Long.hashCode(timeoutMillis); - result = 31 * result + Objects.hashCode(baseConfig); return result; } } diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/main/java/org/apache/tika/pipes/iterator/csv/CSVPipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/main/java/org/apache/tika/pipes/iterator/csv/CSVPipesIterator.java index 7ca24c03e15..317db26e132 100644 --- 
a/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/main/java/org/apache/tika/pipes/iterator/csv/CSVPipesIterator.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/main/java/org/apache/tika/pipes/iterator/csv/CSVPipesIterator.java @@ -34,7 +34,6 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; -import org.apache.tika.pipes.api.HandlerConfig; import org.apache.tika.pipes.api.emitter.EmitKey; import org.apache.tika.pipes.api.fetcher.FetchKey; import org.apache.tika.pipes.pipesiterator.PipesIteratorBase; @@ -91,8 +90,8 @@ public static CSVPipesIterator build(ExtensionConfig extensionConfig) throws IOE @Override protected void enqueue() throws InterruptedException, IOException, TimeoutException { - String fetcherPluginId = config.getBaseConfig().fetcherId(); - String emitterName = config.getBaseConfig().emitterId(); + String fetcherId = config.getFetcherId(); + String emitterId = config.getEmitterId(); try (Reader reader = Files.newBufferedReader(config.getCsvPath(), charset)) { Iterable records = CSVFormat.EXCEL.parse(reader); List headers = new ArrayList<>(); @@ -103,17 +102,16 @@ protected void enqueue() throws InterruptedException, IOException, TimeoutExcept } try { - checkFetchEmitValidity(fetcherPluginId, emitterName, fetchEmitKeyIndices, headers); + checkFetchEmitValidity(fetcherId, emitterId, fetchEmitKeyIndices, headers); } catch (TikaConfigException e) { throw new IOException(e); } - HandlerConfig handlerConfig = config.getBaseConfig().handlerConfig(); for (CSVRecord record : records) { String id = record.get(fetchEmitKeyIndices.idIndex); String fetchKey = record.get(fetchEmitKeyIndices.fetchKeyIndex); String emitKey = record.get(fetchEmitKeyIndices.emitKeyIndex); - if (StringUtils.isBlank(fetchKey) && !StringUtils.isBlank(fetcherPluginId)) { - LOGGER.debug("Fetcher specified ({}), but no fetchkey was found in ({})", fetcherPluginId, record); + if 
(StringUtils.isBlank(fetchKey) && !StringUtils.isBlank(fetcherId)) { + LOGGER.debug("Fetcher specified ({}), but no fetchkey was found in ({})", fetcherId, record); } if (StringUtils.isBlank(emitKey)) { throw new IOException("emitKey must not be blank in :" + record); @@ -121,27 +119,26 @@ protected void enqueue() throws InterruptedException, IOException, TimeoutExcept Metadata metadata = loadMetadata(fetchEmitKeyIndices, headers, record); ParseContext parseContext = new ParseContext(); - parseContext.set(HandlerConfig.class, handlerConfig); - tryToAdd(new FetchEmitTuple(id, new FetchKey(fetcherPluginId, fetchKey), new EmitKey(emitterName, emitKey), metadata, parseContext, - config.getBaseConfig().onParseException())); + tryToAdd(new FetchEmitTuple(id, new FetchKey(fetcherId, fetchKey), new EmitKey(emitterId, emitKey), metadata, parseContext, + FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT)); } } } - private void checkFetchEmitValidity(String fetcherPluginId, String emitterName, FetchEmitKeyIndices fetchEmitKeyIndices, List headers) throws TikaConfigException { + private void checkFetchEmitValidity(String fetcherId, String emitterId, FetchEmitKeyIndices fetchEmitKeyIndices, List headers) throws TikaConfigException { String fetchKeyColumn = config.getFetchKeyColumn(); String emitKeyColumn = config.getEmitKeyColumn(); String idColumn = config.getIdColumn(); - if (StringUtils.isBlank(emitterName)) { - throw new TikaConfigException("must specify at least an emitterName"); + if (StringUtils.isBlank(emitterId)) { + throw new TikaConfigException("must specify at least an emitterId"); } - if (StringUtils.isBlank(fetcherPluginId) && !StringUtils.isBlank(fetchKeyColumn)) { - throw new TikaConfigException("If specifying a 'fetchKeyColumn', " + "you must also specify a 'fetcherPluginId'"); + if (StringUtils.isBlank(fetcherId) && !StringUtils.isBlank(fetchKeyColumn)) { + throw new TikaConfigException("If specifying a 'fetchKeyColumn', " + "you must also specify a 'fetcherId'"); } - if 
(StringUtils.isBlank(fetcherPluginId)) { + if (StringUtils.isBlank(fetcherId)) { LOGGER.info("No fetcher specified. This will be metadata only"); } diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/main/java/org/apache/tika/pipes/iterator/csv/CSVPipesIteratorConfig.java b/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/main/java/org/apache/tika/pipes/iterator/csv/CSVPipesIteratorConfig.java index 46bee035e9a..3a5231821d8 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/main/java/org/apache/tika/pipes/iterator/csv/CSVPipesIteratorConfig.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/main/java/org/apache/tika/pipes/iterator/csv/CSVPipesIteratorConfig.java @@ -23,10 +23,9 @@ import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.tika.exception.TikaConfigException; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorConfig; +import org.apache.tika.pipes.pipesiterator.PipesIteratorConfig; -public class CSVPipesIteratorConfig implements PipesIteratorConfig { +public class CSVPipesIteratorConfig extends PipesIteratorConfig { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); @@ -45,7 +44,6 @@ public static CSVPipesIteratorConfig load(final String json) private String fetchKeyColumn; private String emitKeyColumn; private String idColumn; - private PipesIteratorBaseConfig baseConfig = null; public Path getCsvPath() { return csvPath; @@ -64,30 +62,26 @@ public String getIdColumn() { } @Override - public PipesIteratorBaseConfig getBaseConfig() { - return baseConfig; - } - - @Override - public final boolean equals(Object o) { + public boolean equals(Object o) { if (!(o instanceof CSVPipesIteratorConfig that)) { return false; } - + if (!super.equals(o)) { + return false; + } return Objects.equals(csvPath, that.csvPath) && Objects.equals(fetchKeyColumn, that.fetchKeyColumn) && Objects.equals(emitKeyColumn, that.emitKeyColumn) 
&& - Objects.equals(idColumn, that.idColumn) && - Objects.equals(baseConfig, that.baseConfig); + Objects.equals(idColumn, that.idColumn); } @Override public int hashCode() { - int result = Objects.hashCode(csvPath); + int result = super.hashCode(); + result = 31 * result + Objects.hashCode(csvPath); result = 31 * result + Objects.hashCode(fetchKeyColumn); result = 31 * result + Objects.hashCode(emitKeyColumn); result = 31 * result + Objects.hashCode(idColumn); - result = 31 * result + Objects.hashCode(baseConfig); return result; } } diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/iterator/fs/FileSystemPipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/iterator/fs/FileSystemPipesIterator.java index 4dedfaf478f..bb4b1fc0dcd 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/iterator/fs/FileSystemPipesIterator.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/iterator/fs/FileSystemPipesIterator.java @@ -34,10 +34,8 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; -import org.apache.tika.pipes.api.HandlerConfig; import org.apache.tika.pipes.api.emitter.EmitKey; import org.apache.tika.pipes.api.fetcher.FetchKey; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig; import org.apache.tika.pipes.api.pipesiterator.TotalCountResult; import org.apache.tika.pipes.api.pipesiterator.TotalCounter; import org.apache.tika.pipes.pipesiterator.PipesIteratorBase; @@ -79,9 +77,8 @@ protected void enqueue() throws InterruptedException, IOException, TimeoutExcept "\"basePath\" directory does not exist: " + config .getBasePath().toAbsolutePath()); } - PipesIteratorBaseConfig config = this.config.getBaseConfig(); try { - Files.walkFileTree(this.config.getBasePath(), new 
FSFileVisitor(config.fetcherId(), config.emitterId())); + Files.walkFileTree(config.getBasePath(), new FSFileVisitor(config.getFetcherId(), config.getEmitterId())); } catch (IOException e) { Throwable cause = e.getCause(); if (cause != null && cause instanceof TimeoutException) { @@ -139,15 +136,14 @@ public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) @Override public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { - String relPath = config + String relPath = FileSystemPipesIterator.this.config .getBasePath().relativize(file).toString(); - PipesIteratorBaseConfig config = FileSystemPipesIterator.this.config.getBaseConfig(); try { ParseContext parseContext = new ParseContext(); - parseContext.set(HandlerConfig.class, config.handlerConfig()); + // ContentHandlerFactory, ParseMode, and onParseException come from PipesConfig loaded via TikaLoader tryToAdd(new FetchEmitTuple(relPath, new FetchKey(fetcherId, relPath), new EmitKey(emitterId, relPath), new Metadata(), parseContext, - config.onParseException())); + FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT)); } catch (TimeoutException e) { throw new IOException(e); } catch (InterruptedException e) { diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/iterator/fs/FileSystemPipesIteratorConfig.java b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/iterator/fs/FileSystemPipesIteratorConfig.java index 0648620fc4c..61eeeb66a65 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/iterator/fs/FileSystemPipesIteratorConfig.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/iterator/fs/FileSystemPipesIteratorConfig.java @@ -23,10 +23,9 @@ import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.tika.exception.TikaConfigException; -import 
org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorConfig; +import org.apache.tika.pipes.pipesiterator.PipesIteratorConfig; -public class FileSystemPipesIteratorConfig implements PipesIteratorConfig { +public class FileSystemPipesIteratorConfig extends PipesIteratorConfig { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); @@ -44,7 +43,6 @@ public static FileSystemPipesIteratorConfig load(final String json) private Path basePath = null; private boolean countTotal = true; - private PipesIteratorBaseConfig baseConfig = null; public Path getBasePath() { return basePath; @@ -55,24 +53,21 @@ public boolean isCountTotal() { } @Override - public PipesIteratorBaseConfig getBaseConfig() { - return baseConfig; - } - - @Override - public final boolean equals(Object o) { + public boolean equals(Object o) { if (!(o instanceof FileSystemPipesIteratorConfig that)) { return false; } - - return countTotal == that.countTotal && Objects.equals(basePath, that.basePath) && Objects.equals(baseConfig, that.baseConfig); + if (!super.equals(o)) { + return false; + } + return countTotal == that.countTotal && Objects.equals(basePath, that.basePath); } @Override public int hashCode() { - int result = Objects.hashCode(basePath); + int result = super.hashCode(); + result = 31 * result + Objects.hashCode(basePath); result = 31 * result + Boolean.hashCode(countTotal); - result = 31 * result + Objects.hashCode(baseConfig); return result; } } diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/main/java/org/apache/tika/pipes/iterator/gcs/GCSPipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/main/java/org/apache/tika/pipes/iterator/gcs/GCSPipesIterator.java index f25fd696af1..0b64c18812c 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/main/java/org/apache/tika/pipes/iterator/gcs/GCSPipesIterator.java +++ 
b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/main/java/org/apache/tika/pipes/iterator/gcs/GCSPipesIterator.java @@ -30,10 +30,8 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; -import org.apache.tika.pipes.api.HandlerConfig; import org.apache.tika.pipes.api.emitter.EmitKey; import org.apache.tika.pipes.api.fetcher.FetchKey; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig; import org.apache.tika.pipes.pipesiterator.PipesIteratorBase; import org.apache.tika.plugins.ExtensionConfig; import org.apache.tika.utils.StringUtils; @@ -71,12 +69,10 @@ public static GCSPipesIterator build(ExtensionConfig extensionConfig) throws IOE @Override protected void enqueue() throws InterruptedException, IOException, TimeoutException { - PipesIteratorBaseConfig baseConfig = config.getBaseConfig(); - String fetcherPluginId = baseConfig.fetcherId(); - String emitterName = baseConfig.emitterId(); + String fetcherId = config.getFetcherId(); + String emitterId = config.getEmitterId(); long start = System.currentTimeMillis(); int count = 0; - HandlerConfig handlerConfig = baseConfig.handlerConfig(); Page blobs = null; String prefix = config.getPrefix(); @@ -96,9 +92,8 @@ protected void enqueue() throws InterruptedException, IOException, TimeoutExcept LOGGER.debug("adding ({}) {} in {} ms", count, blob.getName(), elapsed); //TODO -- allow user specified metadata as the "id"? 
ParseContext parseContext = new ParseContext(); - parseContext.set(HandlerConfig.class, handlerConfig); - tryToAdd(new FetchEmitTuple(blob.getName(), new FetchKey(fetcherPluginId, blob.getName()), new EmitKey(emitterName, blob.getName()), new Metadata(), parseContext, - baseConfig.onParseException())); + tryToAdd(new FetchEmitTuple(blob.getName(), new FetchKey(fetcherId, blob.getName()), new EmitKey(emitterId, blob.getName()), new Metadata(), parseContext, + FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT)); count++; } long elapsed = System.currentTimeMillis() - start; diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/main/java/org/apache/tika/pipes/iterator/gcs/GCSPipesIteratorConfig.java b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/main/java/org/apache/tika/pipes/iterator/gcs/GCSPipesIteratorConfig.java index f4c4f1690e3..d87fea102a1 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/main/java/org/apache/tika/pipes/iterator/gcs/GCSPipesIteratorConfig.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/main/java/org/apache/tika/pipes/iterator/gcs/GCSPipesIteratorConfig.java @@ -22,10 +22,9 @@ import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.tika.exception.TikaConfigException; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorConfig; +import org.apache.tika.pipes.pipesiterator.PipesIteratorConfig; -public class GCSPipesIteratorConfig implements PipesIteratorConfig { +public class GCSPipesIteratorConfig extends PipesIteratorConfig { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); @@ -42,7 +41,6 @@ public static GCSPipesIteratorConfig load(final String json) private String bucket; private String prefix = ""; private String projectId = ""; - private PipesIteratorBaseConfig baseConfig = null; public String getBucket() { return bucket; @@ -57,28 +55,24 @@ public String getProjectId() { } @Override - public 
PipesIteratorBaseConfig getBaseConfig() { - return baseConfig; - } - - @Override - public final boolean equals(Object o) { + public boolean equals(Object o) { if (!(o instanceof GCSPipesIteratorConfig that)) { return false; } - + if (!super.equals(o)) { + return false; + } return Objects.equals(bucket, that.bucket) && Objects.equals(prefix, that.prefix) && - Objects.equals(projectId, that.projectId) && - Objects.equals(baseConfig, that.baseConfig); + Objects.equals(projectId, that.projectId); } @Override public int hashCode() { - int result = Objects.hashCode(bucket); + int result = super.hashCode(); + result = 31 * result + Objects.hashCode(bucket); result = 31 * result + Objects.hashCode(prefix); result = 31 * result + Objects.hashCode(projectId); - result = 31 * result + Objects.hashCode(baseConfig); return result; } } diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/main/java/org/apache/tika/pipes/iterator/jdbc/JDBCPipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/main/java/org/apache/tika/pipes/iterator/jdbc/JDBCPipesIterator.java index fbf86f4fe63..be0fccfdfa1 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/main/java/org/apache/tika/pipes/iterator/jdbc/JDBCPipesIterator.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/main/java/org/apache/tika/pipes/iterator/jdbc/JDBCPipesIterator.java @@ -34,10 +34,8 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; -import org.apache.tika.pipes.api.HandlerConfig; import org.apache.tika.pipes.api.emitter.EmitKey; import org.apache.tika.pipes.api.fetcher.FetchKey; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig; import org.apache.tika.pipes.pipesiterator.PipesIteratorBase; import org.apache.tika.plugins.ExtensionConfig; import org.apache.tika.utils.StringUtils; @@ -78,16 +76,15 @@ private JDBCPipesIterator(JDBCPipesIteratorConfig config, ExtensionConfig 
extens throw new TikaConfigException("select must not be empty"); } - PipesIteratorBaseConfig baseConfig = config.getBaseConfig(); - String fetcherName = baseConfig.fetcherId(); - String emitterName = baseConfig.emitterId(); + String fetcherName = config.getFetcherId(); + String emitterName = config.getEmitterId(); if (StringUtils.isBlank(fetcherName) && !StringUtils.isBlank(config.getFetchKeyColumn())) { - throw new TikaConfigException("If you specify a 'fetchKeyColumn', you must specify a 'fetcherPluginId'"); + throw new TikaConfigException("If you specify a 'fetchKeyColumn', you must specify a 'fetcherId'"); } if (StringUtils.isBlank(emitterName) && !StringUtils.isBlank(config.getEmitKeyColumn())) { - throw new TikaConfigException("If you specify an 'emitKeyColumn', you must specify an 'emitterPluginId'"); + throw new TikaConfigException("If you specify an 'emitKeyColumn', you must specify an 'emitterId'"); } if (StringUtils.isBlank(emitterName) && StringUtils.isBlank(fetcherName)) { @@ -120,13 +117,11 @@ public static JDBCPipesIterator build(ExtensionConfig extensionConfig) throws IO @Override protected void enqueue() throws InterruptedException, IOException, TimeoutException { - PipesIteratorBaseConfig baseConfig = config.getBaseConfig(); - String fetcherPluginId = baseConfig.fetcherId(); - String emitterName = baseConfig.emitterId(); + String fetcherId = config.getFetcherId(); + String emitterId = config.getEmitterId(); FetchEmitKeyIndices fetchEmitKeyIndices = null; List headers = new ArrayList<>(); int rowCount = 0; - HandlerConfig handlerConfig = baseConfig.handlerConfig(); LOGGER.debug("select: {}", config.getSelect()); try (Statement st = db.createStatement()) { if (config.getFetchSize() > 0) { @@ -139,10 +134,10 @@ protected void enqueue() throws InterruptedException, IOException, TimeoutExcept while (rs.next()) { if (headers.size() == 0) { fetchEmitKeyIndices = loadHeaders(rs.getMetaData(), headers); - checkFetchEmitValidity(fetcherPluginId, 
emitterName, fetchEmitKeyIndices, headers); + checkFetchEmitValidity(fetcherId, emitterId, fetchEmitKeyIndices, headers); } try { - processRow(fetcherPluginId, emitterName, headers, fetchEmitKeyIndices, rs, handlerConfig, baseConfig); + processRow(fetcherId, emitterId, headers, fetchEmitKeyIndices, rs); } catch (SQLException e) { LOGGER.warn("Failed to insert: " + rs, e); } @@ -164,7 +159,7 @@ protected void enqueue() throws InterruptedException, IOException, TimeoutExcept } } - private void checkFetchEmitValidity(String fetcherPluginId, String emitterName, FetchEmitKeyIndices fetchEmitKeyIndices, List headers) throws IOException { + private void checkFetchEmitValidity(String fetcherId, String emitterId, FetchEmitKeyIndices fetchEmitKeyIndices, List headers) throws IOException { if (!StringUtils.isBlank(config.getFetchKeyColumn()) && fetchEmitKeyIndices.fetchKeyIndex < 0) { throw new IOException(new TikaConfigException("Couldn't find fetchkey column: " + config.getFetchKeyColumn())); } @@ -180,9 +175,8 @@ private void checkFetchEmitValidity(String fetcherPluginId, String emitterName, } } - private void processRow(String fetcherPluginId, String emitterName, List headers, - FetchEmitKeyIndices fetchEmitKeyIndices, ResultSet rs, - HandlerConfig handlerConfig, PipesIteratorBaseConfig baseConfig) + private void processRow(String fetcherId, String emitterId, List headers, + FetchEmitKeyIndices fetchEmitKeyIndices, ResultSet rs) throws SQLException, TimeoutException, InterruptedException { Metadata metadata = new Metadata(); String fetchKey = ""; @@ -233,9 +227,8 @@ private void processRow(String fetcherPluginId, String emitterName, List } } ParseContext parseContext = new ParseContext(); - parseContext.set(HandlerConfig.class, handlerConfig); - tryToAdd(new FetchEmitTuple(id, new FetchKey(fetcherPluginId, fetchKey, fetchStartRange, fetchEndRange), new EmitKey(emitterName, emitKey), metadata, parseContext, - baseConfig.onParseException())); + tryToAdd(new 
FetchEmitTuple(id, new FetchKey(fetcherId, fetchKey, fetchStartRange, fetchEndRange), new EmitKey(emitterId, emitKey), metadata, parseContext, + FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT)); } private String toString(ResultSet rs) throws SQLException { diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/main/java/org/apache/tika/pipes/iterator/jdbc/JDBCPipesIteratorConfig.java b/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/main/java/org/apache/tika/pipes/iterator/jdbc/JDBCPipesIteratorConfig.java index 5cdfa0a7076..ff6b68d229e 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/main/java/org/apache/tika/pipes/iterator/jdbc/JDBCPipesIteratorConfig.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/main/java/org/apache/tika/pipes/iterator/jdbc/JDBCPipesIteratorConfig.java @@ -22,10 +22,9 @@ import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.tika.exception.TikaConfigException; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorConfig; +import org.apache.tika.pipes.pipesiterator.PipesIteratorConfig; -public class JDBCPipesIteratorConfig implements PipesIteratorConfig { +public class JDBCPipesIteratorConfig extends PipesIteratorConfig { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); @@ -49,7 +48,6 @@ public static JDBCPipesIteratorConfig load(final String json) private String select; private int fetchSize = -1; private int queryTimeoutSeconds = -1; - private PipesIteratorBaseConfig baseConfig = null; public String getIdColumn() { return idColumn; @@ -88,16 +86,13 @@ public int getQueryTimeoutSeconds() { } @Override - public PipesIteratorBaseConfig getBaseConfig() { - return baseConfig; - } - - @Override - public final boolean equals(Object o) { + public boolean equals(Object o) { if (!(o instanceof JDBCPipesIteratorConfig that)) { return false; } - + if (!super.equals(o)) { + return false; + } return fetchSize == 
that.fetchSize && queryTimeoutSeconds == that.queryTimeoutSeconds && Objects.equals(idColumn, that.idColumn) && @@ -106,13 +101,13 @@ public final boolean equals(Object o) { Objects.equals(fetchKeyRangeEndColumn, that.fetchKeyRangeEndColumn) && Objects.equals(emitKeyColumn, that.emitKeyColumn) && Objects.equals(connection, that.connection) && - Objects.equals(select, that.select) && - Objects.equals(baseConfig, that.baseConfig); + Objects.equals(select, that.select); } @Override public int hashCode() { - int result = Objects.hashCode(idColumn); + int result = super.hashCode(); + result = 31 * result + Objects.hashCode(idColumn); result = 31 * result + Objects.hashCode(fetchKeyColumn); result = 31 * result + Objects.hashCode(fetchKeyRangeStartColumn); result = 31 * result + Objects.hashCode(fetchKeyRangeEndColumn); @@ -121,7 +116,6 @@ public int hashCode() { result = 31 * result + Objects.hashCode(select); result = 31 * result + fetchSize; result = 31 * result + queryTimeoutSeconds; - result = 31 * result + Objects.hashCode(baseConfig); return result; } } diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/main/java/org/apache/tika/pipes/pipesiterator/json/JsonPipesIteratorConfig.java b/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/main/java/org/apache/tika/pipes/pipesiterator/json/JsonPipesIteratorConfig.java index a9942a625ca..c3f6f53924b 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/main/java/org/apache/tika/pipes/pipesiterator/json/JsonPipesIteratorConfig.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/main/java/org/apache/tika/pipes/pipesiterator/json/JsonPipesIteratorConfig.java @@ -23,10 +23,9 @@ import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.tika.exception.TikaConfigException; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorConfig; +import org.apache.tika.pipes.pipesiterator.PipesIteratorConfig; -public 
class JsonPipesIteratorConfig implements PipesIteratorConfig { +public class JsonPipesIteratorConfig extends PipesIteratorConfig { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); @@ -42,31 +41,26 @@ public static JsonPipesIteratorConfig load(final String json) } private Path jsonPath; - private PipesIteratorBaseConfig baseConfig = null; public Path getJsonPath() { return jsonPath; } @Override - public PipesIteratorBaseConfig getBaseConfig() { - return baseConfig; - } - - @Override - public final boolean equals(Object o) { + public boolean equals(Object o) { if (!(o instanceof JsonPipesIteratorConfig that)) { return false; } - - return Objects.equals(jsonPath, that.jsonPath) && - Objects.equals(baseConfig, that.baseConfig); + if (!super.equals(o)) { + return false; + } + return Objects.equals(jsonPath, that.jsonPath); } @Override public int hashCode() { - int result = Objects.hashCode(jsonPath); - result = 31 * result + Objects.hashCode(baseConfig); + int result = super.hashCode(); + result = 31 * result + Objects.hashCode(jsonPath); return result; } } diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/main/java/org/apache/tika/pipes/iterator/kafka/KafkaPipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/main/java/org/apache/tika/pipes/iterator/kafka/KafkaPipesIterator.java index 285bc07188d..ccf9e2037f3 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/main/java/org/apache/tika/pipes/iterator/kafka/KafkaPipesIterator.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/main/java/org/apache/tika/pipes/iterator/kafka/KafkaPipesIterator.java @@ -35,10 +35,8 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; -import org.apache.tika.pipes.api.HandlerConfig; import org.apache.tika.pipes.api.emitter.EmitKey; import org.apache.tika.pipes.api.fetcher.FetchKey; -import 
org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig; import org.apache.tika.pipes.pipesiterator.PipesIteratorBase; import org.apache.tika.plugins.ExtensionConfig; @@ -99,10 +97,8 @@ private Object serializerClass(String className, Class defaultClass) { @Override protected void enqueue() throws InterruptedException, TimeoutException { - PipesIteratorBaseConfig baseConfig = config.getBaseConfig(); - String fetcherId = baseConfig.fetcherId(); - String emitterId = baseConfig.emitterId(); - HandlerConfig handlerConfig = baseConfig.handlerConfig(); + String fetcherId = config.getFetcherId(); + String emitterId = config.getEmitterId(); long start = System.currentTimeMillis(); int count = 0; @@ -117,10 +113,9 @@ protected void enqueue() throws InterruptedException, TimeoutException { LOGGER.debug("adding ({}) {} in {} ms", count, r.key(), elapsed); } ParseContext parseContext = new ParseContext(); - parseContext.set(HandlerConfig.class, handlerConfig); tryToAdd(new FetchEmitTuple(r.key(), new FetchKey(fetcherId, r.key()), new EmitKey(emitterId, r.key()), new Metadata(), parseContext, - baseConfig.onParseException())); + FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT)); ++count; } } while ((emitMax < 0 || count < emitMax) && !records.isEmpty()); diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/main/java/org/apache/tika/pipes/iterator/kafka/KafkaPipesIteratorConfig.java b/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/main/java/org/apache/tika/pipes/iterator/kafka/KafkaPipesIteratorConfig.java index 53675ac233a..63342adbe31 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/main/java/org/apache/tika/pipes/iterator/kafka/KafkaPipesIteratorConfig.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/main/java/org/apache/tika/pipes/iterator/kafka/KafkaPipesIteratorConfig.java @@ -22,10 +22,9 @@ import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.tika.exception.TikaConfigException; -import 
org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorConfig; +import org.apache.tika.pipes.pipesiterator.PipesIteratorConfig; -public class KafkaPipesIteratorConfig implements PipesIteratorConfig { +public class KafkaPipesIteratorConfig extends PipesIteratorConfig { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); @@ -49,7 +48,6 @@ public static KafkaPipesIteratorConfig load(final String json) private int pollDelayMs = 100; private int emitMax = -1; private int groupInitialRebalanceDelayMs = 3000; - private PipesIteratorBaseConfig baseConfig = null; public String getTopic() { return topic; @@ -88,16 +86,13 @@ public int getGroupInitialRebalanceDelayMs() { } @Override - public PipesIteratorBaseConfig getBaseConfig() { - return baseConfig; - } - - @Override - public final boolean equals(Object o) { + public boolean equals(Object o) { if (!(o instanceof KafkaPipesIteratorConfig that)) { return false; } - + if (!super.equals(o)) { + return false; + } return pollDelayMs == that.pollDelayMs && emitMax == that.emitMax && groupInitialRebalanceDelayMs == that.groupInitialRebalanceDelayMs && @@ -106,13 +101,13 @@ public final boolean equals(Object o) { Objects.equals(keySerializer, that.keySerializer) && Objects.equals(valueSerializer, that.valueSerializer) && Objects.equals(groupId, that.groupId) && - Objects.equals(autoOffsetReset, that.autoOffsetReset) && - Objects.equals(baseConfig, that.baseConfig); + Objects.equals(autoOffsetReset, that.autoOffsetReset); } @Override public int hashCode() { - int result = Objects.hashCode(topic); + int result = super.hashCode(); + result = 31 * result + Objects.hashCode(topic); result = 31 * result + Objects.hashCode(bootstrapServers); result = 31 * result + Objects.hashCode(keySerializer); result = 31 * result + Objects.hashCode(valueSerializer); @@ -121,7 +116,6 @@ public int hashCode() { result = 31 * result + pollDelayMs; result = 31 * 
result + emitMax; result = 31 * result + groupInitialRebalanceDelayMs; - result = 31 * result + Objects.hashCode(baseConfig); return result; } } diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/main/java/org/apache/tika/pipes/iterator/s3/S3PipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/main/java/org/apache/tika/pipes/iterator/s3/S3PipesIterator.java index 6a9539ca316..6e6daa11526 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/main/java/org/apache/tika/pipes/iterator/s3/S3PipesIterator.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/main/java/org/apache/tika/pipes/iterator/s3/S3PipesIterator.java @@ -46,10 +46,8 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; -import org.apache.tika.pipes.api.HandlerConfig; import org.apache.tika.pipes.api.emitter.EmitKey; import org.apache.tika.pipes.api.fetcher.FetchKey; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig; import org.apache.tika.pipes.pipesiterator.PipesIteratorBase; import org.apache.tika.plugins.ExtensionConfig; import org.apache.tika.utils.StringUtils; @@ -125,12 +123,10 @@ public static S3PipesIterator build(ExtensionConfig extensionConfig) throws IOEx @Override protected void enqueue() throws InterruptedException, IOException, TimeoutException { - PipesIteratorBaseConfig baseConfig = config.getBaseConfig(); - String fetcherPluginId = baseConfig.fetcherId(); - String emitterName = baseConfig.emitterId(); + String fetcherId = config.getFetcherId(); + String emitterId = config.getEmitterId(); long start = System.currentTimeMillis(); int count = 0; - HandlerConfig handlerConfig = baseConfig.handlerConfig(); final Matcher fileNameMatcher; if (fileNamePattern != null) { fileNameMatcher = fileNamePattern.matcher(""); @@ -149,9 +145,8 @@ protected void enqueue() throws InterruptedException, IOException, TimeoutExcept long elapsed = 
System.currentTimeMillis() - start; LOGGER.debug("adding ({}) {} in {} ms", count, key, elapsed); ParseContext parseContext = new ParseContext(); - parseContext.set(HandlerConfig.class, handlerConfig); - tryToAdd(new FetchEmitTuple(key, new FetchKey(fetcherPluginId, key), new EmitKey(emitterName, key), new Metadata(), parseContext, - baseConfig.onParseException())); + tryToAdd(new FetchEmitTuple(key, new FetchKey(fetcherId, key), new EmitKey(emitterId, key), new Metadata(), parseContext, + FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT)); count++; } long elapsed = System.currentTimeMillis() - start; diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/main/java/org/apache/tika/pipes/iterator/s3/S3PipesIteratorConfig.java b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/main/java/org/apache/tika/pipes/iterator/s3/S3PipesIteratorConfig.java index dc4bd12c2e8..4e8cf3ef20d 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/main/java/org/apache/tika/pipes/iterator/s3/S3PipesIteratorConfig.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/main/java/org/apache/tika/pipes/iterator/s3/S3PipesIteratorConfig.java @@ -22,10 +22,9 @@ import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.tika.exception.TikaConfigException; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorConfig; +import org.apache.tika.pipes.pipesiterator.PipesIteratorConfig; -public class S3PipesIteratorConfig implements PipesIteratorConfig { +public class S3PipesIteratorConfig extends PipesIteratorConfig { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); @@ -50,7 +49,6 @@ public static S3PipesIteratorConfig load(final String json) private String fileNamePattern; private int maxConnections = 50; private boolean pathStyleAccessEnabled = false; - private PipesIteratorBaseConfig baseConfig = null; public String getPrefix() { return prefix; @@ -97,16 +95,13 @@ public boolean 
isPathStyleAccessEnabled() { } @Override - public PipesIteratorBaseConfig getBaseConfig() { - return baseConfig; - } - - @Override - public final boolean equals(Object o) { + public boolean equals(Object o) { if (!(o instanceof S3PipesIteratorConfig that)) { return false; } - + if (!super.equals(o)) { + return false; + } return maxConnections == that.maxConnections && pathStyleAccessEnabled == that.pathStyleAccessEnabled && Objects.equals(prefix, that.prefix) && @@ -117,13 +112,13 @@ public final boolean equals(Object o) { Objects.equals(credentialsProvider, that.credentialsProvider) && Objects.equals(profile, that.profile) && Objects.equals(bucket, that.bucket) && - Objects.equals(fileNamePattern, that.fileNamePattern) && - Objects.equals(baseConfig, that.baseConfig); + Objects.equals(fileNamePattern, that.fileNamePattern); } @Override public int hashCode() { - int result = Objects.hashCode(prefix); + int result = super.hashCode(); + result = 31 * result + Objects.hashCode(prefix); result = 31 * result + Objects.hashCode(region); result = 31 * result + Objects.hashCode(accessKey); result = 31 * result + Objects.hashCode(secretKey); @@ -134,7 +129,6 @@ public int hashCode() { result = 31 * result + Objects.hashCode(fileNamePattern); result = 31 * result + maxConnections; result = 31 * result + Boolean.hashCode(pathStyleAccessEnabled); - result = 31 * result + Objects.hashCode(baseConfig); return result; } } diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/main/java/org/apache/tika/pipes/iterator/solr/SolrPipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/main/java/org/apache/tika/pipes/iterator/solr/SolrPipesIterator.java index 6be72029b8c..02615bf12e6 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/main/java/org/apache/tika/pipes/iterator/solr/SolrPipesIterator.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/main/java/org/apache/tika/pipes/iterator/solr/SolrPipesIterator.java @@ -42,10 +42,8 @@ import 
org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; -import org.apache.tika.pipes.api.HandlerConfig; import org.apache.tika.pipes.api.emitter.EmitKey; import org.apache.tika.pipes.api.fetcher.FetchKey; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig; import org.apache.tika.pipes.pipesiterator.PipesIteratorBase; import org.apache.tika.plugins.ExtensionConfig; import org.apache.tika.utils.StringUtils; @@ -119,9 +117,8 @@ private void configure() throws IOException, TikaConfigException { @Override protected void enqueue() throws InterruptedException, IOException, TimeoutException { - PipesIteratorBaseConfig baseConfig = config.getBaseConfig(); - String fetcherId = baseConfig.fetcherId(); - String emitterId = baseConfig.emitterId(); + String fetcherId = config.getFetcherId(); + String emitterId = config.getEmitterId(); try (SolrClient solrClient = createSolrClient()) { int fileCount = 0; @@ -145,8 +142,6 @@ protected void enqueue() throws InterruptedException, IOException, TimeoutExcept List filters = config.getFilters() != null ? 
config.getFilters() : Collections.emptyList(); query.setFilterQueries(filters.toArray(new String[]{})); - HandlerConfig handlerConfig = baseConfig.handlerConfig(); - String cursorMark = CursorMarkParams.CURSOR_MARK_START; boolean done = false; while (!done) { @@ -167,9 +162,8 @@ protected void enqueue() throws InterruptedException, IOException, TimeoutExcept } LOGGER.info("iterator doc: {}, idField={}, fetchKey={}", sd, config.getIdField(), fetchKey); ParseContext parseContext = new ParseContext(); - parseContext.set(HandlerConfig.class, handlerConfig); tryToAdd(new FetchEmitTuple(fetchKey, new FetchKey(fetcherId, fetchKey), new EmitKey(emitterId, emitKey), new Metadata(), parseContext, - baseConfig.onParseException())); + FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT)); } if (cursorMark.equals(nextCursorMark)) { done = true; diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/main/java/org/apache/tika/pipes/iterator/solr/SolrPipesIteratorConfig.java b/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/main/java/org/apache/tika/pipes/iterator/solr/SolrPipesIteratorConfig.java index 60211ed9ac9..9c37a52819c 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/main/java/org/apache/tika/pipes/iterator/solr/SolrPipesIteratorConfig.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/main/java/org/apache/tika/pipes/iterator/solr/SolrPipesIteratorConfig.java @@ -24,10 +24,9 @@ import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.tika.exception.TikaConfigException; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorConfig; +import org.apache.tika.pipes.pipesiterator.PipesIteratorConfig; -public class SolrPipesIteratorConfig implements PipesIteratorConfig { +public class SolrPipesIteratorConfig extends PipesIteratorConfig { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); @@ -60,7 +59,6 @@ public static SolrPipesIteratorConfig 
load(final String json) private String authScheme; private String proxyHost; private int proxyPort = 0; - private PipesIteratorBaseConfig baseConfig = null; public String getSolrCollection() { return solrCollection; @@ -135,16 +133,13 @@ public int getProxyPort() { } @Override - public PipesIteratorBaseConfig getBaseConfig() { - return baseConfig; - } - - @Override - public final boolean equals(Object o) { + public boolean equals(Object o) { if (!(o instanceof SolrPipesIteratorConfig that)) { return false; } - + if (!super.equals(o)) { + return false; + } return rows == that.rows && connectionTimeout == that.connectionTimeout && socketTimeout == that.socketTimeout && @@ -162,13 +157,13 @@ public final boolean equals(Object o) { Objects.equals(userName, that.userName) && Objects.equals(password, that.password) && Objects.equals(authScheme, that.authScheme) && - Objects.equals(proxyHost, that.proxyHost) && - Objects.equals(baseConfig, that.baseConfig); + Objects.equals(proxyHost, that.proxyHost); } @Override public int hashCode() { - int result = Objects.hashCode(solrCollection); + int result = super.hashCode(); + result = 31 * result + Objects.hashCode(solrCollection); result = 31 * result + Objects.hashCode(solrUrls); result = 31 * result + Objects.hashCode(solrZkHosts); result = 31 * result + Objects.hashCode(solrZkChroot); @@ -186,7 +181,6 @@ public int hashCode() { result = 31 * result + Objects.hashCode(authScheme); result = 31 * result + Objects.hashCode(proxyHost); result = 31 * result + proxyPort; - result = 31 * result + Objects.hashCode(baseConfig); return result; } } diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentRegistry.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentRegistry.java index ce3593f0ca0..50db8ef6acd 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentRegistry.java +++ 
b/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentRegistry.java @@ -56,8 +56,6 @@ public class ComponentRegistry { private static Map createBuiltinAliases() { Map aliases = new HashMap<>(); - // HandlerConfig is in tika-pipes-api which can't depend on tika-core for @TikaComponent - aliases.put("handler-config", "org.apache.tika.pipes.api.HandlerConfig"); // EmbeddedDocumentBytesConfig is in tika-pipes-core which can't depend on tika-core for @TikaComponent aliases.put("embedded-document-bytes-config", "org.apache.tika.pipes.core.extractor.EmbeddedDocumentBytesConfig"); diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/ConfigLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/ConfigLoader.java index b57aae89ee9..e262bd64129 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/ConfigLoader.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/ConfigLoader.java @@ -40,10 +40,10 @@ * TikaLoader loader = TikaLoader.load(configPath); * * // Load by explicit key - * HandlerConfig config = loader.configs().load("handler-config", HandlerConfig.class); + * MyConfig config = loader.configs().load("my-config", MyConfig.class); * * // Load by class name (auto-converts to kebab-case) - * HandlerConfig config = loader.configs().load(HandlerConfig.class); + * MyConfig config = loader.configs().load(MyConfig.class); *

* *

JSON configuration example: @@ -57,7 +57,7 @@ * * // Custom configs MUST be in "other-configs" (loaded via configs()) * "other-configs": { - * "handler-config": { + * "my-config": { * "timeout": 5000, * "retries": 3 * }, @@ -93,7 +93,7 @@ public class ConfigLoader { /** * Loads a configuration object using the class name converted to kebab-case. *

- * For example, {@code HandlerConfig.class} will look for key "handler-config". + * For example, {@code MyAppConfig.class} will look for key "my-app-config". * Class name suffixes like "Config", "Configuration", "Settings" are stripped first. *

* For interfaces, the JSON must specify the implementation (see {@link #load(String, Class)}). @@ -213,7 +213,7 @@ public T load(String key, Class clazz, T defaultValue) throws TikaConfigE * *

Example: *

-     * HandlerConfig defaults = new HandlerConfig();
+     * MyConfig defaults = new MyConfig();
      * defaults.setTimeout(30000);
      * defaults.setRetries(2);
      * defaults.setEnabled(false);
@@ -221,9 +221,9 @@ public  T load(String key, Class clazz, T defaultValue) throws TikaConfigE
      * // JSON: { "enabled": true }
      * // Result: timeout=30000, retries=2, enabled=true (merged!)
      * // Note: 'defaults' object remains unchanged
-     * HandlerConfig config = loader.configs().loadWithDefaults("handler-config",
-     *                                                           HandlerConfig.class,
-     *                                                           defaults);
+     * MyConfig config = loader.configs().loadWithDefaults("my-config",
+     *                                                      MyConfig.class,
+     *                                                      defaults);
      * 
* * @param key The JSON key to load from diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java index 52e17d9d133..7371b8f844e 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java @@ -340,9 +340,9 @@ public synchronized Parser loadAutoDetectParser() throws TikaConfigException, IO * *

Usage: *

-     * HandlerConfig config = loader.configs().load("handler-config", HandlerConfig.class);
+     * MyConfig config = loader.configs().load("my-config", MyConfig.class);
      * // Or use kebab-case auto-conversion:
-     * HandlerConfig config = loader.configs().load(HandlerConfig.class);
+     * MyConfig config = loader.configs().load(MyConfig.class);
      * 
* * @return the ConfigLoader instance diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java index 3f71ae67eac..abf39696ce8 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java @@ -45,7 +45,7 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.RecursiveParserWrapper; -import org.apache.tika.pipes.api.HandlerConfig; +import org.apache.tika.pipes.api.ParseMode; import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.RecursiveParserWrapperHandler; import org.apache.tika.server.core.MetadataList; @@ -59,7 +59,7 @@ public class RecursiveMetadataResource { private static final Logger LOG = LoggerFactory.getLogger(RecursiveMetadataResource.class); public static List parseMetadata(TikaInputStream tis, Metadata metadata, MultivaluedMap httpHeaders, - UriInfo info, HandlerConfig handlerConfig) + UriInfo info, ServerHandlerConfig handlerConfig) throws Exception { final ParseContext context = new ParseContext(); @@ -69,10 +69,10 @@ public static List parseMetadata(TikaInputStream tis, Metadata metadat fillMetadata(parser, metadata, httpHeaders); TikaResource.logRequest(LOG, "/rmeta", metadata); - BasicContentHandlerFactory.HANDLER_TYPE type = handlerConfig.getType(); + BasicContentHandlerFactory.HANDLER_TYPE type = handlerConfig.type(); RecursiveParserWrapperHandler handler = - new RecursiveParserWrapperHandler(new BasicContentHandlerFactory(type, handlerConfig.getWriteLimit(), handlerConfig.isThrowOnWriteLimitReached(), context), - handlerConfig.getMaxEmbeddedResources(), TikaResource + new RecursiveParserWrapperHandler(new 
BasicContentHandlerFactory(type, handlerConfig.writeLimit(), handlerConfig.throwOnWriteLimitReached(), context), + handlerConfig.maxEmbeddedResources(), TikaResource .getTikaLoader() .loadMetadataFilters()); try { @@ -92,7 +92,7 @@ public static List parseMetadata(TikaInputStream tis, Metadata metadat return metadataFilter.filter(handler.getMetadataList()); } - static HandlerConfig buildHandlerConfig(MultivaluedMap httpHeaders, String handlerTypeName, HandlerConfig.PARSE_MODE parseMode) { + static ServerHandlerConfig buildHandlerConfig(MultivaluedMap httpHeaders, String handlerTypeName, ParseMode parseMode) { int writeLimit = -1; if (httpHeaders.containsKey("writeLimit")) { writeLimit = Integer.parseInt(httpHeaders.getFirst("writeLimit")); @@ -102,7 +102,7 @@ static HandlerConfig buildHandlerConfig(MultivaluedMap httpHeade if (httpHeaders.containsKey("maxEmbeddedResources")) { maxEmbeddedResources = Integer.parseInt(httpHeaders.getFirst("maxEmbeddedResources")); } - return new HandlerConfig(BasicContentHandlerFactory.parseHandlerType(handlerTypeName, DEFAULT_HANDLER_TYPE), parseMode, writeLimit, maxEmbeddedResources, + return new ServerHandlerConfig(BasicContentHandlerFactory.parseHandlerType(handlerTypeName, DEFAULT_HANDLER_TYPE), parseMode, writeLimit, maxEmbeddedResources, TikaResource.getThrowOnWriteLimitReached(httpHeaders)); } @@ -138,7 +138,7 @@ public Response getMetadataFromMultipart(Attachment att, @Context UriInfo info, try (TikaInputStream tis = TikaInputStream.get(att.getObject(InputStream.class))) { return Response .ok(parseMetadataToMetadataList(tis, new Metadata(), att.getHeaders(), info, - buildHandlerConfig(att.getHeaders(), handlerTypeName, HandlerConfig.PARSE_MODE.RMETA))) + buildHandlerConfig(att.getHeaders(), handlerTypeName, ParseMode.RMETA))) .build(); } } @@ -165,21 +165,21 @@ public Response getMetadataWithConfig( return Response .ok(parseMetadataWithContext(tis, metadata, httpHeaders.getRequestHeaders(), info, - 
buildHandlerConfig(httpHeaders.getRequestHeaders(), handlerTypeName != null ? handlerTypeName.substring(1) : null, HandlerConfig.PARSE_MODE.RMETA), + buildHandlerConfig(httpHeaders.getRequestHeaders(), handlerTypeName != null ? handlerTypeName.substring(1) : null, ParseMode.RMETA), context)) .build(); } } private MetadataList parseMetadataWithContext(TikaInputStream tis, Metadata metadata, MultivaluedMap httpHeaders, - UriInfo info, HandlerConfig handlerConfig, ParseContext context) throws Exception { + UriInfo info, ServerHandlerConfig handlerConfig, ParseContext context) throws Exception { Parser parser = TikaResource.createParser(); RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser); - BasicContentHandlerFactory.HANDLER_TYPE type = handlerConfig.getType(); + BasicContentHandlerFactory.HANDLER_TYPE type = handlerConfig.type(); RecursiveParserWrapperHandler handler = - new RecursiveParserWrapperHandler(new BasicContentHandlerFactory(type, handlerConfig.getWriteLimit(), handlerConfig.isThrowOnWriteLimitReached(), context), - handlerConfig.getMaxEmbeddedResources(), TikaResource + new RecursiveParserWrapperHandler(new BasicContentHandlerFactory(type, handlerConfig.writeLimit(), handlerConfig.throwOnWriteLimitReached(), context), + handlerConfig.maxEmbeddedResources(), TikaResource .getTikaLoader() .loadMetadataFilters()); try { @@ -227,12 +227,12 @@ public Response getMetadata(InputStream is, @Context HttpHeaders httpHeaders, @C try (TikaInputStream tis = TikaResource.getInputStream(is, metadata, httpHeaders, info)) { return Response .ok(parseMetadataToMetadataList(tis, metadata, httpHeaders.getRequestHeaders(), info, - buildHandlerConfig(httpHeaders.getRequestHeaders(), handlerTypeName, HandlerConfig.PARSE_MODE.RMETA))) + buildHandlerConfig(httpHeaders.getRequestHeaders(), handlerTypeName, ParseMode.RMETA))) .build(); } } - private MetadataList parseMetadataToMetadataList(TikaInputStream tis, Metadata metadata, MultivaluedMap httpHeaders, UriInfo 
info, HandlerConfig handlerConfig) + private MetadataList parseMetadataToMetadataList(TikaInputStream tis, Metadata metadata, MultivaluedMap httpHeaders, UriInfo info, ServerHandlerConfig handlerConfig) throws Exception { return new MetadataList(parseMetadata(tis, metadata, httpHeaders, info, handlerConfig)); } diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java index 00489d975f5..6cab25a4f54 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java @@ -17,7 +17,6 @@ package org.apache.tika.server.core.resource; import static java.nio.charset.StandardCharsets.UTF_8; -import static org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig.DEFAULT_HANDLER_CONFIG; import static org.apache.tika.server.core.resource.RecursiveMetadataResource.DEFAULT_HANDLER_TYPE; import static org.apache.tika.server.core.resource.RecursiveMetadataResource.HANDLER_TYPE_PARAM; @@ -341,7 +340,8 @@ public static boolean getThrowOnWriteLimitReached(MultivaluedMap throw new IllegalArgumentException("'throwOnWriteLimitReached' must be either 'true' or 'false'"); } } - return DEFAULT_HANDLER_CONFIG.isThrowOnWriteLimitReached(); + // Default: throw on write limit reached + return true; } public static long getTaskTimeout(ParseContext parseContext) { diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaPipesTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaPipesTest.java index 7bff9149c01..ee899ab180e 100644 --- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaPipesTest.java +++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaPipesTest.java @@ -54,13 
+54,14 @@ import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; -import org.apache.tika.pipes.api.HandlerConfig; +import org.apache.tika.pipes.api.ParseMode; import org.apache.tika.pipes.api.emitter.EmitKey; import org.apache.tika.pipes.api.fetcher.FetchKey; import org.apache.tika.pipes.core.fetcher.FetcherManager; import org.apache.tika.pipes.core.serialization.JsonFetchEmitTuple; import org.apache.tika.plugins.TikaPluginManager; import org.apache.tika.sax.BasicContentHandlerFactory; +import org.apache.tika.sax.ContentHandlerFactory; import org.apache.tika.serialization.JsonMetadataList; import org.apache.tika.server.core.resource.PipesResource; import org.apache.tika.server.core.writer.JSONObjWriter; @@ -203,8 +204,9 @@ public void testPostXML() throws Exception { userMetadata.add("my-key-multi", s); } ParseContext parseContext = new ParseContext(); - HandlerConfig handlerConfig = new HandlerConfig(BasicContentHandlerFactory.HANDLER_TYPE.XML, HandlerConfig.PARSE_MODE.RMETA, -1, -1, true); - parseContext.set(HandlerConfig.class, handlerConfig); + parseContext.set(ContentHandlerFactory.class, + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); + parseContext.set(ParseMode.class, ParseMode.RMETA); FetchEmitTuple t = new FetchEmitTuple("myId", new FetchKey(FETCHER_ID, "hello_world.xml"), new EmitKey(EMITTER_JSON_ID, ""), userMetadata, parseContext, FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT); diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerAsyncIntegrationTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerAsyncIntegrationTest.java index 97e10201ac6..f957ef1d82f 100644 --- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerAsyncIntegrationTest.java +++ 
b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerAsyncIntegrationTest.java @@ -18,7 +18,6 @@ import static java.nio.charset.StandardCharsets.UTF_8; -import static org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig.DEFAULT_HANDLER_CONFIG; import static org.apache.tika.server.core.CXFTestBase.EMITTER_JSON_ID; import static org.apache.tika.server.core.CXFTestBase.FETCHER_ID; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -49,8 +48,10 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; -import org.apache.tika.pipes.api.HandlerConfig; +import org.apache.tika.pipes.api.ParseMode; import org.apache.tika.pipes.api.emitter.EmitKey; +import org.apache.tika.sax.BasicContentHandlerFactory; +import org.apache.tika.sax.ContentHandlerFactory; import org.apache.tika.pipes.api.fetcher.FetchKey; import org.apache.tika.pipes.core.serialization.JsonFetchEmitTupleList; @@ -170,7 +171,9 @@ private JsonNode sendAsync(List fileNames) throws Exception { private FetchEmitTuple getFetchEmitTuple(String fileName) throws IOException { ParseContext parseContext = new ParseContext(); - parseContext.set(HandlerConfig.class, DEFAULT_HANDLER_CONFIG); + parseContext.set(ContentHandlerFactory.class, + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); + parseContext.set(ParseMode.class, ParseMode.RMETA); return new FetchEmitTuple(fileName, new FetchKey(FETCHER_ID, fileName), new EmitKey(EMITTER_JSON_ID, ""), new Metadata(), parseContext, ON_PARSE_EXCEPTION); } diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerPipesIntegrationTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerPipesIntegrationTest.java index 5ba494c44d6..1abb0d03542 100644 --- 
a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerPipesIntegrationTest.java +++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerPipesIntegrationTest.java @@ -18,7 +18,6 @@ import static java.nio.charset.StandardCharsets.UTF_8; -import static org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig.DEFAULT_HANDLER_CONFIG; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -43,10 +42,12 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; -import org.apache.tika.pipes.api.HandlerConfig; +import org.apache.tika.pipes.api.ParseMode; import org.apache.tika.pipes.api.emitter.EmitKey; import org.apache.tika.pipes.api.fetcher.FetchKey; import org.apache.tika.pipes.core.serialization.JsonFetchEmitTuple; +import org.apache.tika.sax.BasicContentHandlerFactory; +import org.apache.tika.sax.ContentHandlerFactory; import org.apache.tika.utils.ProcessUtils; public class TikaServerPipesIntegrationTest extends IntegrationTestBase { @@ -221,7 +222,9 @@ private JsonNode testOneWithPerRequestTimeout(String fileName, long timeoutMilli private String getJsonStringWithTimeout(String fileName, long timeoutMillis) throws IOException { ParseContext parseContext = new ParseContext(); - parseContext.set(HandlerConfig.class, DEFAULT_HANDLER_CONFIG); + parseContext.set(ContentHandlerFactory.class, + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); + parseContext.set(ParseMode.class, ParseMode.RMETA); parseContext.addConfig("tika-task-timeout", "{\"timeoutMillis\":" + timeoutMillis + "}"); FetchEmitTuple t = new FetchEmitTuple(fileName, @@ -259,7 +262,9 @@ private JsonNode testOne(String fileName, boolean shouldFileExist, FetchEmitTupl private String getJsonString(String 
fileName, FetchEmitTuple.ON_PARSE_EXCEPTION onParseException) throws IOException { ParseContext parseContext = new ParseContext(); - parseContext.set(HandlerConfig.class, DEFAULT_HANDLER_CONFIG); + parseContext.set(ContentHandlerFactory.class, + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); + parseContext.set(ParseMode.class, ParseMode.RMETA); FetchEmitTuple t = new FetchEmitTuple(fileName, new FetchKey(CXFTestBase.FETCHER_ID, fileName), new EmitKey(CXFTestBase.EMITTER_JSON_ID, ""), new Metadata(), parseContext, onParseException); return JsonFetchEmitTuple.toJson(t); diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java index 33aae76b859..fc1d44de4b4 100644 --- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java +++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java @@ -426,8 +426,8 @@ public void testEmbeddedResourceLimit() throws Exception { } } - // TIKA-3227 - TODO: re-enable once HandlerConfig is configurable via JSON - // Use maxEmbeddedResources=0 in handler-config to skip embedded documents + // TIKA-3227 - TODO: re-enable once maxEmbeddedResources is configurable via JSON + // Use maxEmbeddedResources=0 in config to skip embedded documents @Test public void testWriteLimit() throws Exception { diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java index ac82d820abe..6f12650bb48 100644 --- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java +++ 
b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java @@ -16,7 +16,6 @@ */ package org.apache.tika.server.standard; -import static org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig.DEFAULT_HANDLER_CONFIG; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -56,12 +55,14 @@ import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; -import org.apache.tika.pipes.api.HandlerConfig; +import org.apache.tika.pipes.api.ParseMode; import org.apache.tika.pipes.api.emitter.EmitKey; import org.apache.tika.pipes.api.fetcher.FetchKey; import org.apache.tika.pipes.core.extractor.EmbeddedDocumentBytesConfig; import org.apache.tika.pipes.core.fetcher.FetcherManager; import org.apache.tika.pipes.core.serialization.JsonFetchEmitTuple; +import org.apache.tika.sax.BasicContentHandlerFactory; +import org.apache.tika.sax.ContentHandlerFactory; import org.apache.tika.plugins.TikaPluginManager; import org.apache.tika.serialization.JsonMetadataList; import org.apache.tika.server.core.CXFTestBase; @@ -247,7 +248,10 @@ public void testBytes() throws Exception { config.setZeroPadName(10); config.setSuffixStrategy(EmbeddedDocumentBytesConfig.SUFFIX_STRATEGY.EXISTING); ParseContext parseContext = new ParseContext(); - parseContext.set(HandlerConfig.class, DEFAULT_HANDLER_CONFIG); + // Set default content handler and parse mode + parseContext.set(ContentHandlerFactory.class, + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); + parseContext.set(ParseMode.class, ParseMode.RMETA); parseContext.set(EmbeddedDocumentBytesConfig.class, config); FetchEmitTuple t = new FetchEmitTuple("myId", new FetchKey(FETCHER_ID, "test_recursive_embedded.docx"), From ce152a56a1de2300f47fc49d4f857a9099357ad3 Mon Sep 17 00:00:00 2001 From: tallison Date: Wed, 17 Dec 2025 
17:05:57 -0500 Subject: [PATCH 2/5] TIKA-4582 -- refactor handler factory --- .../resources/configs/config-template.json | 26 +- .../tika/parser/RecursiveParserWrapper.java | 4 +- .../multiple/AbstractMultipleParser.java | 2 +- ...AbstractRecursiveParserWrapperHandler.java | 4 +- .../tika/sax/BasicContentHandlerFactory.java | 38 +- .../tika/sax/ContentHandlerFactory.java | 2 +- .../sax/StreamingContentHandlerFactory.java | 45 ++ .../sax/BasicContentHandlerFactoryTest.java | 36 +- .../resources/pipes-iterator-template.json | 18 +- .../example/PickBestTextEncodingParser.java | 5 +- .../resources/kafka/plugins-template.json | 26 +- .../opensearch/plugins-template.json | 29 +- .../opensearch/tika-config-opensearch.json | 26 +- .../test/resources/s3/plugins-template.json | 26 +- .../test/resources/solr/plugins-template.json | 26 +- .../apache/tika/parser/pdf/PDFParserTest.java | 2 +- .../src/main/resources/config-template.json | 30 +- .../tika/pipes/core/server/ParseHandler.java | 17 +- .../tika/pipes/core/server/PipesServer.java | 11 +- .../tika/pipes/core/PipesClientTest.java | 48 ++ .../tika/pipes/core/PipesServerTest.java | 3 - .../pipes/core/UppercasingContentHandler.java | 104 +++ .../UppercasingContentHandlerFactory.java | 41 ++ .../resources/configs/tika-config-basic.json | 32 +- .../configs/tika-config-passback.json | 32 +- .../configs/tika-config-truncate.json | 34 +- .../configs/tika-config-uppercasing.json | 52 ++ .../json/TestJsonPipesIterator.java | 30 - .../test-with-embedded-bytes.json | 600 ------------------ .../test/resources/test-documents/test.json | 600 ------------------ .../tika/config/loader/TikaJsonConfig.java | 1 + .../apache/tika/config/loader/TikaLoader.java | 44 ++ .../tika/config/loader/ConfigLoaderTest.java | 92 +-- .../resources/configs/test-config-loader.json | 2 +- .../configs/test-partial-config.json | 4 +- .../configs/test-unexpected-field.json | 2 +- .../resource/RecursiveMetadataResource.java | 24 +- 
.../core/resource/ServerHandlerConfig.java | 34 + .../server/core/resource/TikaResource.java | 12 +- .../core/TikaServerAsyncIntegrationTest.java | 4 +- .../tika/server/standard/TikaPipesTest.java | 7 +- 41 files changed, 661 insertions(+), 1514 deletions(-) create mode 100644 tika-core/src/main/java/org/apache/tika/sax/StreamingContentHandlerFactory.java create mode 100644 tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/UppercasingContentHandler.java create mode 100644 tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/UppercasingContentHandlerFactory.java create mode 100644 tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-uppercasing.json create mode 100644 tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/ServerHandlerConfig.java diff --git a/tika-app/src/test/resources/configs/config-template.json b/tika-app/src/test/resources/configs/config-template.json index dc73dadfe1f..e25bc96833a 100644 --- a/tika-app/src/test/resources/configs/config-template.json +++ b/tika-app/src/test/resources/configs/config-template.json @@ -1,4 +1,12 @@ { + "content-handler-factory": { + "basic-content-handler-factory": { + "type": "TEXT", + "writeLimit": -1, + "maxEmbeddedResources": -1, + "throwOnWriteLimitReached": true + } + }, "fetchers": { "fsf": { "file-system-fetcher": { @@ -21,23 +29,13 @@ "file-system-pipes-iterator": { "basePath": "FETCHER_BASE_PATH", "countTotal": true, - "baseConfig": { - "fetcherId": "fsf", - "emitterId": "fse", - "handlerConfig": { - "type": "TEXT", - "parseMode": "RMETA", - "writeLimit": -1, - "maxEmbeddedResources": -1, - "throwOnWriteLimitReached": true - }, - "onParseException": "EMIT", - "maxWaitMs": 600000, - "queueSize": 10000 - } + "fetcherId": "fsf", + "emitterId": "fse" } }, "pipes": { + "parseMode": "RMETA", + "onParseException": "EMIT", "emitWithinMillis": 10000, "emitMaxEstimatedBytes": 100000, "queueSize": 10000, 
diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java index 0c0599ec765..07159dba01b 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java +++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java @@ -141,7 +141,7 @@ public void parse(TikaInputStream tis, ContentHandler recursiveParserWrapperHand new EmbeddedParserDecorator(getWrappedParser(), "/", "/", parserState); context.set(Parser.class, decorator); ContentHandler localHandler = - parserState.recursiveParserWrapperHandler.getNewContentHandler(); + parserState.recursiveParserWrapperHandler.createHandler(); long started = System.currentTimeMillis(); parserState.recursiveParserWrapperHandler.startDocument(); int writeLimit = -1; @@ -241,7 +241,7 @@ public void parse(TikaInputStream tis, ContentHandler ignore, Metadata metadata, metadata.set(TikaCoreProperties.EMBEDDED_ID, parserState.embeddedCount); //get a fresh handler ContentHandler localHandler = - parserState.recursiveParserWrapperHandler.getNewContentHandler(); + parserState.recursiveParserWrapperHandler.createHandler(); parserState.recursiveParserWrapperHandler.startEmbeddedDocument(localHandler, metadata); Parser preContextParser = context.get(Parser.class); diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java index 568d61c2570..cc78a55be3e 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java @@ -260,7 +260,7 @@ private void parse(TikaInputStream tis, ContentHandler handler, // If not, the user will get text from every parser // mushed together onto the one solitary handler... 
if (handlerFactory != null) { - handler = handlerFactory.getNewContentHandler(); + handler = handlerFactory.createHandler(); } // Record that we used this parser diff --git a/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java b/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java index d607c28142e..ea4efedf6b2 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java @@ -53,8 +53,8 @@ public AbstractRecursiveParserWrapperHandler(ContentHandlerFactory contentHandle this.maxEmbeddedResources = maxEmbeddedResources; } - public ContentHandler getNewContentHandler() { - return contentHandlerFactory.getNewContentHandler(); + public ContentHandler createHandler() { + return contentHandlerFactory.createHandler(); } /** diff --git a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java index 2f4ca1d35e9..2612ec8650b 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java +++ b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java @@ -26,6 +26,7 @@ import org.xml.sax.ContentHandler; import org.xml.sax.helpers.DefaultHandler; +import org.apache.tika.config.TikaComponent; import org.apache.tika.parser.ParseContext; /** @@ -34,6 +35,7 @@ * Implements {@link StreamingContentHandlerFactory} to support both in-memory * content extraction and streaming output to an OutputStream. 
*/ +@TikaComponent(contextKey = ContentHandlerFactory.class) public class BasicContentHandlerFactory implements StreamingContentHandlerFactory, WriteLimiter { private HANDLER_TYPE type = HANDLER_TYPE.TEXT; @@ -139,7 +141,7 @@ public static HANDLER_TYPE parseHandlerType(String handlerTypeName, HANDLER_TYPE } @Override - public ContentHandler getNewContentHandler() { + public ContentHandler createHandler() { if (type == HANDLER_TYPE.BODY) { return new BodyContentHandler( @@ -170,7 +172,7 @@ private ContentHandler getFormatHandler() { } @Override - public ContentHandler getNewContentHandler(OutputStream os, Charset charset) { + public ContentHandler createHandler(OutputStream os, Charset charset) { if (type == HANDLER_TYPE.IGNORE) { return new DefaultHandler(); @@ -230,14 +232,6 @@ public void setType(HANDLER_TYPE type) { this.type = type; } - /** - * Sets the handler type from a string. - * @param type the handler type name (text, html, xml, body, ignore) - */ - public void setType(String type) { - this.type = parseHandlerType(type, HANDLER_TYPE.TEXT); - } - /** * Common handler types for content. */ @@ -294,4 +288,28 @@ public void setMaxEmbeddedResources(int maxEmbeddedResources) { public void setParseContext(ParseContext parseContext) { this.parseContext = parseContext; } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + BasicContentHandlerFactory that = (BasicContentHandlerFactory) o; + return writeLimit == that.writeLimit && + throwOnWriteLimitReached == that.throwOnWriteLimitReached && + maxEmbeddedResources == that.maxEmbeddedResources && + type == that.type; + } + + @Override + public int hashCode() { + int result = type != null ? type.hashCode() : 0; + result = 31 * result + writeLimit; + result = 31 * result + (throwOnWriteLimitReached ? 
1 : 0); + result = 31 * result + maxEmbeddedResources; + return result; + } } diff --git a/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java index 1022e3f4ca3..4c7efd7231f 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java +++ b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java @@ -38,5 +38,5 @@ public interface ContentHandlerFactory extends Serializable { * * @return a new ContentHandler instance */ - ContentHandler getNewContentHandler(); + ContentHandler createHandler(); } diff --git a/tika-core/src/main/java/org/apache/tika/sax/StreamingContentHandlerFactory.java b/tika-core/src/main/java/org/apache/tika/sax/StreamingContentHandlerFactory.java new file mode 100644 index 00000000000..02279c16972 --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/sax/StreamingContentHandlerFactory.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.sax; + +import java.io.OutputStream; +import java.nio.charset.Charset; + +import org.xml.sax.ContentHandler; + +/** + * Extended factory interface for creating ContentHandler instances that write + * directly to an OutputStream. + *

+ * This interface extends {@link ContentHandlerFactory} to add streaming output + * capability, primarily used by tika-server's /tika endpoint for streaming + * responses back to clients. + * + * @see ContentHandlerFactory + * @see BasicContentHandlerFactory + */ +public interface StreamingContentHandlerFactory extends ContentHandlerFactory { + + /** + * Creates a new ContentHandler that writes output directly to the given OutputStream. + * + * @param os the output stream to write to + * @param charset the character encoding to use + * @return a new ContentHandler instance that writes to the stream + */ + ContentHandler createHandler(OutputStream os, Charset charset); +} diff --git a/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java b/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java index 8a177c12ed4..bc6260d0a4a 100644 --- a/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java +++ b/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java @@ -73,7 +73,7 @@ public void testIgnore() throws Exception { Parser p = new MockParser(OVER_DEFAULT); ContentHandler handler = new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1) - .getNewContentHandler(); + .createHandler(); assertTrue(handler instanceof DefaultHandler); p.parse(null, handler, null, null); //unfortunatley, the DefaultHandler does not return "", @@ -82,7 +82,7 @@ public void testIgnore() throws Exception { //tests that no write limit exception is thrown p = new MockParser(100); handler = new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, 5) - .getNewContentHandler(); + .createHandler(); assertTrue(handler instanceof DefaultHandler); p.parse(null, handler, null, null); assertContains("org.xml.sax.helpers.DefaultHandler", handler.toString()); @@ -92,7 +92,7 @@ public void testIgnore() throws Exception { public void testText() throws Exception { 
Parser p = new MockParser(OVER_DEFAULT); BasicContentHandlerFactory.HANDLER_TYPE type = BasicContentHandlerFactory.HANDLER_TYPE.TEXT; - ContentHandler handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(); + ContentHandler handler = new BasicContentHandlerFactory(type, -1).createHandler(); assertTrue(handler instanceof ToTextContentHandler); p.parse(null, handler, null, null); @@ -104,7 +104,7 @@ public void testText() throws Exception { assertTrue(extracted.length() > 110000); //now test write limit p = new MockParser(10); - handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(); + handler = new BasicContentHandlerFactory(type, 5).createHandler(); assertTrue(handler instanceof WriteOutContentHandler); assertWriteLimitReached(p, (WriteOutContentHandler) handler); extracted = handler.toString(); @@ -114,7 +114,7 @@ public void testText() throws Exception { //now test outputstream call p = new MockParser(OVER_DEFAULT); ByteArrayOutputStream os = new ByteArrayOutputStream(); - handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, UTF_8); + handler = new BasicContentHandlerFactory(type, -1).createHandler(os, UTF_8); assertTrue(handler instanceof ToTextContentHandler); p.parse(null, handler, null, null); assertContains("This is the title", os.toByteArray()); @@ -125,7 +125,7 @@ public void testText() throws Exception { p = new MockParser(10); os = new ByteArrayOutputStream(); - handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(os, UTF_8); + handler = new BasicContentHandlerFactory(type, 5).createHandler(os, UTF_8); assertTrue(handler instanceof WriteOutContentHandler); assertWriteLimitReached(p, (WriteOutContentHandler) handler); //When writing to an OutputStream and a write limit is reached, @@ -137,7 +137,7 @@ public void testText() throws Exception { public void testHTML() throws Exception { Parser p = new MockParser(OVER_DEFAULT); BasicContentHandlerFactory.HANDLER_TYPE type = 
BasicContentHandlerFactory.HANDLER_TYPE.HTML; - ContentHandler handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(); + ContentHandler handler = new BasicContentHandlerFactory(type, -1).createHandler(); assertTrue(handler instanceof ToHTMLContentHandler); p.parse(null, handler, null, null); @@ -148,7 +148,7 @@ public void testHTML() throws Exception { //now test write limit p = new MockParser(10); - handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(); + handler = new BasicContentHandlerFactory(type, 5).createHandler(); assertTrue(handler instanceof WriteOutContentHandler); assertWriteLimitReached(p, (WriteOutContentHandler) handler); extracted = handler.toString(); @@ -158,7 +158,7 @@ public void testHTML() throws Exception { //now test outputstream call p = new MockParser(OVER_DEFAULT); ByteArrayOutputStream os = new ByteArrayOutputStream(); - handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, UTF_8); + handler = new BasicContentHandlerFactory(type, -1).createHandler(os, UTF_8); assertTrue(handler instanceof ToHTMLContentHandler); p.parse(null, handler, null, null); assertContains("This is the title", os.toByteArray()); @@ -170,7 +170,7 @@ public void testHTML() throws Exception { p = new MockParser(10); os = new ByteArrayOutputStream(); - handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(os, UTF_8); + handler = new BasicContentHandlerFactory(type, 5).createHandler(os, UTF_8); assertTrue(handler instanceof WriteOutContentHandler); assertWriteLimitReached(p, (WriteOutContentHandler) handler); assertEquals(0, os.toByteArray().length); @@ -180,7 +180,7 @@ public void testHTML() throws Exception { public void testXML() throws Exception { Parser p = new MockParser(OVER_DEFAULT); BasicContentHandlerFactory.HANDLER_TYPE type = BasicContentHandlerFactory.HANDLER_TYPE.HTML; - ContentHandler handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(); + ContentHandler 
handler = new BasicContentHandlerFactory(type, -1).createHandler(); assertTrue(handler instanceof ToXMLContentHandler); p.parse(null, handler, new Metadata(), null); @@ -191,7 +191,7 @@ public void testXML() throws Exception { //now test write limit p = new MockParser(10); - handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(); + handler = new BasicContentHandlerFactory(type, 5).createHandler(); assertTrue(handler instanceof WriteOutContentHandler); assertWriteLimitReached(p, (WriteOutContentHandler) handler); extracted = handler.toString(); @@ -201,7 +201,7 @@ public void testXML() throws Exception { //now test outputstream call p = new MockParser(OVER_DEFAULT); ByteArrayOutputStream os = new ByteArrayOutputStream(); - handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, UTF_8); + handler = new BasicContentHandlerFactory(type, -1).createHandler(os, UTF_8); assertTrue(handler instanceof ToXMLContentHandler); p.parse(null, handler, null, null); @@ -214,7 +214,7 @@ public void testXML() throws Exception { p = new MockParser(10); os = new ByteArrayOutputStream(); - handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(os, UTF_8); + handler = new BasicContentHandlerFactory(type, 5).createHandler(os, UTF_8); assertTrue(handler instanceof WriteOutContentHandler); assertWriteLimitReached(p, (WriteOutContentHandler) handler); assertEquals(0, os.toByteArray().length); @@ -224,7 +224,7 @@ public void testXML() throws Exception { public void testBody() throws Exception { Parser p = new MockParser(OVER_DEFAULT); BasicContentHandlerFactory.HANDLER_TYPE type = BasicContentHandlerFactory.HANDLER_TYPE.BODY; - ContentHandler handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(); + ContentHandler handler = new BasicContentHandlerFactory(type, -1).createHandler(); assertTrue(handler instanceof BodyContentHandler); @@ -236,7 +236,7 @@ public void testBody() throws Exception { //now test write limit p = new 
MockParser(10); - handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(); + handler = new BasicContentHandlerFactory(type, 5).createHandler(); assertTrue(handler instanceof BodyContentHandler); assertWriteLimitReached(p, (BodyContentHandler) handler); extracted = handler.toString(); @@ -246,7 +246,7 @@ public void testBody() throws Exception { //now test outputstream call p = new MockParser(OVER_DEFAULT); ByteArrayOutputStream os = new ByteArrayOutputStream(); - handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, UTF_8); + handler = new BasicContentHandlerFactory(type, -1).createHandler(os, UTF_8); assertTrue(handler instanceof BodyContentHandler); p.parse(null, handler, null, null); assertNotContains("title", os.toByteArray()); @@ -257,7 +257,7 @@ public void testBody() throws Exception { p = new MockParser(10); os = new ByteArrayOutputStream(); - handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(os, UTF_8); + handler = new BasicContentHandlerFactory(type, 5).createHandler(os, UTF_8); assertTrue(handler instanceof WriteOutContentHandler); assertWriteLimitReached(p, (WriteOutContentHandler) handler); assertEquals(0, os.toByteArray().length); diff --git a/tika-eval/tika-eval-app/src/main/resources/pipes-iterator-template.json b/tika-eval/tika-eval-app/src/main/resources/pipes-iterator-template.json index a5a7ddfad37..4ae623d6065 100644 --- a/tika-eval/tika-eval-app/src/main/resources/pipes-iterator-template.json +++ b/tika-eval/tika-eval-app/src/main/resources/pipes-iterator-template.json @@ -1,18 +1,6 @@ { "basePath": "FETCHER_BASE_PATH", "countTotal": true, - "baseConfig": { - "fetcherId": "fsf", - "emitterId": "", - "handlerConfig": { - "type": "TEXT", - "parseMode": "RMETA", - "writeLimit": -1, - "maxEmbeddedResources": -1, - "throwOnWriteLimitReached": true - }, - "onParseException": "EMIT", - "maxWaitMs": 600000, - "queueSize": 10000 - } -} \ No newline at end of file + "fetcherId": "fsf", + 
"emitterId": "" +} diff --git a/tika-example/src/main/java/org/apache/tika/example/PickBestTextEncodingParser.java b/tika-example/src/main/java/org/apache/tika/example/PickBestTextEncodingParser.java index 4afc9122903..c42a562c898 100644 --- a/tika-example/src/main/java/org/apache/tika/example/PickBestTextEncodingParser.java +++ b/tika-example/src/main/java/org/apache/tika/example/PickBestTextEncodingParser.java @@ -17,7 +17,6 @@ package org.apache.tika.example; import java.io.IOException; -import java.io.OutputStream; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.HashMap; @@ -147,7 +146,7 @@ public void parse(TikaInputStream tis, ContentHandler handler, Metadata original public void parse(TikaInputStream tis, ContentHandlerFactory handlers, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { // We only work with one ContentHandler as far as the user is // concerned, any others are purely internal! - parse(tis, handlers.getNewContentHandler(), metadata, context); + parse(tis, handlers.createHandler(), metadata, context); } protected class CharsetContentHandlerFactory implements ContentHandlerFactory { @@ -157,7 +156,7 @@ protected class CharsetContentHandlerFactory implements ContentHandlerFactory { private ContentHandler handler; @Override - public ContentHandler getNewContentHandler() { + public ContentHandler createHandler() { index++; if (index < charsetsToTry.length) { return new BodyContentHandler(); diff --git a/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/kafka/plugins-template.json b/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/kafka/plugins-template.json index 7dc28288517..128a1a8b441 100644 --- a/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/kafka/plugins-template.json +++ b/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/kafka/plugins-template.json @@ -1,4 +1,12 
@@ { + "content-handler-factory": { + "basic-content-handler-factory": { + "type": "TEXT", + "writeLimit": -1, + "maxEmbeddedResources": -1, + "throwOnWriteLimitReached": true + } + }, "parsers": [ { "default-parser": {} @@ -77,23 +85,13 @@ "groupId": "grpid", "autoOffsetReset": "earliest", "pollDelayMs": 1000, - "baseConfig": { - "fetcherId": "fsf", - "emitterId": "ke", - "handlerConfig": { - "type": "TEXT", - "parseMode": "PARSE_MODE", - "writeLimit": -1, - "maxEmbeddedResources": -1, - "throwOnWriteLimitReached": true - }, - "onParseException": "EMIT", - "maxWaitMs": 600000, - "queueSize": 10000 - } + "fetcherId": "fsf", + "emitterId": "ke" } }, "pipes": { + "parseMode": "PARSE_MODE", + "onParseException": "EMIT", "emitMaxEstimatedBytes": 100000, "emitWithinMillis": 10, "numEmitters": 1, diff --git a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/plugins-template.json b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/plugins-template.json index 16e2a4fc968..2b4f98f92e3 100644 --- a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/plugins-template.json +++ b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/plugins-template.json @@ -1,4 +1,12 @@ { + "content-handler-factory": { + "basic-content-handler-factory": { + "type": "TEXT", + "writeLimit": -1, + "maxEmbeddedResources": -1, + "throwOnWriteLimitReached": true + } + }, "fetchers": { "fsf": { "file-system-fetcher": { @@ -29,20 +37,8 @@ "file-system-pipes-iterator": { "basePath": "FETCHER_BASE_PATH", "countTotal": true, - "baseConfig": { - "fetcherId": "fsf", - "emitterId": "ose", - "handlerConfig": { - "type": "TEXT", - "parseMode": "PARSE_MODE", - "writeLimit": -1, - "maxEmbeddedResources": -1, - "throwOnWriteLimitReached": true - }, - "onParseException": "EMIT", - "maxWaitMs": 600000, - "queueSize": 10000 - } + "fetcherId": "fsf", + 
"emitterId": "ose" } }, "pipes-reporters": { @@ -60,6 +56,8 @@ } }, "pipes": { + "parseMode": "PARSE_MODE", + "onParseException": "EMIT", "emitStrategy": { "type": "DYNAMIC", "thresholdBytes": 10000 @@ -93,6 +91,5 @@ } } ], - "plugin-roots": "target/plugins" -} \ No newline at end of file +} diff --git a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/tika-config-opensearch.json b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/tika-config-opensearch.json index a6a0c512679..172a0c1c0ec 100644 --- a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/tika-config-opensearch.json +++ b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/tika-config-opensearch.json @@ -1,4 +1,12 @@ { + "content-handler-factory": { + "basic-content-handler-factory": { + "type": "TEXT", + "writeLimit": -1, + "maxEmbeddedResources": -1, + "throwOnWriteLimitReached": true + } + }, "parsers": [ { "default-parser": {} @@ -70,20 +78,8 @@ "file-system-pipes-iterator": { "basePath": "FETCHER_BASE_PATH", "countTotal": true, - "baseConfig": { - "fetcherId": "fsf", - "emitterId": "ose", - "handlerConfig": { - "type": "TEXT", - "parseMode": "PARSE_MODE", - "writeLimit": -1, - "maxEmbeddedResources": -1, - "throwOnWriteLimitReached": true - }, - "onParseException": "EMIT", - "maxWaitMs": 600000, - "queueSize": 10000 - } + "fetcherId": "fsf", + "emitterId": "ose" } }, "pipes-reporters": { @@ -101,6 +97,8 @@ } }, "pipes": { + "parseMode": "PARSE_MODE", + "onParseException": "EMIT", "emitStrategy": { "type": "DYNAMIC", "thresholdBytes": 10000 diff --git a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/s3/plugins-template.json b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/s3/plugins-template.json index 1efc929ce35..816d5c49e58 100644 --- 
a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/s3/plugins-template.json +++ b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/s3/plugins-template.json @@ -1,4 +1,12 @@ { + "content-handler-factory": { + "basic-content-handler-factory": { + "type": "TEXT", + "writeLimit": -1, + "maxEmbeddedResources": -1, + "throwOnWriteLimitReached": true + } + }, "fetchers": { "s3f": { "s3-fetcher": { @@ -44,23 +52,13 @@ "secretKey": "SECRET_KEY", "endpointConfigurationService": "ENDPOINT_CONFIGURATION_SERVICE", "pathStyleAccessEnabled": true, - "baseConfig": { - "fetcherId": "s3f", - "emitterId": "s3e", - "handlerConfig": { - "type": "TEXT", - "parseMode": "PARSE_MODE", - "writeLimit": -1, - "maxEmbeddedResources": -1, - "throwOnWriteLimitReached": true - }, - "onParseException": "EMIT", - "maxWaitMs": 600000, - "queueSize": 10000 - } + "fetcherId": "s3f", + "emitterId": "s3e" } }, "pipes": { + "parseMode": "PARSE_MODE", + "onParseException": "EMIT", "emitMaxEstimatedBytes": 100000, "emitWithinMillis": 10, "numEmitters": 1, diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/solr/plugins-template.json b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/solr/plugins-template.json index 366be952746..63cf5d73b50 100644 --- a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/solr/plugins-template.json +++ b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/solr/plugins-template.json @@ -1,4 +1,12 @@ { + "content-handler-factory": { + "basic-content-handler-factory": { + "type": "TEXT", + "writeLimit": -1, + "maxEmbeddedResources": -1, + "throwOnWriteLimitReached": true + } + }, "parsers": [ { "default-parser": {} @@ -74,23 +82,13 @@ "rows": 100, "connectionTimeout": 10000, "socketTimeout": 60000, - "baseConfig": { - "fetcherId": "fsf", - "emitterId": "se", - "handlerConfig": { - "type": "TEXT", - "parseMode": 
"PARSE_MODE", - "writeLimit": -1, - "maxEmbeddedResources": -1, - "throwOnWriteLimitReached": true - }, - "onParseException": "EMIT", - "maxWaitMs": 600000, - "queueSize": 10000 - } + "fetcherId": "fsf", + "emitterId": "se" } }, "pipes": { + "parseMode": "PARSE_MODE", + "onParseException": "EMIT", "emitStrategy": { "type": "DYNAMIC", "thresholdBytes": 10000 diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index ffaa2cac246..195da525caa 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -1518,7 +1518,7 @@ private Metadata testWriteLimit(String fileName, int limit) throws Exception { BasicContentHandlerFactory factory = new BasicContentHandlerFactory( BasicContentHandlerFactory.HANDLER_TYPE.TEXT, limit ); - ContentHandler contentHandler = factory.getNewContentHandler(); + ContentHandler contentHandler = factory.createHandler(); Metadata metadata = new Metadata(); ParseContext parseContext = new ParseContext(); try (TikaInputStream tis = getResourceAsStream("/test-documents/" + fileName)) { diff --git a/tika-pipes/tika-async-cli/src/main/resources/config-template.json b/tika-pipes/tika-async-cli/src/main/resources/config-template.json index e295290dd4b..d4c70d5d731 100644 --- a/tika-pipes/tika-async-cli/src/main/resources/config-template.json +++ b/tika-pipes/tika-async-cli/src/main/resources/config-template.json @@ -1,4 +1,12 @@ { + "content-handler-factory": { + "basic-content-handler-factory": { + "type": "TEXT", + "writeLimit": -1, + 
"maxEmbeddedResources": -1, + "throwOnWriteLimitReached": true + } + }, "parsers": [ { "default-parser": {} @@ -45,21 +53,15 @@ "file-system-pipes-iterator": { "basePath": "FETCHER_BASE_PATH", "countTotal": true, - "baseConfig": { - "fetcherId": "fsf", - "emitterId": "fse", - "handlerConfig": { - "type": "TEXT", - "parseMode": "RMETA", - "writeLimit": -1, - "maxEmbeddedResources": -1, - "throwOnWriteLimitReached": true - }, - "onParseException": "EMIT", - "maxWaitMs": 600000, - "queueSize": 10000 - } + "fetcherId": "fsf", + "emitterId": "fse", + "onParseException": "EMIT", + "maxWaitMs": 600000, + "queueSize": 10000 } }, + "pipes": { + "parseMode": "RMETA" + }, "plugin-roots": "PLUGIN_ROOTS" } diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java index 5ad637bd13f..f14bb3f17b9 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java @@ -61,17 +61,22 @@ class ParseHandler { private final CountDownLatch countDownLatch; private final AutoDetectParser autoDetectParser; private final RecursiveParserWrapper recursiveParserWrapper; + private final ContentHandlerFactory defaultContentHandlerFactory; + private final ParseMode defaultParseMode; ParseHandler(Detector detector, Digester digester, ArrayBlockingQueue intermediateResult, CountDownLatch countDownLatch, AutoDetectParser autoDetectParser, - RecursiveParserWrapper recursiveParserWrapper) { + RecursiveParserWrapper recursiveParserWrapper, ContentHandlerFactory defaultContentHandlerFactory, + ParseMode defaultParseMode) { this.detector = detector; this.digester = digester; this.intermediateResult = intermediateResult; this.countDownLatch = countDownLatch; this.autoDetectParser = autoDetectParser; this.recursiveParserWrapper = 
recursiveParserWrapper; + this.defaultContentHandlerFactory = defaultContentHandlerFactory; + this.defaultParseMode = defaultParseMode; } PipesWorker.ParseDataOrPipesResult parseWithStream(FetchEmitTuple fetchEmitTuple, TikaInputStream stream, Metadata metadata, ParseContext parseContext) @@ -98,8 +103,8 @@ private ParseMode getParseMode(ParseContext parseContext) { if (mode != null) { return mode; } - // Default to RMETA mode - return ParseMode.RMETA; + // Fall back to default loaded from TikaLoader + return defaultParseMode; } private ContentHandlerFactory getContentHandlerFactory(ParseContext parseContext) { @@ -107,8 +112,8 @@ private ContentHandlerFactory getContentHandlerFactory(ParseContext parseContext if (factory != null) { return factory; } - // Default to BasicContentHandlerFactory with TEXT handler, unlimited write - return new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1); + // Fall back to default loaded from TikaLoader + return defaultContentHandlerFactory; } @@ -192,7 +197,7 @@ public List parseConcatenated(FetchEmitTuple fetchEmitTuple, ContentHandlerFactory contentHandlerFactory, TikaInputStream stream, Metadata metadata, ParseContext parseContext) { - ContentHandler handler = contentHandlerFactory.getNewContentHandler(); + ContentHandler handler = contentHandlerFactory.createHandler(); int maxEmbedded = -1; if (contentHandlerFactory instanceof BasicContentHandlerFactory) { maxEmbedded = ((BasicContentHandlerFactory) contentHandlerFactory).getMaxEmbeddedResources(); diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java index 94c66477ef7..62a3955408f 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java @@ -73,6 +73,7 @@ import 
org.apache.tika.pipes.core.emitter.EmitterManager; import org.apache.tika.pipes.core.fetcher.FetcherManager; import org.apache.tika.plugins.TikaPluginManager; +import org.apache.tika.sax.ContentHandlerFactory; import org.apache.tika.serialization.ParseContextUtils; import org.apache.tika.utils.ExceptionUtils; @@ -146,6 +147,7 @@ public byte getByte() { private final PipesConfig pipesConfig; private final Socket socket; private final MetadataFilter defaultMetadataFilter; + private final ContentHandlerFactory defaultContentHandlerFactory; private AutoDetectParser autoDetectParser; private RecursiveParserWrapper rMetaParser; private FetcherManager fetcherManager; @@ -171,7 +173,8 @@ public static PipesServer load(int port, Path tikaConfigPath) throws Exception { socket.setSoTimeout((int) pipesConfig.getSocketTimeoutMs()); MetadataFilter metadataFilter = tikaLoader.loadMetadataFilters(); - PipesServer pipesServer = new PipesServer(pipesClientId, tikaLoader, pipesConfig, socket, dis, dos, metadataFilter); + ContentHandlerFactory contentHandlerFactory = tikaLoader.loadContentHandlerFactory(); + PipesServer pipesServer = new PipesServer(pipesClientId, tikaLoader, pipesConfig, socket, dis, dos, metadataFilter, contentHandlerFactory); pipesServer.initializeResources(); LOG.debug("pipesClientId={}: PipesServer loaded and ready", pipesClientId); return pipesServer; @@ -204,7 +207,7 @@ public static PipesServer load(int port, Path tikaConfigPath) throws Exception { } public PipesServer(String pipesClientId, TikaLoader tikaLoader, PipesConfig pipesConfig, Socket socket, DataInputStream in, - DataOutputStream out, MetadataFilter metadataFilter) throws TikaConfigException, + DataOutputStream out, MetadataFilter metadataFilter, ContentHandlerFactory contentHandlerFactory) throws TikaConfigException, IOException { this.pipesClientId = pipesClientId; @@ -212,6 +215,7 @@ public PipesServer(String pipesClientId, TikaLoader tikaLoader, PipesConfig pipe this.pipesConfig = pipesConfig; 
this.socket = socket; this.defaultMetadataFilter = metadataFilter; + this.defaultContentHandlerFactory = contentHandlerFactory; this.input = new DataInputStream(in); this.output = new DataOutputStream(out); this.heartbeatIntervalMs = pipesConfig.getHeartbeatIntervalMs(); @@ -328,7 +332,8 @@ public void mainLoop() { private PipesWorker getPipesWorker(ArrayBlockingQueue intermediateResult, FetchEmitTuple fetchEmitTuple, CountDownLatch countDownLatch) { FetchHandler fetchHandler = new FetchHandler(fetcherManager); - ParseHandler parseHandler = new ParseHandler(detector, digester, intermediateResult, countDownLatch, autoDetectParser, rMetaParser); + ParseHandler parseHandler = new ParseHandler(detector, digester, intermediateResult, countDownLatch, autoDetectParser, + rMetaParser, defaultContentHandlerFactory, pipesConfig.getParseMode()); Long thresholdBytes = pipesConfig.getEmitStrategy().getThresholdBytes(); long threshold = (thresholdBytes != null) ? thresholdBytes : EmitStrategyConfig.DEFAULT_DIRECT_EMIT_THRESHOLD_BYTES; EmitHandler emitHandler = new EmitHandler(defaultMetadataFilter, emitStrategy, emitterManager, threshold); diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java index 9fcd2dd0634..69f51be3747 100644 --- a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java +++ b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java @@ -622,6 +622,54 @@ public void testEmitterNotFound(@TempDir Path tmp) throws Exception { } } + @Test + public void testCustomContentHandlerFactory(@TempDir Path tmp) throws Exception { + // Test that a custom ContentHandlerFactory configured in tika-config.json + // is properly used during parsing. 
The UppercasingContentHandlerFactory + // converts all extracted text to uppercase. + Path inputDir = tmp.resolve("input"); + Files.createDirectories(inputDir); + + // Create a simple mock XML file with known content + String mockContent = "" + "" + + "Test Author" + + "Hello World from Tika" + + ""; + String testFile = "test-uppercase.xml"; + Files.write(inputDir.resolve(testFile), mockContent.getBytes(StandardCharsets.UTF_8)); + + // Use the uppercasing config + Path tikaConfigPath = PluginsTestHelper.getFileSystemFetcherConfig( + "tika-config-uppercasing.json", tmp, inputDir, tmp.resolve("output"), false); + TikaJsonConfig tikaJsonConfig = TikaJsonConfig.load(tikaConfigPath); + PipesConfig pipesConfig = PipesConfig.load(tikaJsonConfig); + + try (PipesClient pipesClient = new PipesClient(pipesConfig, tikaConfigPath)) { + FetchEmitTuple tuple = new FetchEmitTuple(testFile, + new FetchKey(fetcherName, testFile), + new EmitKey(), new Metadata(), new ParseContext(), + FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP); + + PipesResult pipesResult = pipesClient.process(tuple); + + // Should succeed + assertTrue(pipesResult.isSuccess(), + "Processing should succeed. Got status: " + pipesResult.status() + + ", message: " + pipesResult.message()); + + Assertions.assertNotNull(pipesResult.emitData().getMetadataList()); + assertEquals(1, pipesResult.emitData().getMetadataList().size()); + + Metadata metadata = pipesResult.emitData().getMetadataList().get(0); + + // The content should be uppercased due to UppercasingContentHandlerFactory + String content = metadata.get(TikaCoreProperties.TIKA_CONTENT); + Assertions.assertNotNull(content, "Content should not be null"); + assertTrue(content.contains("HELLO WORLD FROM TIKA"), + "Content should be uppercased. 
Actual content: " + content); + } + } + @Test public void testHeartbeatProtocol(@TempDir Path tmp) throws Exception { // Test that heartbeat protocol works correctly and doesn't cause protocol errors diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesServerTest.java b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesServerTest.java index 7c137084c79..621822fd236 100644 --- a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesServerTest.java +++ b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesServerTest.java @@ -16,9 +16,6 @@ */ package org.apache.tika.pipes.core; - - - import org.apache.tika.TikaTest; public class PipesServerTest extends TikaTest { diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/UppercasingContentHandler.java b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/UppercasingContentHandler.java new file mode 100644 index 00000000000..f59b9d0c59a --- /dev/null +++ b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/UppercasingContentHandler.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.core; + +import java.util.Locale; + +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.Locator; +import org.xml.sax.SAXException; + +/** + * A ContentHandler decorator that converts all character content to uppercase. + * Used for testing custom ContentHandlerFactory configurations. + */ +public class UppercasingContentHandler implements ContentHandler { + + private final ContentHandler delegate; + + public UppercasingContentHandler(ContentHandler delegate) { + this.delegate = delegate; + } + + @Override + public void setDocumentLocator(Locator locator) { + delegate.setDocumentLocator(locator); + } + + @Override + public void startDocument() throws SAXException { + delegate.startDocument(); + } + + @Override + public void endDocument() throws SAXException { + delegate.endDocument(); + } + + @Override + public void startPrefixMapping(String prefix, String uri) throws SAXException { + delegate.startPrefixMapping(prefix, uri); + } + + @Override + public void endPrefixMapping(String prefix) throws SAXException { + delegate.endPrefixMapping(prefix); + } + + @Override + public void startElement(String uri, String localName, String qName, Attributes atts) + throws SAXException { + delegate.startElement(uri, localName, qName, atts); + } + + @Override + public void endElement(String uri, String localName, String qName) throws SAXException { + delegate.endElement(uri, localName, qName); + } + + @Override + public void characters(char[] ch, int start, int length) throws SAXException { + // Convert characters to uppercase + char[] upper = new String(ch, start, length).toUpperCase(Locale.ROOT).toCharArray(); + delegate.characters(upper, 0, upper.length); + } + + @Override + public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { + delegate.ignorableWhitespace(ch, start, 
length); + } + + @Override + public void processingInstruction(String target, String data) throws SAXException { + delegate.processingInstruction(target, data); + } + + @Override + public void skippedEntity(String name) throws SAXException { + delegate.skippedEntity(name); + } + + /** + * Returns the underlying delegate handler's string representation, + * which typically contains the extracted content. + */ + @Override + public String toString() { + return delegate.toString(); + } +} diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/UppercasingContentHandlerFactory.java b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/UppercasingContentHandlerFactory.java new file mode 100644 index 00000000000..db2ffdfec7d --- /dev/null +++ b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/UppercasingContentHandlerFactory.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.pipes.core; + +import org.xml.sax.ContentHandler; + +import org.apache.tika.config.TikaComponent; +import org.apache.tika.sax.ContentHandlerFactory; +import org.apache.tika.sax.ToTextContentHandler; + +/** + * A ContentHandlerFactory that creates UppercasingContentHandler instances. + * This factory wraps a ToTextContentHandler with an uppercasing decorator + * to convert all extracted text to uppercase. + *

+ * Used for testing custom ContentHandlerFactory configurations in tika-pipes. + */ +@TikaComponent(contextKey = ContentHandlerFactory.class) +public class UppercasingContentHandlerFactory implements ContentHandlerFactory { + + private static final long serialVersionUID = 1L; + + @Override + public ContentHandler createHandler() { + return new UppercasingContentHandler(new ToTextContentHandler()); + } +} diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json index f0283182078..5873c39a87b 100644 --- a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json +++ b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json @@ -1,4 +1,12 @@ { + "content-handler-factory": { + "basic-content-handler-factory": { + "type": "TEXT", + "writeLimit": -1, + "maxEmbeddedResources": -1, + "throwOnWriteLimitReached": true + } + }, "fetchers": { "fsf": { "file-system-fetcher": { @@ -18,27 +26,15 @@ }, "pipes-iterator": { "file-system-pipes-iterator": { - "fspi": { - "basePath": "FETCHER_BASE_PATH", - "countTotal": true, - "baseConfig": { - "fetcherId": "fsf", - "emitterId": "fse", - "handlerConfig": { - "type": "TEXT", - "parseMode": "RMETA", - "writeLimit": -1, - "maxEmbeddedResources": -1, - "throwOnWriteLimitReached": true - }, - "onParseException": "EMIT", - "maxWaitMs": 600000, - "queueSize": 10000 - } - } + "basePath": "FETCHER_BASE_PATH", + "countTotal": true, + "fetcherId": "fsf", + "emitterId": "fse" } }, "pipes": { + "parseMode": "RMETA", + "onParseException": "EMIT", "numClients": 4, "timeoutMillis": 5000, "emitIntermediateResults": "EMIT_INTERMEDIATE_RESULTS", diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-passback.json b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-passback.json index 
153a68796dc..529e878cb60 100644 --- a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-passback.json +++ b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-passback.json @@ -1,4 +1,12 @@ { + "content-handler-factory": { + "basic-content-handler-factory": { + "type": "TEXT", + "writeLimit": -1, + "maxEmbeddedResources": -1, + "throwOnWriteLimitReached": true + } + }, "fetchers": { "fsf": { "file-system-fetcher": { @@ -18,27 +26,15 @@ }, "pipes-iterator": { "file-system-pipes-iterator": { - "fspi": { - "basePath": "FETCHER_BASE_PATH", - "countTotal": true, - "baseConfig": { - "fetcherId": "fsf", - "emitterId": "fse", - "handlerConfig": { - "type": "TEXT", - "parseMode": "RMETA", - "writeLimit": -1, - "maxEmbeddedResources": -1, - "throwOnWriteLimitReached": true - }, - "onParseException": "EMIT", - "maxWaitMs": 600000, - "queueSize": 10000 - } - } + "basePath": "FETCHER_BASE_PATH", + "countTotal": true, + "fetcherId": "fsf", + "emitterId": "fse" } }, "pipes": { + "parseMode": "RMETA", + "onParseException": "EMIT", "numClients": 4, "timeoutMillis": 5000, "emitIntermediateResults": "EMIT_INTERMEDIATE_RESULTS", diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json index fd859202075..7cba584c069 100644 --- a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json +++ b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json @@ -1,4 +1,12 @@ { + "content-handler-factory": { + "basic-content-handler-factory": { + "type": "TEXT", + "writeLimit": -1, + "maxEmbeddedResources": -1, + "throwOnWriteLimitReached": true + } + }, "fetchers": { "fsf": { "file-system-fetcher": { @@ -18,30 +26,18 @@ }, "pipes-iterator": { "file-system-pipes-iterator": { - "fspi": { - "basePath": "FETCHER_BASE_PATH", - 
"countTotal": true, - "baseConfig": { - "fetcherId": "fsf", - "emitterId": "fse", - "handlerConfig": { - "type": "TEXT", - "parseMode": "RMETA", - "writeLimit": -1, - "maxEmbeddedResources": -1, - "throwOnWriteLimitReached": true - }, - "onParseException": "EMIT", - "maxWaitMs": 600000, - "queueSize": 10000 - } - } + "basePath": "FETCHER_BASE_PATH", + "countTotal": true, + "fetcherId": "fsf", + "emitterId": "fse" } }, "pipes": { + "parseMode": "RMETA", + "onParseException": "EMIT", "numClients": 4, "timeoutMillis": 5000, - "emitIntermediateResults": EMIT_INTERMEDIATE_RESULTS, + "emitIntermediateResults": "EMIT_INTERMEDIATE_RESULTS", "forkedJvmArgs": ["-Xmx512m"], "emitStrategy": { "type": "DYNAMIC", diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-uppercasing.json b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-uppercasing.json new file mode 100644 index 00000000000..e7d8a21c028 --- /dev/null +++ b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-uppercasing.json @@ -0,0 +1,52 @@ +{ + "content-handler-factory": { + "uppercasing-content-handler-factory": {} + }, + "fetchers": { + "fsf": { + "file-system-fetcher": { + "basePath": "FETCHER_BASE_PATH", + "extractFileSystemMetadata": false + } + } + }, + "emitters": { + "fse": { + "file-system-emitter": { + "basePath": "EMITTER_BASE_PATH", + "fileExtension": "json", + "onExists": "EXCEPTION" + } + } + }, + "pipes-iterator": { + "file-system-pipes-iterator": { + "basePath": "FETCHER_BASE_PATH", + "countTotal": true, + "fetcherId": "fsf", + "emitterId": "fse" + } + }, + "pipes": { + "parseMode": "RMETA", + "onParseException": "EMIT", + "numClients": 4, + "timeoutMillis": 5000, + "emitIntermediateResults": "EMIT_INTERMEDIATE_RESULTS", + "forkedJvmArgs": ["-Xmx512m"], + "emitStrategy": { + "type": "DYNAMIC", + "thresholdBytes": 1000000 + } + }, + "auto-detect-parser": { + "spoolToDisk": 1000000, + "outputThreshold": 
1000000, + "skipContainerDocumentDigest": false, + "digesterFactory": { + "mock-digester-factory": {} + }, + "throwOnZeroBytes": false + }, + "plugin-roots": "PLUGINS_PATHS" +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/java/org/apache/tika/pipes/pipesiterator/json/TestJsonPipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/java/org/apache/tika/pipes/pipesiterator/json/TestJsonPipesIterator.java index 4211387888f..53aca506ffc 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/java/org/apache/tika/pipes/pipesiterator/json/TestJsonPipesIterator.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/java/org/apache/tika/pipes/pipesiterator/json/TestJsonPipesIterator.java @@ -71,34 +71,4 @@ private JsonPipesIterator createIterator(Path jsonPath) throws Exception { OBJECT_MAPPER.writeValueAsString(jsonConfig)); return JsonPipesIterator.build(extensionConfig); } - - - /* - //use this to generate test files - public static void main(String[] args) throws Exception { - Path p = Paths.get("/home/tallison/Intellij/tika-main/tika-pipes/tika-pipes-iterators" + - "/tika-pipes-iterator-json/src/test/resources/test-documents/test-with-embedded" + - "-bytes.json"); - try (BufferedWriter writer = Files.newBufferedWriter(p, StandardCharsets.UTF_8)) { - HandlerConfig handlerConfig = - new HandlerConfig(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, - HandlerConfig.PARSE_MODE.RMETA, -1, -1, - false); - EmbeddedDocumentBytesConfig config = new EmbeddedDocumentBytesConfig(true); - for (int i = 0; i < 100; i++) { - String id = "myid-"+i; - FetchEmitTuple t = new FetchEmitTuple( - id, - new FetchKey("fs", i + ".xml"), - new EmitKey("fs", i + ".xml.json"), - new Metadata(), - handlerConfig, - FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT, - config); - String line = JsonFetchEmitTuple.toJson(t); - writer.write(line); - writer.newLine(); - } - } - }*/ } diff --git 
a/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/resources/test-documents/test-with-embedded-bytes.json b/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/resources/test-documents/test-with-embedded-bytes.json index 74883069062..daef89edaa6 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/resources/test-documents/test-with-embedded-bytes.json +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/resources/test-documents/test-with-embedded-bytes.json @@ -4,12 +4,6 @@ "fetchKey": "0.xml", "emitter": "fs", "emitKey": "0.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -25,12 +19,6 @@ "fetchKey": "1.xml", "emitter": "fs", "emitKey": "1.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -46,12 +34,6 @@ "fetchKey": "2.xml", "emitter": "fs", "emitKey": "2.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -67,12 +49,6 @@ "fetchKey": "3.xml", "emitter": "fs", "emitKey": "3.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -88,12 +64,6 @@ "fetchKey": "4.xml", "emitter": "fs", "emitKey": "4.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -109,12 +79,6 
@@ "fetchKey": "5.xml", "emitter": "fs", "emitKey": "5.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -130,12 +94,6 @@ "fetchKey": "6.xml", "emitter": "fs", "emitKey": "6.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -151,12 +109,6 @@ "fetchKey": "7.xml", "emitter": "fs", "emitKey": "7.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -172,12 +124,6 @@ "fetchKey": "8.xml", "emitter": "fs", "emitKey": "8.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -193,12 +139,6 @@ "fetchKey": "9.xml", "emitter": "fs", "emitKey": "9.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -214,12 +154,6 @@ "fetchKey": "10.xml", "emitter": "fs", "emitKey": "10.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -235,12 +169,6 @@ "fetchKey": "11.xml", "emitter": "fs", "emitKey": "11.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, 
"onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -256,12 +184,6 @@ "fetchKey": "12.xml", "emitter": "fs", "emitKey": "12.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -277,12 +199,6 @@ "fetchKey": "13.xml", "emitter": "fs", "emitKey": "13.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -298,12 +214,6 @@ "fetchKey": "14.xml", "emitter": "fs", "emitKey": "14.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -319,12 +229,6 @@ "fetchKey": "15.xml", "emitter": "fs", "emitKey": "15.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -340,12 +244,6 @@ "fetchKey": "16.xml", "emitter": "fs", "emitKey": "16.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -361,12 +259,6 @@ "fetchKey": "17.xml", "emitter": "fs", "emitKey": "17.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -382,12 +274,6 @@ "fetchKey": "18.xml", "emitter": "fs", "emitKey": "18.xml.json", - 
"handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -403,12 +289,6 @@ "fetchKey": "19.xml", "emitter": "fs", "emitKey": "19.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -424,12 +304,6 @@ "fetchKey": "20.xml", "emitter": "fs", "emitKey": "20.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -445,12 +319,6 @@ "fetchKey": "21.xml", "emitter": "fs", "emitKey": "21.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -466,12 +334,6 @@ "fetchKey": "22.xml", "emitter": "fs", "emitKey": "22.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -487,12 +349,6 @@ "fetchKey": "23.xml", "emitter": "fs", "emitKey": "23.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -508,12 +364,6 @@ "fetchKey": "24.xml", "emitter": "fs", "emitKey": "24.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { 
"extractEmbeddedDocumentBytes": true, @@ -529,12 +379,6 @@ "fetchKey": "25.xml", "emitter": "fs", "emitKey": "25.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -550,12 +394,6 @@ "fetchKey": "26.xml", "emitter": "fs", "emitKey": "26.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -571,12 +409,6 @@ "fetchKey": "27.xml", "emitter": "fs", "emitKey": "27.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -592,12 +424,6 @@ "fetchKey": "28.xml", "emitter": "fs", "emitKey": "28.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -613,12 +439,6 @@ "fetchKey": "29.xml", "emitter": "fs", "emitKey": "29.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -634,12 +454,6 @@ "fetchKey": "30.xml", "emitter": "fs", "emitKey": "30.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -655,12 +469,6 @@ "fetchKey": "31.xml", "emitter": "fs", "emitKey": "31.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", 
- "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -676,12 +484,6 @@ "fetchKey": "32.xml", "emitter": "fs", "emitKey": "32.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -697,12 +499,6 @@ "fetchKey": "33.xml", "emitter": "fs", "emitKey": "33.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -718,12 +514,6 @@ "fetchKey": "34.xml", "emitter": "fs", "emitKey": "34.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -739,12 +529,6 @@ "fetchKey": "35.xml", "emitter": "fs", "emitKey": "35.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -760,12 +544,6 @@ "fetchKey": "36.xml", "emitter": "fs", "emitKey": "36.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -781,12 +559,6 @@ "fetchKey": "37.xml", "emitter": "fs", "emitKey": "37.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -802,12 +574,6 @@ "fetchKey": 
"38.xml", "emitter": "fs", "emitKey": "38.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -823,12 +589,6 @@ "fetchKey": "39.xml", "emitter": "fs", "emitKey": "39.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -844,12 +604,6 @@ "fetchKey": "40.xml", "emitter": "fs", "emitKey": "40.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -865,12 +619,6 @@ "fetchKey": "41.xml", "emitter": "fs", "emitKey": "41.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -886,12 +634,6 @@ "fetchKey": "42.xml", "emitter": "fs", "emitKey": "42.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -907,12 +649,6 @@ "fetchKey": "43.xml", "emitter": "fs", "emitKey": "43.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -928,12 +664,6 @@ "fetchKey": "44.xml", "emitter": "fs", "emitKey": "44.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, 
"onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -949,12 +679,6 @@ "fetchKey": "45.xml", "emitter": "fs", "emitKey": "45.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -970,12 +694,6 @@ "fetchKey": "46.xml", "emitter": "fs", "emitKey": "46.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -991,12 +709,6 @@ "fetchKey": "47.xml", "emitter": "fs", "emitKey": "47.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1012,12 +724,6 @@ "fetchKey": "48.xml", "emitter": "fs", "emitKey": "48.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1033,12 +739,6 @@ "fetchKey": "49.xml", "emitter": "fs", "emitKey": "49.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1054,12 +754,6 @@ "fetchKey": "50.xml", "emitter": "fs", "emitKey": "50.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1075,12 +769,6 @@ "fetchKey": "51.xml", "emitter": "fs", "emitKey": 
"51.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1096,12 +784,6 @@ "fetchKey": "52.xml", "emitter": "fs", "emitKey": "52.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1117,12 +799,6 @@ "fetchKey": "53.xml", "emitter": "fs", "emitKey": "53.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1138,12 +814,6 @@ "fetchKey": "54.xml", "emitter": "fs", "emitKey": "54.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1159,12 +829,6 @@ "fetchKey": "55.xml", "emitter": "fs", "emitKey": "55.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1180,12 +844,6 @@ "fetchKey": "56.xml", "emitter": "fs", "emitKey": "56.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1201,12 +859,6 @@ "fetchKey": "57.xml", "emitter": "fs", "emitKey": "57.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", 
"embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1222,12 +874,6 @@ "fetchKey": "58.xml", "emitter": "fs", "emitKey": "58.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1243,12 +889,6 @@ "fetchKey": "59.xml", "emitter": "fs", "emitKey": "59.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1264,12 +904,6 @@ "fetchKey": "60.xml", "emitter": "fs", "emitKey": "60.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1285,12 +919,6 @@ "fetchKey": "61.xml", "emitter": "fs", "emitKey": "61.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1306,12 +934,6 @@ "fetchKey": "62.xml", "emitter": "fs", "emitKey": "62.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1327,12 +949,6 @@ "fetchKey": "63.xml", "emitter": "fs", "emitKey": "63.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1348,12 +964,6 @@ "fetchKey": "64.xml", "emitter": "fs", "emitKey": "64.xml.json", - "handlerConfig": { - 
"type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1369,12 +979,6 @@ "fetchKey": "65.xml", "emitter": "fs", "emitKey": "65.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1390,12 +994,6 @@ "fetchKey": "66.xml", "emitter": "fs", "emitKey": "66.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1411,12 +1009,6 @@ "fetchKey": "67.xml", "emitter": "fs", "emitKey": "67.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1432,12 +1024,6 @@ "fetchKey": "68.xml", "emitter": "fs", "emitKey": "68.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1453,12 +1039,6 @@ "fetchKey": "69.xml", "emitter": "fs", "emitKey": "69.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1474,12 +1054,6 @@ "fetchKey": "70.xml", "emitter": "fs", "emitKey": "70.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { 
"extractEmbeddedDocumentBytes": true, @@ -1495,12 +1069,6 @@ "fetchKey": "71.xml", "emitter": "fs", "emitKey": "71.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1516,12 +1084,6 @@ "fetchKey": "72.xml", "emitter": "fs", "emitKey": "72.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1537,12 +1099,6 @@ "fetchKey": "73.xml", "emitter": "fs", "emitKey": "73.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1558,12 +1114,6 @@ "fetchKey": "74.xml", "emitter": "fs", "emitKey": "74.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1579,12 +1129,6 @@ "fetchKey": "75.xml", "emitter": "fs", "emitKey": "75.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1600,12 +1144,6 @@ "fetchKey": "76.xml", "emitter": "fs", "emitKey": "76.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1621,12 +1159,6 @@ "fetchKey": "77.xml", "emitter": "fs", "emitKey": "77.xml.json", - "handlerConfig": { - "type": "text", - 
"parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1642,12 +1174,6 @@ "fetchKey": "78.xml", "emitter": "fs", "emitKey": "78.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1663,12 +1189,6 @@ "fetchKey": "79.xml", "emitter": "fs", "emitKey": "79.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1684,12 +1204,6 @@ "fetchKey": "80.xml", "emitter": "fs", "emitKey": "80.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1705,12 +1219,6 @@ "fetchKey": "81.xml", "emitter": "fs", "emitKey": "81.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1726,12 +1234,6 @@ "fetchKey": "82.xml", "emitter": "fs", "emitKey": "82.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1747,12 +1249,6 @@ "fetchKey": "83.xml", "emitter": "fs", "emitKey": "83.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, 
@@ -1768,12 +1264,6 @@ "fetchKey": "84.xml", "emitter": "fs", "emitKey": "84.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1789,12 +1279,6 @@ "fetchKey": "85.xml", "emitter": "fs", "emitKey": "85.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1810,12 +1294,6 @@ "fetchKey": "86.xml", "emitter": "fs", "emitKey": "86.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1831,12 +1309,6 @@ "fetchKey": "87.xml", "emitter": "fs", "emitKey": "87.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1852,12 +1324,6 @@ "fetchKey": "88.xml", "emitter": "fs", "emitKey": "88.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1873,12 +1339,6 @@ "fetchKey": "89.xml", "emitter": "fs", "emitKey": "89.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1894,12 +1354,6 @@ "fetchKey": "90.xml", "emitter": "fs", "emitKey": "90.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - 
"maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1915,12 +1369,6 @@ "fetchKey": "91.xml", "emitter": "fs", "emitKey": "91.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1936,12 +1384,6 @@ "fetchKey": "92.xml", "emitter": "fs", "emitKey": "92.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1957,12 +1399,6 @@ "fetchKey": "93.xml", "emitter": "fs", "emitKey": "93.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1978,12 +1414,6 @@ "fetchKey": "94.xml", "emitter": "fs", "emitKey": "94.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1999,12 +1429,6 @@ "fetchKey": "95.xml", "emitter": "fs", "emitKey": "95.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -2020,12 +1444,6 @@ "fetchKey": "96.xml", "emitter": "fs", "emitKey": "96.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -2041,12 +1459,6 @@ "fetchKey": "97.xml", 
"emitter": "fs", "emitKey": "97.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -2062,12 +1474,6 @@ "fetchKey": "98.xml", "emitter": "fs", "emitKey": "98.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -2083,12 +1489,6 @@ "fetchKey": "99.xml", "emitter": "fs", "emitKey": "99.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/resources/test-documents/test.json b/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/resources/test-documents/test.json index 721410fd3a8..e5199c6cbac 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/resources/test-documents/test.json +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/resources/test-documents/test.json @@ -4,12 +4,6 @@ "fetchKey": "0.xml", "emitter": "fs", "emitKey": "0.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -18,12 +12,6 @@ "fetchKey": "1.xml", "emitter": "fs", "emitKey": "1.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -32,12 +20,6 @@ "fetchKey": "2.xml", "emitter": "fs", "emitKey": "2.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -46,12 +28,6 @@ 
"fetchKey": "3.xml", "emitter": "fs", "emitKey": "3.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -60,12 +36,6 @@ "fetchKey": "4.xml", "emitter": "fs", "emitKey": "4.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -74,12 +44,6 @@ "fetchKey": "5.xml", "emitter": "fs", "emitKey": "5.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -88,12 +52,6 @@ "fetchKey": "6.xml", "emitter": "fs", "emitKey": "6.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -102,12 +60,6 @@ "fetchKey": "7.xml", "emitter": "fs", "emitKey": "7.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -116,12 +68,6 @@ "fetchKey": "8.xml", "emitter": "fs", "emitKey": "8.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -130,12 +76,6 @@ "fetchKey": "9.xml", "emitter": "fs", "emitKey": "9.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -144,12 +84,6 @@ "fetchKey": "10.xml", "emitter": "fs", "emitKey": "10.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -158,12 +92,6 @@ "fetchKey": "11.xml", "emitter": "fs", "emitKey": "11.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - 
"maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -172,12 +100,6 @@ "fetchKey": "12.xml", "emitter": "fs", "emitKey": "12.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -186,12 +108,6 @@ "fetchKey": "13.xml", "emitter": "fs", "emitKey": "13.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -200,12 +116,6 @@ "fetchKey": "14.xml", "emitter": "fs", "emitKey": "14.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -214,12 +124,6 @@ "fetchKey": "15.xml", "emitter": "fs", "emitKey": "15.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -228,12 +132,6 @@ "fetchKey": "16.xml", "emitter": "fs", "emitKey": "16.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -242,12 +140,6 @@ "fetchKey": "17.xml", "emitter": "fs", "emitKey": "17.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -256,12 +148,6 @@ "fetchKey": "18.xml", "emitter": "fs", "emitKey": "18.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -270,12 +156,6 @@ "fetchKey": "19.xml", "emitter": "fs", "emitKey": "19.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -284,12 +164,6 @@ "fetchKey": "20.xml", "emitter": "fs", 
"emitKey": "20.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -298,12 +172,6 @@ "fetchKey": "21.xml", "emitter": "fs", "emitKey": "21.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -312,12 +180,6 @@ "fetchKey": "22.xml", "emitter": "fs", "emitKey": "22.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -326,12 +188,6 @@ "fetchKey": "23.xml", "emitter": "fs", "emitKey": "23.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -340,12 +196,6 @@ "fetchKey": "24.xml", "emitter": "fs", "emitKey": "24.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -354,12 +204,6 @@ "fetchKey": "25.xml", "emitter": "fs", "emitKey": "25.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -368,12 +212,6 @@ "fetchKey": "26.xml", "emitter": "fs", "emitKey": "26.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -382,12 +220,6 @@ "fetchKey": "27.xml", "emitter": "fs", "emitKey": "27.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -396,12 +228,6 @@ "fetchKey": "28.xml", "emitter": "fs", "emitKey": "28.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - 
"maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -410,12 +236,6 @@ "fetchKey": "29.xml", "emitter": "fs", "emitKey": "29.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -424,12 +244,6 @@ "fetchKey": "30.xml", "emitter": "fs", "emitKey": "30.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -438,12 +252,6 @@ "fetchKey": "31.xml", "emitter": "fs", "emitKey": "31.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -452,12 +260,6 @@ "fetchKey": "32.xml", "emitter": "fs", "emitKey": "32.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -466,12 +268,6 @@ "fetchKey": "33.xml", "emitter": "fs", "emitKey": "33.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -480,12 +276,6 @@ "fetchKey": "34.xml", "emitter": "fs", "emitKey": "34.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -494,12 +284,6 @@ "fetchKey": "35.xml", "emitter": "fs", "emitKey": "35.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -508,12 +292,6 @@ "fetchKey": "36.xml", "emitter": "fs", "emitKey": "36.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -522,12 +300,6 @@ "fetchKey": "37.xml", "emitter": "fs", 
"emitKey": "37.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -536,12 +308,6 @@ "fetchKey": "38.xml", "emitter": "fs", "emitKey": "38.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -550,12 +316,6 @@ "fetchKey": "39.xml", "emitter": "fs", "emitKey": "39.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -564,12 +324,6 @@ "fetchKey": "40.xml", "emitter": "fs", "emitKey": "40.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -578,12 +332,6 @@ "fetchKey": "41.xml", "emitter": "fs", "emitKey": "41.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -592,12 +340,6 @@ "fetchKey": "42.xml", "emitter": "fs", "emitKey": "42.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -606,12 +348,6 @@ "fetchKey": "43.xml", "emitter": "fs", "emitKey": "43.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -620,12 +356,6 @@ "fetchKey": "44.xml", "emitter": "fs", "emitKey": "44.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -634,12 +364,6 @@ "fetchKey": "45.xml", "emitter": "fs", "emitKey": "45.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - 
"maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -648,12 +372,6 @@ "fetchKey": "46.xml", "emitter": "fs", "emitKey": "46.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -662,12 +380,6 @@ "fetchKey": "47.xml", "emitter": "fs", "emitKey": "47.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -676,12 +388,6 @@ "fetchKey": "48.xml", "emitter": "fs", "emitKey": "48.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -690,12 +396,6 @@ "fetchKey": "49.xml", "emitter": "fs", "emitKey": "49.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -704,12 +404,6 @@ "fetchKey": "50.xml", "emitter": "fs", "emitKey": "50.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -718,12 +412,6 @@ "fetchKey": "51.xml", "emitter": "fs", "emitKey": "51.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -732,12 +420,6 @@ "fetchKey": "52.xml", "emitter": "fs", "emitKey": "52.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -746,12 +428,6 @@ "fetchKey": "53.xml", "emitter": "fs", "emitKey": "53.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -760,12 +436,6 @@ "fetchKey": "54.xml", "emitter": "fs", 
"emitKey": "54.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -774,12 +444,6 @@ "fetchKey": "55.xml", "emitter": "fs", "emitKey": "55.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -788,12 +452,6 @@ "fetchKey": "56.xml", "emitter": "fs", "emitKey": "56.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -802,12 +460,6 @@ "fetchKey": "57.xml", "emitter": "fs", "emitKey": "57.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -816,12 +468,6 @@ "fetchKey": "58.xml", "emitter": "fs", "emitKey": "58.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -830,12 +476,6 @@ "fetchKey": "59.xml", "emitter": "fs", "emitKey": "59.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -844,12 +484,6 @@ "fetchKey": "60.xml", "emitter": "fs", "emitKey": "60.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -858,12 +492,6 @@ "fetchKey": "61.xml", "emitter": "fs", "emitKey": "61.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -872,12 +500,6 @@ "fetchKey": "62.xml", "emitter": "fs", "emitKey": "62.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - 
"maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -886,12 +508,6 @@ "fetchKey": "63.xml", "emitter": "fs", "emitKey": "63.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -900,12 +516,6 @@ "fetchKey": "64.xml", "emitter": "fs", "emitKey": "64.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -914,12 +524,6 @@ "fetchKey": "65.xml", "emitter": "fs", "emitKey": "65.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -928,12 +532,6 @@ "fetchKey": "66.xml", "emitter": "fs", "emitKey": "66.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -942,12 +540,6 @@ "fetchKey": "67.xml", "emitter": "fs", "emitKey": "67.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -956,12 +548,6 @@ "fetchKey": "68.xml", "emitter": "fs", "emitKey": "68.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -970,12 +556,6 @@ "fetchKey": "69.xml", "emitter": "fs", "emitKey": "69.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -984,12 +564,6 @@ "fetchKey": "70.xml", "emitter": "fs", "emitKey": "70.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -998,12 +572,6 @@ "fetchKey": "71.xml", "emitter": "fs", 
"emitKey": "71.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1012,12 +580,6 @@ "fetchKey": "72.xml", "emitter": "fs", "emitKey": "72.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1026,12 +588,6 @@ "fetchKey": "73.xml", "emitter": "fs", "emitKey": "73.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1040,12 +596,6 @@ "fetchKey": "74.xml", "emitter": "fs", "emitKey": "74.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1054,12 +604,6 @@ "fetchKey": "75.xml", "emitter": "fs", "emitKey": "75.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1068,12 +612,6 @@ "fetchKey": "76.xml", "emitter": "fs", "emitKey": "76.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1082,12 +620,6 @@ "fetchKey": "77.xml", "emitter": "fs", "emitKey": "77.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1096,12 +628,6 @@ "fetchKey": "78.xml", "emitter": "fs", "emitKey": "78.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1110,12 +636,6 @@ "fetchKey": "79.xml", "emitter": "fs", "emitKey": "79.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - 
"maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1124,12 +644,6 @@ "fetchKey": "80.xml", "emitter": "fs", "emitKey": "80.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1138,12 +652,6 @@ "fetchKey": "81.xml", "emitter": "fs", "emitKey": "81.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1152,12 +660,6 @@ "fetchKey": "82.xml", "emitter": "fs", "emitKey": "82.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1166,12 +668,6 @@ "fetchKey": "83.xml", "emitter": "fs", "emitKey": "83.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1180,12 +676,6 @@ "fetchKey": "84.xml", "emitter": "fs", "emitKey": "84.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1194,12 +684,6 @@ "fetchKey": "85.xml", "emitter": "fs", "emitKey": "85.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1208,12 +692,6 @@ "fetchKey": "86.xml", "emitter": "fs", "emitKey": "86.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1222,12 +700,6 @@ "fetchKey": "87.xml", "emitter": "fs", "emitKey": "87.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1236,12 +708,6 @@ "fetchKey": "88.xml", "emitter": 
"fs", "emitKey": "88.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1250,12 +716,6 @@ "fetchKey": "89.xml", "emitter": "fs", "emitKey": "89.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1264,12 +724,6 @@ "fetchKey": "90.xml", "emitter": "fs", "emitKey": "90.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1278,12 +732,6 @@ "fetchKey": "91.xml", "emitter": "fs", "emitKey": "91.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1292,12 +740,6 @@ "fetchKey": "92.xml", "emitter": "fs", "emitKey": "92.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1306,12 +748,6 @@ "fetchKey": "93.xml", "emitter": "fs", "emitKey": "93.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1320,12 +756,6 @@ "fetchKey": "94.xml", "emitter": "fs", "emitKey": "94.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1334,12 +764,6 @@ "fetchKey": "95.xml", "emitter": "fs", "emitKey": "95.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1348,12 +772,6 @@ "fetchKey": "96.xml", "emitter": "fs", "emitKey": "96.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - 
"maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1362,12 +780,6 @@ "fetchKey": "97.xml", "emitter": "fs", "emitKey": "97.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1376,12 +788,6 @@ "fetchKey": "98.xml", "emitter": "fs", "emitKey": "98.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1390,11 +796,5 @@ "fetchKey": "99.xml", "emitter": "fs", "emitKey": "99.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java index 2eeb8bc7a20..f40abba13d5 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java @@ -111,6 +111,7 @@ public class TikaJsonConfig { "detectors", "encoding-detectors", "metadata-filters", + "content-handler-factory", "renderers", "translator", "auto-detect-parser", diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java index 7371b8f844e..278ee388665 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java @@ -39,6 +39,8 @@ import org.apache.tika.parser.Parser; import org.apache.tika.renderer.CompositeRenderer; import org.apache.tika.renderer.Renderer; +import org.apache.tika.sax.BasicContentHandlerFactory; +import org.apache.tika.sax.ContentHandlerFactory; /** * Main entry 
point for loading Tika components from JSON configuration. @@ -84,6 +86,7 @@ public class TikaLoader { private Detector detectors; private EncodingDetector encodingDetectors; private MetadataFilter metadataFilter; + private ContentHandlerFactory contentHandlerFactory; private Renderer renderers; private Translator translator; private ConfigLoader configLoader; @@ -265,6 +268,47 @@ public synchronized MetadataFilter loadMetadataFilters() throws TikaConfigExcept return metadataFilter; } + /** + * Loads and returns the content handler factory. + * If "content-handler-factory" section exists in config, uses that factory. + * If section missing, returns a default BasicContentHandlerFactory with TEXT handler. + * Results are cached - subsequent calls return the same instance. + * + *

Example JSON: + *

+     * {
+     *   "content-handler-factory": {
+     *     "basic-content-handler-factory": {
+     *       "type": "HTML",
+     *       "writeLimit": 100000
+     *     }
+     *   }
+     * }
+     * 
+ * + * @return the content handler factory + * @throws TikaConfigException if loading fails + */ + public synchronized ContentHandlerFactory loadContentHandlerFactory() throws TikaConfigException { + if (contentHandlerFactory == null) { + // Check if content-handler-factory section exists in config + if (config.hasComponentSection("content-handler-factory")) { + try { + contentHandlerFactory = config.deserialize("content-handler-factory", + ContentHandlerFactory.class); + } catch (IOException e) { + throw new TikaConfigException("Failed to load content-handler-factory", e); + } + } + // Default to BasicContentHandlerFactory with TEXT handler if not configured + if (contentHandlerFactory == null) { + contentHandlerFactory = new BasicContentHandlerFactory( + BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1); + } + } + return contentHandlerFactory; + } + /** * Loads and returns all renderers. * If "renderers" section exists in config, uses only those listed (no SPI fallback). diff --git a/tika-serialization/src/test/java/org/apache/tika/config/loader/ConfigLoaderTest.java b/tika-serialization/src/test/java/org/apache/tika/config/loader/ConfigLoaderTest.java index 80063b151d5..1db87866e7f 100644 --- a/tika-serialization/src/test/java/org/apache/tika/config/loader/ConfigLoaderTest.java +++ b/tika-serialization/src/test/java/org/apache/tika/config/loader/ConfigLoaderTest.java @@ -51,9 +51,9 @@ public void setUp() throws Exception { // ==================== Test POJOs ==================== /** - * Simple config POJO with properties. + * Simple config POJO with properties for testing config loading. 
*/ - public static class HandlerConfig { + public static class RetryConfig { private int timeout; private int retries; private boolean enabled; @@ -185,7 +185,7 @@ public abstract static class AbstractHandler implements TestHandler { @Test public void testLoadByExplicitKey() throws Exception { - HandlerConfig config = configLoader.load("handler-config", HandlerConfig.class); + RetryConfig config = configLoader.load("retry-config", RetryConfig.class); assertNotNull(config); assertEquals(5000, config.getTimeout()); @@ -195,7 +195,7 @@ public void testLoadByExplicitKey() throws Exception { @Test public void testLoadByClassNameKebabCase() throws Exception { - HandlerConfig config = configLoader.load(HandlerConfig.class); + RetryConfig config = configLoader.load(RetryConfig.class); assertNotNull(config); assertEquals(5000, config.getTimeout()); @@ -224,20 +224,20 @@ public void testLoadByClassNameMyFeatureSettings() throws Exception { @Test public void testLoadWithDefaultValue() throws Exception { - HandlerConfig config = configLoader.load("handler-config", HandlerConfig.class); + RetryConfig config = configLoader.load("retry-config", RetryConfig.class); assertNotNull(config); // Non-existent key with default - HandlerConfig defaultConfig = new HandlerConfig(); + RetryConfig defaultConfig = new RetryConfig(); defaultConfig.setTimeout(9999); - HandlerConfig result = configLoader.load("non-existent", HandlerConfig.class, defaultConfig); + RetryConfig result = configLoader.load("non-existent", RetryConfig.class, defaultConfig); assertEquals(9999, result.getTimeout()); } @Test public void testLoadMissingKeyReturnsNull() throws Exception { - HandlerConfig config = configLoader.load("non-existent-key", HandlerConfig.class); + RetryConfig config = configLoader.load("non-existent-key", RetryConfig.class); assertNull(config); } @@ -312,7 +312,7 @@ public void testLoadProhibitedKeyMetadataFilters() throws Exception { @Test public void testHasKey() throws Exception { - 
assertTrue(configLoader.hasKey("handler-config")); + assertTrue(configLoader.hasKey("retry-config")); assertTrue(configLoader.hasKey("simple-handler")); assertFalse(configLoader.hasKey("non-existent")); } @@ -350,10 +350,10 @@ public void testLoadWithUnexpectedFieldFails() throws Exception { TikaLoader loader = TikaLoader.load(configPath); TikaConfigException ex = assertThrows(TikaConfigException.class, () -> - loader.configs().load("handler-config", HandlerConfig.class)); + loader.configs().load("retry-config", RetryConfig.class)); // Should contain information about the unrecognized field - assertTrue(ex.getMessage().contains("handler-config") || + assertTrue(ex.getMessage().contains("retry-config") || ex.getCause().getMessage().contains("Unrecognized") || ex.getCause().getMessage().contains("unexpectedField"), "Exception should mention the unrecognized field"); @@ -370,7 +370,7 @@ public void testKebabCaseConversion() throws Exception { @Test public void testLoadByClassWithDefault() throws Exception { - HandlerConfig config = configLoader.load(HandlerConfig.class); + RetryConfig config = configLoader.load(RetryConfig.class); assertNotNull(config); // Non-existent class @@ -394,14 +394,14 @@ public void testLoadWithDefaultsPartialConfig() throws Exception { TikaLoader loader = TikaLoader.load(configPath); // Set up defaults - HandlerConfig defaults = new HandlerConfig(); + RetryConfig defaults = new RetryConfig(); defaults.setTimeout(30000); defaults.setRetries(2); defaults.setEnabled(false); // JSON only has: { "enabled": true } - HandlerConfig config = loader.configs().loadWithDefaults("handler-config", - HandlerConfig.class, + RetryConfig config = loader.configs().loadWithDefaults("retry-config", + RetryConfig.class, defaults); assertNotNull(config); @@ -417,14 +417,14 @@ public void testLoadWithDefaultsFullOverride() throws Exception { getClass().getResource("/configs/test-partial-config.json").toURI()); TikaLoader loader = TikaLoader.load(configPath); - 
HandlerConfig defaults = new HandlerConfig(); + RetryConfig defaults = new RetryConfig(); defaults.setTimeout(30000); defaults.setRetries(2); defaults.setEnabled(false); // JSON has: { "timeout": 10000, "retries": 5, "enabled": false } - HandlerConfig config = loader.configs().loadWithDefaults("handler-config-full", - HandlerConfig.class, + RetryConfig config = loader.configs().loadWithDefaults("retry-config-full", + RetryConfig.class, defaults); assertNotNull(config); @@ -436,13 +436,13 @@ public void testLoadWithDefaultsFullOverride() throws Exception { @Test public void testLoadWithDefaultsMissingKey() throws Exception { // When key doesn't exist, should return original defaults unchanged - HandlerConfig defaults = new HandlerConfig(); + RetryConfig defaults = new RetryConfig(); defaults.setTimeout(30000); defaults.setRetries(2); defaults.setEnabled(false); - HandlerConfig config = configLoader.loadWithDefaults("non-existent-key", - HandlerConfig.class, + RetryConfig config = configLoader.loadWithDefaults("non-existent-key", + RetryConfig.class, defaults); assertNotNull(config); @@ -458,13 +458,13 @@ public void testLoadWithDefaultsByClass() throws Exception { getClass().getResource("/configs/test-partial-config.json").toURI()); TikaLoader loader = TikaLoader.load(configPath); - HandlerConfig defaults = new HandlerConfig(); + RetryConfig defaults = new RetryConfig(); defaults.setTimeout(30000); defaults.setRetries(2); defaults.setEnabled(false); - // Uses kebab-case: HandlerConfig -> "handler-config" - HandlerConfig config = loader.configs().loadWithDefaults(HandlerConfig.class, defaults); + // Uses kebab-case: RetryConfig -> "retry-config" + RetryConfig config = loader.configs().loadWithDefaults(RetryConfig.class, defaults); assertNotNull(config); assertEquals(30000, config.getTimeout()); @@ -479,20 +479,20 @@ public void testLoadVsLoadWithDefaults() throws Exception { getClass().getResource("/configs/test-partial-config.json").toURI()); TikaLoader loader = 
TikaLoader.load(configPath); - HandlerConfig defaults = new HandlerConfig(); + RetryConfig defaults = new RetryConfig(); defaults.setTimeout(30000); defaults.setRetries(2); defaults.setEnabled(false); // Using load() - creates new object, loses defaults - HandlerConfig config1 = loader.configs().load("handler-config", HandlerConfig.class); + RetryConfig config1 = loader.configs().load("retry-config", RetryConfig.class); assertEquals(0, config1.getTimeout()); // ❌ Lost default! assertEquals(0, config1.getRetries()); // ❌ Lost default! assertTrue(config1.isEnabled()); // ✅ From JSON // Using loadWithDefaults() - merges into defaults - HandlerConfig config2 = loader.configs().loadWithDefaults("handler-config", - HandlerConfig.class, + RetryConfig config2 = loader.configs().loadWithDefaults("retry-config", + RetryConfig.class, defaults); assertEquals(30000, config2.getTimeout()); // ✅ Kept default! assertEquals(2, config2.getRetries()); // ✅ Kept default! @@ -508,14 +508,14 @@ public void testLoadWithDefaultsDoesNotMutateOriginal() throws Exception { getClass().getResource("/configs/test-partial-config.json").toURI()); TikaLoader loader = TikaLoader.load(configPath); - HandlerConfig defaults = new HandlerConfig(); + RetryConfig defaults = new RetryConfig(); defaults.setTimeout(30000); defaults.setRetries(2); defaults.setEnabled(false); // Load config with partial override (JSON only has "enabled": true) - HandlerConfig result = loader.configs().loadWithDefaults("handler-config", - HandlerConfig.class, + RetryConfig result = loader.configs().loadWithDefaults("retry-config", + RetryConfig.class, defaults); // Verify result has merged values @@ -541,17 +541,17 @@ public void testLoadWithDefaultsReusableDefaults() throws Exception { getClass().getResource("/configs/test-partial-config.json").toURI()); TikaLoader loader = TikaLoader.load(configPath); - HandlerConfig defaults = new HandlerConfig(); + RetryConfig defaults = new RetryConfig(); defaults.setTimeout(30000); 
defaults.setRetries(2); defaults.setEnabled(false); // Load multiple times with same defaults - HandlerConfig config1 = loader.configs().loadWithDefaults("handler-config", - HandlerConfig.class, + RetryConfig config1 = loader.configs().loadWithDefaults("retry-config", + RetryConfig.class, defaults); - HandlerConfig config2 = loader.configs().loadWithDefaults("handler-config-full", - HandlerConfig.class, + RetryConfig config2 = loader.configs().loadWithDefaults("retry-config-full", + RetryConfig.class, defaults); // Verify results are different @@ -564,8 +564,8 @@ public void testLoadWithDefaultsReusableDefaults() throws Exception { assertFalse(defaults.isEnabled()); // Use defaults one more time - HandlerConfig config3 = loader.configs().loadWithDefaults("non-existent", - HandlerConfig.class, + RetryConfig config3 = loader.configs().loadWithDefaults("non-existent", + RetryConfig.class, defaults); assertEquals(defaults, config3); // Should return original when key missing } @@ -595,13 +595,13 @@ public void testLoadWithDefaultsComplexObjectImmutability() throws Exception { @Test public void testLoadWithDefaultsMissingKeyDoesNotClone() throws Exception { // When key is missing, should return the original object (no unnecessary cloning) - HandlerConfig defaults = new HandlerConfig(); + RetryConfig defaults = new RetryConfig(); defaults.setTimeout(30000); defaults.setRetries(2); defaults.setEnabled(false); - HandlerConfig result = configLoader.loadWithDefaults("non-existent-key", - HandlerConfig.class, + RetryConfig result = configLoader.loadWithDefaults("non-existent-key", + RetryConfig.class, defaults); // Should return the exact same object when key is missing @@ -619,17 +619,17 @@ public void testLoadWithDefaultsThreadSafety() throws Exception { TikaLoader loader = TikaLoader.load(configPath); // Shared defaults object - HandlerConfig sharedDefaults = new HandlerConfig(); + RetryConfig sharedDefaults = new RetryConfig(); sharedDefaults.setTimeout(30000); 
sharedDefaults.setRetries(2); sharedDefaults.setEnabled(false); // Simulate concurrent usage (not a real concurrency test, just demonstrates safety) - HandlerConfig result1 = loader.configs().loadWithDefaults("handler-config", - HandlerConfig.class, + RetryConfig result1 = loader.configs().loadWithDefaults("retry-config", + RetryConfig.class, sharedDefaults); - HandlerConfig result2 = loader.configs().loadWithDefaults("handler-config-full", - HandlerConfig.class, + RetryConfig result2 = loader.configs().loadWithDefaults("retry-config-full", + RetryConfig.class, sharedDefaults); // Both results should be valid diff --git a/tika-serialization/src/test/resources/configs/test-config-loader.json b/tika-serialization/src/test/resources/configs/test-config-loader.json index 5305f2a43a9..dd657c81e05 100644 --- a/tika-serialization/src/test/resources/configs/test-config-loader.json +++ b/tika-serialization/src/test/resources/configs/test-config-loader.json @@ -4,7 +4,7 @@ ], "other-configs": { - "handler-config": { + "retry-config": { "timeout": 5000, "retries": 3, "enabled": true diff --git a/tika-serialization/src/test/resources/configs/test-partial-config.json b/tika-serialization/src/test/resources/configs/test-partial-config.json index 866f2594b7c..5c5eab6992a 100644 --- a/tika-serialization/src/test/resources/configs/test-partial-config.json +++ b/tika-serialization/src/test/resources/configs/test-partial-config.json @@ -1,10 +1,10 @@ { "other-configs": { - "handler-config": { + "retry-config": { "enabled": true }, - "handler-config-full": { + "retry-config-full": { "timeout": 10000, "retries": 5, "enabled": false diff --git a/tika-serialization/src/test/resources/configs/test-unexpected-field.json b/tika-serialization/src/test/resources/configs/test-unexpected-field.json index d250d5fa1d3..5946b399ea9 100644 --- a/tika-serialization/src/test/resources/configs/test-unexpected-field.json +++ b/tika-serialization/src/test/resources/configs/test-unexpected-field.json @@ 
-1,6 +1,6 @@ { "other-configs": { - "handler-config": { + "retry-config": { "timeout": 5000, "retries": 3, "enabled": true, diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java index abf39696ce8..f046de0a54a 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java @@ -47,6 +47,7 @@ import org.apache.tika.parser.RecursiveParserWrapper; import org.apache.tika.pipes.api.ParseMode; import org.apache.tika.sax.BasicContentHandlerFactory; +import org.apache.tika.sax.ContentHandlerFactory; import org.apache.tika.sax.RecursiveParserWrapperHandler; import org.apache.tika.server.core.MetadataList; import org.apache.tika.server.core.TikaServerParseException; @@ -69,9 +70,15 @@ public static List parseMetadata(TikaInputStream tis, Metadata metadat fillMetadata(parser, metadata, httpHeaders); TikaResource.logRequest(LOG, "/rmeta", metadata); - BasicContentHandlerFactory.HANDLER_TYPE type = handlerConfig.type(); + // Check if a ContentHandlerFactory was provided in ParseContext + ContentHandlerFactory factory = context.get(ContentHandlerFactory.class); + if (factory == null) { + // Fall back to creating one from HTTP headers + BasicContentHandlerFactory.HANDLER_TYPE type = handlerConfig.type(); + factory = new BasicContentHandlerFactory(type, handlerConfig.writeLimit(), handlerConfig.throwOnWriteLimitReached(), context); + } RecursiveParserWrapperHandler handler = - new RecursiveParserWrapperHandler(new BasicContentHandlerFactory(type, handlerConfig.writeLimit(), handlerConfig.throwOnWriteLimitReached(), context), + new RecursiveParserWrapperHandler(factory, handlerConfig.maxEmbeddedResources(), TikaResource 
.getTikaLoader() .loadMetadataFilters()); @@ -176,9 +183,15 @@ private MetadataList parseMetadataWithContext(TikaInputStream tis, Metadata meta Parser parser = TikaResource.createParser(); RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser); - BasicContentHandlerFactory.HANDLER_TYPE type = handlerConfig.type(); + // Check if a ContentHandlerFactory was provided in ParseContext (e.g., from config JSON) + ContentHandlerFactory factory = context.get(ContentHandlerFactory.class); + if (factory == null) { + // Fall back to creating one from HTTP headers + BasicContentHandlerFactory.HANDLER_TYPE type = handlerConfig.type(); + factory = new BasicContentHandlerFactory(type, handlerConfig.writeLimit(), handlerConfig.throwOnWriteLimitReached(), context); + } RecursiveParserWrapperHandler handler = - new RecursiveParserWrapperHandler(new BasicContentHandlerFactory(type, handlerConfig.writeLimit(), handlerConfig.throwOnWriteLimitReached(), context), + new RecursiveParserWrapperHandler(factory, handlerConfig.maxEmbeddedResources(), TikaResource .getTikaLoader() .loadMetadataFilters()); @@ -232,7 +245,8 @@ public Response getMetadata(InputStream is, @Context HttpHeaders httpHeaders, @C } } - private MetadataList parseMetadataToMetadataList(TikaInputStream tis, Metadata metadata, MultivaluedMap httpHeaders, UriInfo info, ServerHandlerConfig handlerConfig) + private MetadataList parseMetadataToMetadataList(TikaInputStream tis, Metadata metadata, + MultivaluedMap httpHeaders, UriInfo info, ServerHandlerConfig handlerConfig) throws Exception { return new MetadataList(parseMetadata(tis, metadata, httpHeaders, info, handlerConfig)); } diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/ServerHandlerConfig.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/ServerHandlerConfig.java new file mode 100644 index 00000000000..b46802aecd1 --- /dev/null +++ 
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/ServerHandlerConfig.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.server.core.resource; + +import org.apache.tika.pipes.api.ParseMode; +import org.apache.tika.sax.BasicContentHandlerFactory; + +/** + * Server-internal configuration for request handlers. + * This holds configuration parsed from HTTP headers for a single request + * for the BasicContentHandlerFactory kinds of elements. 
+ */ +public record ServerHandlerConfig( + BasicContentHandlerFactory.HANDLER_TYPE type, + ParseMode parseMode, + int writeLimit, + int maxEmbeddedResources, + boolean throwOnWriteLimitReached +) { +} diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java index 6cab25a4f54..0df8e7b498b 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java @@ -74,6 +74,7 @@ import org.apache.tika.parser.Parser; import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.ContentHandlerFactory; import org.apache.tika.sax.ExpandedTitleContentHandler; import org.apache.tika.sax.RichTextContentHandler; import org.apache.tika.sax.boilerpipe.BoilerpipeContentHandler; @@ -532,9 +533,14 @@ private void parseToMetadata(TikaInputStream tis, Metadata metadata, Multivalued writeLimit = Integer.parseInt(httpHeaders.getFirst("writeLimit")); } - BasicContentHandlerFactory.HANDLER_TYPE type = BasicContentHandlerFactory.parseHandlerType(handlerTypeName, DEFAULT_HANDLER_TYPE); - BasicContentHandlerFactory fact = new BasicContentHandlerFactory(type, writeLimit, throwOnWriteLimitReached, context); - ContentHandler contentHandler = fact.getNewContentHandler(); + // Check if a ContentHandlerFactory was provided in ParseContext (e.g., from config JSON) + ContentHandlerFactory fact = context.get(ContentHandlerFactory.class); + if (fact == null) { + // Fall back to creating one from HTTP headers + BasicContentHandlerFactory.HANDLER_TYPE type = BasicContentHandlerFactory.parseHandlerType(handlerTypeName, DEFAULT_HANDLER_TYPE); + fact = new BasicContentHandlerFactory(type, writeLimit, throwOnWriteLimitReached, context); + } + 
ContentHandler contentHandler = fact.createHandler(); try { parse(parser, LOG, info.getPath(), tis, contentHandler, metadata, context); diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerAsyncIntegrationTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerAsyncIntegrationTest.java index f957ef1d82f..fb7aaf554bd 100644 --- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerAsyncIntegrationTest.java +++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerAsyncIntegrationTest.java @@ -50,10 +50,10 @@ import org.apache.tika.pipes.api.FetchEmitTuple; import org.apache.tika.pipes.api.ParseMode; import org.apache.tika.pipes.api.emitter.EmitKey; -import org.apache.tika.sax.BasicContentHandlerFactory; -import org.apache.tika.sax.ContentHandlerFactory; import org.apache.tika.pipes.api.fetcher.FetchKey; import org.apache.tika.pipes.core.serialization.JsonFetchEmitTupleList; +import org.apache.tika.sax.BasicContentHandlerFactory; +import org.apache.tika.sax.ContentHandlerFactory; @Disabled("useful for development...need to turn it into a real unit test") public class TikaServerAsyncIntegrationTest extends IntegrationTestBase { diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java index 6f12650bb48..4729b8fbd16 100644 --- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java +++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java @@ -61,9 +61,9 @@ import org.apache.tika.pipes.core.extractor.EmbeddedDocumentBytesConfig; import org.apache.tika.pipes.core.fetcher.FetcherManager; import org.apache.tika.pipes.core.serialization.JsonFetchEmitTuple; +import 
org.apache.tika.plugins.TikaPluginManager; import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.ContentHandlerFactory; -import org.apache.tika.plugins.TikaPluginManager; import org.apache.tika.serialization.JsonMetadataList; import org.apache.tika.server.core.CXFTestBase; import org.apache.tika.server.core.FetcherStreamFactory; @@ -182,9 +182,8 @@ public void testBasic() throws Exception { @Test public void testConcatenated() throws Exception { ParseContext parseContext = new ParseContext(); - // Use addConfig with JSON for handler-config - parseContext.addConfig("handler-config", - "{\"type\": \"TEXT\", \"parseMode\": \"CONCATENATE\", \"writeLimit\": -1, \"maxEmbeddedResources\": -1, \"throwOnWriteLimitReached\": true}"); + // Set ParseMode directly - it's now separate from ContentHandlerFactory + parseContext.set(ParseMode.class, ParseMode.CONCATENATE); FetchEmitTuple t = new FetchEmitTuple("myId", new FetchKey(FETCHER_ID, "test_recursive_embedded.docx"), new EmitKey(EMITTER_JSON_ID, ""), new Metadata(), parseContext, From c1ad7552a01e5234841478199399f3d4ea7c6161 Mon Sep 17 00:00:00 2001 From: tallison Date: Wed, 17 Dec 2025 17:19:39 -0500 Subject: [PATCH 3/5] TIKA-4582 -- refactor handler factory --- .../main/java/org/apache/tika/config/loader/TikaLoader.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java index 9b2d3f5c4d1..e584b6b2c2f 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java @@ -41,10 +41,10 @@ import org.apache.tika.parser.Parser; import org.apache.tika.renderer.CompositeRenderer; import org.apache.tika.renderer.Renderer; -import org.apache.tika.serialization.JsonMetadata; -import 
org.apache.tika.serialization.JsonMetadataList; import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.ContentHandlerFactory; +import org.apache.tika.serialization.JsonMetadata; +import org.apache.tika.serialization.JsonMetadataList; /** * Main entry point for loading Tika components from JSON configuration. From 9194294ea8355380a089da810f865773a213437d Mon Sep 17 00:00:00 2001 From: tallison Date: Fri, 19 Dec 2025 20:06:52 -0500 Subject: [PATCH 4/5] TIKA-4582 -- refactor handler factor - WIP --- .../annotation/TikaComponentProcessor.java | 2 + .../tika-pipes-integration-tests/pom.xml | 8 ++ .../pipes/core/UppercasingContentHandler.java | 104 ------------------ .../UppercasingContentHandlerFactory.java | 41 ------- .../pipes/core/async/MockDigesterFactory.java | 49 --------- .../azblob/TestAZBlobPipesIterator.java | 7 +- .../iterator/csv/TestCSVPipesIterator.java | 8 +- .../iterator/gcs/TestGCSPipesIterator.java | 8 +- .../iterator/jdbc/TestJDBCPipesIterator.java | 9 +- .../kafka/TestKafkaPipesIterator.java | 7 +- .../iterator/s3/TestS3PipesIterator.java | 7 +- .../loader/TikaObjectMapperFactory.java | 1 + .../serialization/ComponentNameResolver.java | 5 + .../ParseContextDeserializer.java | 29 +++-- .../TestParseContextSerialization.java | 26 +++++ .../server/core/resource/TikaResource.java | 5 + 16 files changed, 85 insertions(+), 231 deletions(-) delete mode 100644 tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/UppercasingContentHandler.java delete mode 100644 tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/UppercasingContentHandlerFactory.java delete mode 100644 tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/async/MockDigesterFactory.java diff --git a/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java 
b/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java index 273dfeda1cf..9e818627c62 100644 --- a/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java +++ b/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java @@ -77,6 +77,8 @@ public class TikaComponentProcessor extends AbstractProcessor { SERVICE_INTERFACES.put("org.apache.tika.renderer.Renderer", "renderers"); SERVICE_INTERFACES.put("org.apache.tika.metadata.filter.MetadataFilter", "metadata-filters"); SERVICE_INTERFACES.put("org.apache.tika.digest.DigesterFactory", "digester-factories"); + SERVICE_INTERFACES.put("org.apache.tika.sax.ContentHandlerFactory", + "content-handler-factories"); } private Messager messager; diff --git a/tika-pipes/tika-pipes-integration-tests/pom.xml b/tika-pipes/tika-pipes-integration-tests/pom.xml index 1dd82c28c9b..7a2495932a1 100644 --- a/tika-pipes/tika-pipes-integration-tests/pom.xml +++ b/tika-pipes/tika-pipes-integration-tests/pom.xml @@ -141,6 +141,14 @@ + + org.apache.maven.plugins + maven-surefire-plugin + + + false + + diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/UppercasingContentHandler.java b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/UppercasingContentHandler.java deleted file mode 100644 index f59b9d0c59a..00000000000 --- a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/UppercasingContentHandler.java +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.pipes.core; - -import java.util.Locale; - -import org.xml.sax.Attributes; -import org.xml.sax.ContentHandler; -import org.xml.sax.Locator; -import org.xml.sax.SAXException; - -/** - * A ContentHandler decorator that converts all character content to uppercase. - * Used for testing custom ContentHandlerFactory configurations. - */ -public class UppercasingContentHandler implements ContentHandler { - - private final ContentHandler delegate; - - public UppercasingContentHandler(ContentHandler delegate) { - this.delegate = delegate; - } - - @Override - public void setDocumentLocator(Locator locator) { - delegate.setDocumentLocator(locator); - } - - @Override - public void startDocument() throws SAXException { - delegate.startDocument(); - } - - @Override - public void endDocument() throws SAXException { - delegate.endDocument(); - } - - @Override - public void startPrefixMapping(String prefix, String uri) throws SAXException { - delegate.startPrefixMapping(prefix, uri); - } - - @Override - public void endPrefixMapping(String prefix) throws SAXException { - delegate.endPrefixMapping(prefix); - } - - @Override - public void startElement(String uri, String localName, String qName, Attributes atts) - throws SAXException { - delegate.startElement(uri, localName, qName, atts); - } - - @Override - public void endElement(String uri, String localName, String qName) throws SAXException { - delegate.endElement(uri, localName, qName); - } - - @Override - public void characters(char[] ch, int start, int length) throws SAXException { - // 
Convert characters to uppercase - char[] upper = new String(ch, start, length).toUpperCase(Locale.ROOT).toCharArray(); - delegate.characters(upper, 0, upper.length); - } - - @Override - public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { - delegate.ignorableWhitespace(ch, start, length); - } - - @Override - public void processingInstruction(String target, String data) throws SAXException { - delegate.processingInstruction(target, data); - } - - @Override - public void skippedEntity(String name) throws SAXException { - delegate.skippedEntity(name); - } - - /** - * Returns the underlying delegate handler's string representation, - * which typically contains the extracted content. - */ - @Override - public String toString() { - return delegate.toString(); - } -} diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/UppercasingContentHandlerFactory.java b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/UppercasingContentHandlerFactory.java deleted file mode 100644 index db2ffdfec7d..00000000000 --- a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/UppercasingContentHandlerFactory.java +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.pipes.core; - -import org.xml.sax.ContentHandler; - -import org.apache.tika.config.TikaComponent; -import org.apache.tika.sax.ContentHandlerFactory; -import org.apache.tika.sax.ToTextContentHandler; - -/** - * A ContentHandlerFactory that creates UppercasingContentHandler instances. - * This factory wraps a ToTextContentHandler with an uppercasing decorator - * to convert all extracted text to uppercase. - *

- * Used for testing custom ContentHandlerFactory configurations in tika-pipes. - */ -@TikaComponent(contextKey = ContentHandlerFactory.class) -public class UppercasingContentHandlerFactory implements ContentHandlerFactory { - - private static final long serialVersionUID = 1L; - - @Override - public ContentHandler createHandler() { - return new UppercasingContentHandler(new ToTextContentHandler()); - } -} diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/async/MockDigesterFactory.java b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/async/MockDigesterFactory.java deleted file mode 100644 index 9df3e9866f3..00000000000 --- a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/async/MockDigesterFactory.java +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tika.pipes.core.async; - -import org.apache.tika.config.TikaComponent; -import org.apache.tika.digest.Digester; -import org.apache.tika.digest.DigesterFactory; -import org.apache.tika.digest.Encoder; -import org.apache.tika.digest.InputStreamDigester; - -@TikaComponent -public class MockDigesterFactory implements DigesterFactory { - - @Override - public Digester build() { - return new InputStreamDigester(1000000, "SHA-256", "X-TIKA:digest:SHA-256", new MockEncoder()); - } - - private static class MockEncoder implements Encoder { - - @Override - public String encode(byte[] bytes) { - StringBuilder hexString = new StringBuilder(2 * bytes.length); - for (int i = 0; i < bytes.length; i++) { - String hex = Integer.toHexString(0xff & bytes[i]); - if (hex.length() == 1) { - hexString.append('0'); - } - hexString.append(hex); - } - return hexString.toString(); - } - } - -} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/java/org/apache/tika/pipes/iterator/azblob/TestAZBlobPipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/java/org/apache/tika/pipes/iterator/azblob/TestAZBlobPipesIterator.java index 298b16ebc84..4c81e4ae8df 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/java/org/apache/tika/pipes/iterator/azblob/TestAZBlobPipesIterator.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/java/org/apache/tika/pipes/iterator/azblob/TestAZBlobPipesIterator.java @@ -48,10 +48,9 @@ public void testSimple() throws Exception { configNode.put("endpoint", ""); // use one configNode.put("sasToken", ""); // find one - ObjectNode baseConfigNode = MAPPER.createObjectNode(); - baseConfigNode.put("fetcherId", "az-blob"); - baseConfigNode.put("emitterId", "test-emitter"); - configNode.set("baseConfig", baseConfigNode); + // Add fetcherId and emitterId at root level (not nested in baseConfig) + configNode.put("fetcherId", "az-blob"); + configNode.put("emitterId", 
"test-emitter"); ExtensionConfig extensionConfig = new ExtensionConfig("test-az-blob", "az-blob-pipes-iterator", MAPPER.writeValueAsString(configNode)); diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/test/java/org/apache/tika/pipes/iterator/csv/TestCSVPipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/test/java/org/apache/tika/pipes/iterator/csv/TestCSVPipesIterator.java index d423119e9d9..b2f090231b3 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/test/java/org/apache/tika/pipes/iterator/csv/TestCSVPipesIterator.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/test/java/org/apache/tika/pipes/iterator/csv/TestCSVPipesIterator.java @@ -114,11 +114,9 @@ private CSVPipesIterator createIterator(Path csvPath, String fetcherName, String jsonConfig.put("idColumn", idColumn); } - // Add baseConfig - ObjectNode baseConfig = OBJECT_MAPPER.createObjectNode(); - baseConfig.put("fetcherId", fetcherName); - baseConfig.put("emitterId", emitterName); - jsonConfig.set("baseConfig", baseConfig); + // Add fetcherId and emitterId at root level (not nested in baseConfig) + jsonConfig.put("fetcherId", fetcherName); + jsonConfig.put("emitterId", emitterName); ExtensionConfig extensionConfig = new ExtensionConfig("test-csv-iterator", "csv-pipes-iterator", OBJECT_MAPPER.writeValueAsString(jsonConfig)); diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/java/org/apache/tika/pipes/iterator/gcs/TestGCSPipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/java/org/apache/tika/pipes/iterator/gcs/TestGCSPipesIterator.java index a8f2310b71e..3af3c4342c2 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/java/org/apache/tika/pipes/iterator/gcs/TestGCSPipesIterator.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/java/org/apache/tika/pipes/iterator/gcs/TestGCSPipesIterator.java @@ -86,11 +86,9 @@ private GCSPipesIterator createIterator(String bucket, String projectId, 
String jsonConfig.put("prefix", prefix); } - // Add baseConfig - ObjectNode baseConfig = OBJECT_MAPPER.createObjectNode(); - baseConfig.put("fetcherId", fetcherName); - baseConfig.put("emitterId", emitterName); - jsonConfig.set("baseConfig", baseConfig); + // Add fetcherId and emitterId at root level (not nested in baseConfig) + jsonConfig.put("fetcherId", fetcherName); + jsonConfig.put("emitterId", emitterName); ExtensionConfig extensionConfig = new ExtensionConfig("test-gcs-iterator", "gcs-pipes-iterator", OBJECT_MAPPER.writeValueAsString(jsonConfig)); diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/java/org/apache/tika/pipes/iterator/jdbc/TestJDBCPipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/java/org/apache/tika/pipes/iterator/jdbc/TestJDBCPipesIterator.java index 30aac2d57f8..ce7c5989433 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/java/org/apache/tika/pipes/iterator/jdbc/TestJDBCPipesIterator.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/java/org/apache/tika/pipes/iterator/jdbc/TestJDBCPipesIterator.java @@ -163,12 +163,9 @@ private JDBCPipesIterator createIterator() throws Exception { jsonConfig.put("fetchKeyColumn", "my_fetchkey"); jsonConfig.put("emitKeyColumn", "my_fetchkey"); - // Add baseConfig - ObjectNode baseConfig = OBJECT_MAPPER.createObjectNode(); - baseConfig.put("fetcherId", "s3f"); - baseConfig.put("emitterId", "s3e"); - baseConfig.put("queueSize", 57); - jsonConfig.set("baseConfig", baseConfig); + // Add fetcherId and emitterId at root level (not nested in baseConfig) + jsonConfig.put("fetcherId", "s3f"); + jsonConfig.put("emitterId", "s3e"); ExtensionConfig extensionConfig = new ExtensionConfig("test-jdbc-iterator", "jdbc-pipes-iterator", OBJECT_MAPPER.writeValueAsString(jsonConfig)); diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/test/java/org/apache/tika/pipes/iterator/kafka/TestKafkaPipesIterator.java 
b/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/test/java/org/apache/tika/pipes/iterator/kafka/TestKafkaPipesIterator.java index f71c0b4db86..00e8e364203 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/test/java/org/apache/tika/pipes/iterator/kafka/TestKafkaPipesIterator.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/test/java/org/apache/tika/pipes/iterator/kafka/TestKafkaPipesIterator.java @@ -49,10 +49,9 @@ public void testSimple() throws Exception { configNode.put("bootstrapServers", ""); // use one configNode.put("groupId", ""); // find one - ObjectNode baseConfigNode = MAPPER.createObjectNode(); - baseConfigNode.put("fetcherId", "kafka"); - baseConfigNode.put("emitterId", "test-emitter"); - configNode.set("baseConfig", baseConfigNode); + // Add fetcherId and emitterId at root level (not nested in baseConfig) + configNode.put("fetcherId", "kafka"); + configNode.put("emitterId", "test-emitter"); ExtensionConfig extensionConfig = new ExtensionConfig("test-kafka", "kafka-pipes-iterator", MAPPER.writeValueAsString(configNode)); diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/java/org/apache/tika/pipes/iterator/s3/TestS3PipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/java/org/apache/tika/pipes/iterator/s3/TestS3PipesIterator.java index d840fc29509..4104b54e698 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/java/org/apache/tika/pipes/iterator/s3/TestS3PipesIterator.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/java/org/apache/tika/pipes/iterator/s3/TestS3PipesIterator.java @@ -50,10 +50,9 @@ public void testSimple() throws Exception { jsonConfig.put("profile", ""); // use one jsonConfig.put("credentialsProvider", "profile"); - ObjectNode baseConfig = OBJECT_MAPPER.createObjectNode(); - baseConfig.put("fetcherId", "s3"); - baseConfig.put("emitterId", "fs"); - jsonConfig.set("baseConfig", baseConfig); + // Add fetcherId and emitterId at root level (not 
nested in baseConfig) + jsonConfig.put("fetcherId", "s3"); + jsonConfig.put("emitterId", "fs"); ExtensionConfig extensionConfig = new ExtensionConfig("test-s3-iterator", "s3-pipes-iterator", OBJECT_MAPPER.writeValueAsString(jsonConfig)); diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java index 93716884d6d..1ca9cc45efa 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java @@ -47,6 +47,7 @@ public class TikaObjectMapperFactory { "renderers", "translators", "digester-factories", + "content-handler-factories", "other-configs" }; diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/ComponentNameResolver.java b/tika-serialization/src/main/java/org/apache/tika/serialization/ComponentNameResolver.java index 0b7d9a70040..e53837f2107 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/ComponentNameResolver.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/ComponentNameResolver.java @@ -110,6 +110,11 @@ public static boolean hasComponent(String name) { * @return Optional containing the ComponentInfo, or empty if not found */ public static Optional getComponentInfo(String name) { + if (REGISTRIES.isEmpty()) { + System.err.println("ComponentNameResolver: WARNING - No registries loaded! 
Looking for: " + name); + } else { + System.err.println("ComponentNameResolver: Looking up '" + name + "' in " + REGISTRIES.size() + " registries: " + REGISTRIES.keySet()); + } for (ComponentRegistry registry : REGISTRIES.values()) { if (registry.hasComponent(name)) { try { diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java index d575786779e..07d73a6437d 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java @@ -113,6 +113,7 @@ public static ParseContext readParseContext(JsonNode jsonNode) throws IOExceptio Class contextKey = null; // The key to use when adding to ParseContext if (keyClass == null) { Optional infoOpt = ComponentNameResolver.getComponentInfo(fieldName); + LOG.info("Looking up '{}' in registry, found: {}", fieldName, infoOpt.isPresent()); if (infoOpt.isPresent()) { ComponentInfo info = infoOpt.get(); keyClass = info.componentClass(); @@ -139,18 +140,28 @@ public static ParseContext readParseContext(JsonNode jsonNode) throws IOExceptio // Non-SelfConfiguring - deserialize directly into ParseContext try { // Check if fieldValue is a wrapper object format: {"concrete-class": {props}} + // Only treat as wrapper format if the single field name is a resolvable type Object value; if (fieldValue.isObject() && fieldValue.size() == 1) { String typeName = fieldValue.fieldNames().next(); - JsonNode configNode = fieldValue.get(typeName); - // Try to resolve the concrete class - try { - Class concreteClass = ComponentNameResolver.resolveClass(typeName, - ParseContextDeserializer.class.getClassLoader()); - value = MAPPER.treeToValue(configNode, concreteClass); - } catch (ClassNotFoundException ex) { - // Fall back to key class - value = MAPPER.treeToValue(configNode, 
keyClass); + // Check if typeName is resolvable as a component/class before treating + // as wrapper format. This prevents {"password": "value"} from being + // misinterpreted as wrapper format with type "password". + if (ComponentNameResolver.hasComponent(typeName) || + typeName.contains(".")) { + JsonNode configNode = fieldValue.get(typeName); + // Try to resolve the concrete class + try { + Class concreteClass = ComponentNameResolver.resolveClass(typeName, + ParseContextDeserializer.class.getClassLoader()); + value = MAPPER.treeToValue(configNode, concreteClass); + } catch (ClassNotFoundException ex) { + // Fall back to deserializing the whole fieldValue with key class + value = MAPPER.treeToValue(fieldValue, keyClass); + } + } else { + // Single field but not a type name - deserialize as properties + value = MAPPER.treeToValue(fieldValue, keyClass); } } else { // Not wrapper format, deserialize directly diff --git a/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java b/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java index 4a300830cf9..a36db264515 100644 --- a/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java +++ b/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java @@ -43,6 +43,8 @@ import org.apache.tika.metadata.filter.MetadataFilter; import org.apache.tika.metadata.filter.MockUpperCaseFilter; import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.PasswordProvider; +import org.apache.tika.parser.SimplePasswordProvider; /** * Tests for ParseContext serialization/deserialization. 
@@ -373,4 +375,28 @@ public void testContextKeyDeserialization() throws Exception { assertFalse(selector.select(new org.apache.tika.metadata.Metadata()), "SkipEmbeddedDocumentSelector should return false for all documents"); } + + @Test + public void testSimplePasswordProviderDeserialization() throws Exception { + // Test that SimplePasswordProvider with contextKey=PasswordProvider.class + // is stored in ParseContext with the contextKey + String json = """ + { + "simple-password-provider": { + "password": "secret123" + } + } + """; + + ObjectMapper mapper = createMapper(); + ParseContext deserialized = mapper.readValue(json, ParseContext.class); + + // Should be accessible via PasswordProvider.class (the contextKey) + PasswordProvider provider = deserialized.get(PasswordProvider.class); + assertNotNull(provider, "PasswordProvider should be found via contextKey"); + assertTrue(provider instanceof SimplePasswordProvider, + "Should be SimplePasswordProvider instance"); + assertEquals("secret123", provider.getPassword(null), + "Password should match the configured value"); + } } diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java index 0df8e7b498b..c1d8143729b 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java @@ -149,11 +149,16 @@ public static void mergeParseContextFromConfig(String configJson, ParseContext c JsonNode root = mapper.readTree(configJson); // Use root directly - the JSON should contain parser configs at the top level ParseContext configuredContext = ParseContextDeserializer.readParseContext(root); + LOG.info("After readParseContext, configuredContext has {} entries: {}", + configuredContext.getContextMap().size(), 
configuredContext.getContextMap().keySet()); ParseContextUtils.resolveAll(configuredContext, Thread.currentThread().getContextClassLoader()); + LOG.info("After resolveAll, configuredContext has {} entries: {}", + configuredContext.getContextMap().size(), configuredContext.getContextMap().keySet()); for (Map.Entry entry : configuredContext.getContextMap().entrySet()) { try { Class clazz = Class.forName(entry.getKey()); context.set((Class) clazz, entry.getValue()); + LOG.info("Merged entry {} into context", entry.getKey()); } catch (ClassNotFoundException e) { LOG.warn("Could not load class for parseContext entry: {}", entry.getKey()); } From 0b2e205daec93e48766fe080677a19ae407808bf Mon Sep 17 00:00:00 2001 From: tallison Date: Tue, 30 Dec 2025 22:36:56 -0500 Subject: [PATCH 5/5] TIKA-4582 -- cleanup --- .../org/apache/tika/config/TikaComponent.java | 6 ++--- .../serialization/JsonFetchEmitTuple.java | 23 ++++++++----------- .../serialization/ComponentNameResolver.java | 19 ++++++++++----- .../tika/serialization/JsonMetadata.java | 8 +++---- .../tika/serialization/JsonMetadataList.java | 8 +++---- .../apache/tika/serialization/TikaModule.java | 14 +++++++++++ .../serdes/ParseContextSerializer.java | 12 +++++++--- 7 files changed, 55 insertions(+), 35 deletions(-) diff --git a/tika-annotation-processor/src/main/java/org/apache/tika/config/TikaComponent.java b/tika-annotation-processor/src/main/java/org/apache/tika/config/TikaComponent.java index f1aac4afb03..69e42570b30 100644 --- a/tika-annotation-processor/src/main/java/org/apache/tika/config/TikaComponent.java +++ b/tika-annotation-processor/src/main/java/org/apache/tika/config/TikaComponent.java @@ -34,8 +34,8 @@ *

  • Component index files (META-INF/tika/{type}.idx) for name-based lookup
  • * * - *

    This annotation is used at compile time by the annotation processor and - * at runtime for contextKey resolution via reflection. + *

    This annotation is processed at compile time by the annotation processor. + * The contextKey is recorded in the .idx file for runtime resolution. * *

    Example usage: *

    @@ -62,7 +62,7 @@
      *
      * @since 3.1.0
      */
    -@Retention(RetentionPolicy.RUNTIME)
    +@Retention(RetentionPolicy.CLASS)
     @Target(ElementType.TYPE)
     public @interface TikaComponent {
     
    diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTuple.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTuple.java
    index cdf89be42d8..278410e7dbf 100644
    --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTuple.java
    +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTuple.java
    @@ -24,25 +24,22 @@
     import com.fasterxml.jackson.databind.ObjectMapper;
     import com.fasterxml.jackson.databind.module.SimpleModule;
     
    -import org.apache.tika.metadata.Metadata;
    -import org.apache.tika.parser.ParseContext;
    +import org.apache.tika.config.loader.TikaObjectMapperFactory;
     import org.apache.tika.pipes.api.FetchEmitTuple;
    -import org.apache.tika.serialization.serdes.MetadataSerializer;
    -import org.apache.tika.serialization.serdes.ParseContextDeserializer;
    -import org.apache.tika.serialization.serdes.ParseContextSerializer;
     
     public class JsonFetchEmitTuple {
     
    -    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
    +    private static final ObjectMapper OBJECT_MAPPER;
     
         static {
    -        SimpleModule module = new SimpleModule();
    -        module.addDeserializer(FetchEmitTuple.class, new FetchEmitTupleDeserializer());
    -        module.addSerializer(FetchEmitTuple.class, new FetchEmitTupleSerializer());
    -        module.addSerializer(Metadata.class, new MetadataSerializer());
    -        module.addSerializer(ParseContext.class, new ParseContextSerializer());
    -        module.addDeserializer(ParseContext.class, new ParseContextDeserializer());
    -        OBJECT_MAPPER.registerModule(module);
    +        // Use TikaObjectMapperFactory which provides TikaModule with Metadata/ParseContext serializers
    +        OBJECT_MAPPER = TikaObjectMapperFactory.createMapper();
    +
    +        // Add FetchEmitTuple-specific serializers
    +        SimpleModule fetchEmitModule = new SimpleModule();
    +        fetchEmitModule.addDeserializer(FetchEmitTuple.class, new FetchEmitTupleDeserializer());
    +        fetchEmitModule.addSerializer(FetchEmitTuple.class, new FetchEmitTupleSerializer());
    +        OBJECT_MAPPER.registerModule(fetchEmitModule);
         }
     
         public static FetchEmitTuple fromJson(Reader reader) throws IOException {
    diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/ComponentNameResolver.java b/tika-serialization/src/main/java/org/apache/tika/serialization/ComponentNameResolver.java
    index fa1a0394dd0..195cfd6df05 100644
    --- a/tika-serialization/src/main/java/org/apache/tika/serialization/ComponentNameResolver.java
    +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/ComponentNameResolver.java
    @@ -205,16 +205,23 @@ public static Set getComponentFields() {
         }
     
         /**
    -     * Gets the contextKey for a class from its TikaComponent annotation.
    +     * Gets the contextKey for a class from the component registry.
    +     * The contextKey is recorded in the .idx file by the annotation processor.
          *
          * @param clazz the class to check
    -     * @return the contextKey class if specified, or null if not annotated or no contextKey
    +     * @return the contextKey class if specified, or null if not registered or no contextKey
          */
         public static Class getContextKey(Class clazz) {
    -        org.apache.tika.config.TikaComponent annotation =
    -                clazz.getAnnotation(org.apache.tika.config.TikaComponent.class);
    -        if (annotation != null && annotation.contextKey() != void.class) {
    -            return annotation.contextKey();
    +        for (ComponentRegistry registry : REGISTRIES.values()) {
    +            String friendlyName = registry.getFriendlyName(clazz);
    +            if (friendlyName != null) {
    +                try {
    +                    ComponentInfo info = registry.getComponentInfo(friendlyName);
    +                    return info.contextKey();
    +                } catch (TikaConfigException e) {
    +                    // continue to next registry
    +                }
    +            }
             }
             return null;
         }
    diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadata.java b/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadata.java
    index 16607e2ade3..049d9d0327e 100644
    --- a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadata.java
    +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadata.java
    @@ -27,7 +27,6 @@
     import com.fasterxml.jackson.databind.module.SimpleModule;
     
     import org.apache.tika.metadata.Metadata;
    -import org.apache.tika.serialization.serdes.MetadataDeserializer;
     import org.apache.tika.serialization.serdes.MetadataSerializer;
     
     public class JsonMetadata {
    @@ -56,13 +55,12 @@ private static void rebuildObjectMappers() {
             JsonFactory factory = new JsonFactory();
             factory.setStreamReadConstraints(streamReadConstraints);
     
    +        // Use TikaModule which includes Metadata serializers
             ObjectMapper mapper = new ObjectMapper(factory);
    -        SimpleModule baseModule = new SimpleModule();
    -        baseModule.addDeserializer(Metadata.class, new MetadataDeserializer());
    -        baseModule.addSerializer(Metadata.class, new MetadataSerializer());
    -        mapper.registerModule(baseModule);
    +        mapper.registerModule(new TikaModule());
             OBJECT_MAPPER = mapper;
     
    +        // Pretty printer needs custom serializer with sort flag
             ObjectMapper prettyMapper = new ObjectMapper(factory);
             SimpleModule prettySerializerModule = new SimpleModule();
             prettySerializerModule.addSerializer(Metadata.class, new MetadataSerializer(true));
    diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java b/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java
    index 2571c4c4b95..21f413087fd 100644
    --- a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java
    +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java
    @@ -28,7 +28,6 @@
     import com.fasterxml.jackson.databind.module.SimpleModule;
     
     import org.apache.tika.metadata.Metadata;
    -import org.apache.tika.serialization.serdes.MetadataDeserializer;
     import org.apache.tika.serialization.serdes.MetadataSerializer;
     
     public class JsonMetadataList {
    @@ -57,13 +56,12 @@ private static void rebuildObjectMappers() {
             JsonFactory factory = new JsonFactory();
             factory.setStreamReadConstraints(streamReadConstraints);
     
    +        // Use TikaModule which includes Metadata serializers
             ObjectMapper mapper = new ObjectMapper(factory);
    -        SimpleModule baseModule = new SimpleModule();
    -        baseModule.addDeserializer(Metadata.class, new MetadataDeserializer());
    -        baseModule.addSerializer(Metadata.class, new MetadataSerializer());
    -        mapper.registerModule(baseModule);
    +        mapper.registerModule(new TikaModule());
             OBJECT_MAPPER = mapper;
     
    +        // Pretty printer needs custom serializer with sort flag
             ObjectMapper prettyMapper = new ObjectMapper(factory);
             SimpleModule prettySerializerModule = new SimpleModule();
             prettySerializerModule.addSerializer(Metadata.class, new MetadataSerializer(true));
    diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java b/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java
    index d21f31b5945..249f7f71cfb 100644
    --- a/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java
    +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java
    @@ -53,11 +53,13 @@
     import org.apache.tika.exception.TikaConfigException;
     import org.apache.tika.extractor.EmbeddedDocumentExtractorFactory;
     import org.apache.tika.language.translate.Translator;
    +import org.apache.tika.metadata.Metadata;
     import org.apache.tika.metadata.filter.MetadataFilter;
     import org.apache.tika.metadata.writefilter.MetadataWriteFilterFactory;
     import org.apache.tika.mime.MediaType;
     import org.apache.tika.mime.MimeTypes;
     import org.apache.tika.parser.DefaultParser;
    +import org.apache.tika.parser.ParseContext;
     import org.apache.tika.parser.Parser;
     import org.apache.tika.parser.ParserDecorator;
     import org.apache.tika.renderer.Renderer;
    @@ -65,6 +67,10 @@
     import org.apache.tika.sax.ContentHandlerFactory;
     import org.apache.tika.serialization.serdes.DefaultDetectorSerializer;
     import org.apache.tika.serialization.serdes.DefaultParserSerializer;
    +import org.apache.tika.serialization.serdes.MetadataDeserializer;
    +import org.apache.tika.serialization.serdes.MetadataSerializer;
    +import org.apache.tika.serialization.serdes.ParseContextDeserializer;
    +import org.apache.tika.serialization.serdes.ParseContextSerializer;
     
     /**
      * Jackson module that provides compact serialization for Tika components.
    @@ -122,6 +128,14 @@ private static boolean usesCompactFormat(Class type) {
     
         public TikaModule() {
             super("TikaModule");
    +
    +        // Register Metadata serializers
    +        addSerializer(Metadata.class, new MetadataSerializer());
    +        addDeserializer(Metadata.class, new MetadataDeserializer());
    +
    +        // Register ParseContext serializers
    +        addSerializer(ParseContext.class, new ParseContextSerializer());
    +        addDeserializer(ParseContext.class, new ParseContextDeserializer());
         }
     
         /**
    diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java
    index 6497e7a877d..e2545d4033a 100644
    --- a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java
    +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java
    @@ -49,13 +49,19 @@ public class ParseContextSerializer extends JsonSerializer {
         public static final String PARSE_CONTEXT = "parseContext";
         public static final String TYPED = "typed";
     
    +    // Plain mapper for serializing values without TikaModule's component wrapping
    +    private static final ObjectMapper PLAIN_MAPPER = new ObjectMapper();
    +
    +    static {
    +        // Allow serialization of classes with no properties
    +        PLAIN_MAPPER.disable(com.fasterxml.jackson.databind.SerializationFeature.FAIL_ON_EMPTY_BEANS);
    +    }
    +
         @Override
         public void serialize(ParseContext parseContext, JsonGenerator gen,
                              SerializerProvider serializers) throws IOException {
             gen.writeStartObject();
     
    -        ObjectMapper mapper = (ObjectMapper) gen.getCodec();
    -
             // First, serialize typed objects from the context map under "typed" key
             Map contextMap = parseContext.getContextMap();
             boolean hasTypedObjects = false;
    @@ -85,7 +91,7 @@ public void serialize(ParseContext parseContext, JsonGenerator gen,
                     hasTypedObjects = true;
                 }
                 gen.writeFieldName(keyName);
    -            gen.writeRawValue(mapper.writeValueAsString(value));
    +            gen.writeRawValue(PLAIN_MAPPER.writeValueAsString(value));
             }
     
             if (hasTypedObjects) {