diff --git a/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java b/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java
index 273dfeda1cf..9e818627c62 100644
--- a/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java
+++ b/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java
@@ -77,6 +77,8 @@ public class TikaComponentProcessor extends AbstractProcessor {
         SERVICE_INTERFACES.put("org.apache.tika.renderer.Renderer", "renderers");
         SERVICE_INTERFACES.put("org.apache.tika.metadata.filter.MetadataFilter", "metadata-filters");
         SERVICE_INTERFACES.put("org.apache.tika.digest.DigesterFactory", "digester-factories");
+        SERVICE_INTERFACES.put("org.apache.tika.sax.ContentHandlerFactory",
+                "content-handler-factories");
     }

     private Messager messager;
diff --git a/tika-annotation-processor/src/main/java/org/apache/tika/config/TikaComponent.java b/tika-annotation-processor/src/main/java/org/apache/tika/config/TikaComponent.java
index e7f35814cb9..69e42570b30 100644
--- a/tika-annotation-processor/src/main/java/org/apache/tika/config/TikaComponent.java
+++ b/tika-annotation-processor/src/main/java/org/apache/tika/config/TikaComponent.java
@@ -34,8 +34,8 @@
  *
  * Component index files (META-INF/tika/{type}.idx) for name-based lookup
  *
- * This annotation is only used at compile time by the annotation processor.
- * It is retained in .class files for tooling but not loaded by the runtime JVM.
+ * This annotation is processed at compile time by the annotation processor.
+ * The contextKey is recorded in the .idx file for runtime resolution.
  *
  * Example usage:
  *
diff --git a/tika-app/src/test/resources/configs/config-template.json b/tika-app/src/test/resources/configs/config-template.json
    index dc73dadfe1f..e25bc96833a 100644
    --- a/tika-app/src/test/resources/configs/config-template.json
    +++ b/tika-app/src/test/resources/configs/config-template.json
    @@ -1,4 +1,12 @@
     {
    +  "content-handler-factory": {
    +    "basic-content-handler-factory": {
    +      "type": "TEXT",
    +      "writeLimit": -1,
    +      "maxEmbeddedResources": -1,
    +      "throwOnWriteLimitReached": true
    +    }
    +  },
       "fetchers": {
         "fsf": {
           "file-system-fetcher": {
    @@ -21,23 +29,13 @@
         "file-system-pipes-iterator": {
           "basePath": "FETCHER_BASE_PATH",
           "countTotal": true,
    -      "baseConfig": {
    -        "fetcherId": "fsf",
    -        "emitterId": "fse",
    -        "handlerConfig": {
    -          "type": "TEXT",
    -          "parseMode": "RMETA",
    -          "writeLimit": -1,
    -          "maxEmbeddedResources": -1,
    -          "throwOnWriteLimitReached": true
    -        },
    -        "onParseException": "EMIT",
    -        "maxWaitMs": 600000,
    -        "queueSize": 10000
    -      }
    +      "fetcherId": "fsf",
    +      "emitterId": "fse"
         }
       },
       "pipes": {
    +    "parseMode": "RMETA",
    +    "onParseException": "EMIT",
         "emitWithinMillis": 10000,
         "emitMaxEstimatedBytes": 100000,
         "queueSize": 10000,
    diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
    index 0c0599ec765..07159dba01b 100644
    --- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
    +++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
    @@ -141,7 +141,7 @@ public void parse(TikaInputStream tis, ContentHandler recursiveParserWrapperHand
                     new EmbeddedParserDecorator(getWrappedParser(), "/", "/", parserState);
             context.set(Parser.class, decorator);
             ContentHandler localHandler =
    -                parserState.recursiveParserWrapperHandler.getNewContentHandler();
    +                parserState.recursiveParserWrapperHandler.createHandler();
             long started = System.currentTimeMillis();
             parserState.recursiveParserWrapperHandler.startDocument();
             int writeLimit = -1;
    @@ -241,7 +241,7 @@ public void parse(TikaInputStream tis, ContentHandler ignore, Metadata metadata,
                 metadata.set(TikaCoreProperties.EMBEDDED_ID, parserState.embeddedCount);
                 //get a fresh handler
                 ContentHandler localHandler =
    -                    parserState.recursiveParserWrapperHandler.getNewContentHandler();
    +                    parserState.recursiveParserWrapperHandler.createHandler();
                 parserState.recursiveParserWrapperHandler.startEmbeddedDocument(localHandler, metadata);
     
                 Parser preContextParser = context.get(Parser.class);
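For callers of RecursiveParserWrapper the rename is the only visible change here; a typical recursive-metadata run still looks roughly like the sketch below, which assumes the existing RecursiveParserWrapperHandler and AutoDetectParser APIs that are not part of this diff:

import java.nio.file.Paths;
import java.util.List;

import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.RecursiveParserWrapperHandler;

public class RmetaSketch {
    public static List<Metadata> parse(String path) throws Exception {
        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(new AutoDetectParser());
        // the handler asks its factory for a fresh ContentHandler per (embedded) document,
        // now via createHandler() rather than getNewContentHandler()
        RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
        try (TikaInputStream tis = TikaInputStream.get(Paths.get(path))) {
            wrapper.parse(tis, handler, new Metadata(), new ParseContext());
        }
        return handler.getMetadataList();
    }
}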
    diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
    index 568d61c2570..cc78a55be3e 100644
    --- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
    +++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
    @@ -260,7 +260,7 @@ private void parse(TikaInputStream tis, ContentHandler handler,
                     // If not, the user will get text from every parser
                     //  mushed together onto the one solitary handler...
                     if (handlerFactory != null) {
    -                    handler = handlerFactory.getNewContentHandler();
    +                    handler = handlerFactory.createHandler();
                     }
     
                     // Record that we used this parser
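Since ContentHandlerFactory is reduced to a single createHandler() method (see the interface change further down), a custom factory for callers like AbstractMultipleParser can be as small as the sketch below. Annotating a user-supplied factory with @TikaComponent, as this patch does for BasicContentHandlerFactory, is an assumption here:

import org.xml.sax.ContentHandler;

import org.apache.tika.config.TikaComponent;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ContentHandlerFactory;

@TikaComponent(contextKey = ContentHandlerFactory.class)
public class BodyOnlyContentHandlerFactory implements ContentHandlerFactory {

    // one fresh handler per document, so output from multiple parsers or embedded
    // documents is not "mushed together onto the one solitary handler"
    @Override
    public ContentHandler createHandler() {
        return new BodyContentHandler(-1);
    }
}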
    diff --git a/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java b/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java
    index 850ceb4147c..ea4efedf6b2 100644
    --- a/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java
    +++ b/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java
    @@ -16,9 +16,7 @@
      */
     package org.apache.tika.sax;
     
    -import java.io.OutputStream;
     import java.io.Serializable;
    -import java.nio.charset.Charset;
     
     import org.xml.sax.ContentHandler;
     import org.xml.sax.SAXException;
    @@ -55,12 +53,8 @@ public AbstractRecursiveParserWrapperHandler(ContentHandlerFactory contentHandle
             this.maxEmbeddedResources = maxEmbeddedResources;
         }
     
    -    public ContentHandler getNewContentHandler() {
    -        return contentHandlerFactory.getNewContentHandler();
    -    }
    -
    -    public ContentHandler getNewContentHandler(OutputStream os, Charset charset) {
    -        return contentHandlerFactory.getNewContentHandler(os, charset);
    +    public ContentHandler createHandler() {
    +        return contentHandlerFactory.createHandler();
         }
     
         /**
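Callers that relied on the removed getNewContentHandler(OutputStream, Charset) pass-through now work against StreamingContentHandlerFactory directly; a sketch, with the parser choice and output path as placeholders:

import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.xml.sax.ContentHandler;

import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.StreamingContentHandlerFactory;

public class StreamingSketch {
    public static void extractToFile(String in, String out) throws Exception {
        // BasicContentHandlerFactory implements StreamingContentHandlerFactory in this change
        StreamingContentHandlerFactory factory = new BasicContentHandlerFactory(
                BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1);
        Parser parser = new AutoDetectParser();
        try (OutputStream os = Files.newOutputStream(Paths.get(out));
                TikaInputStream tis = TikaInputStream.get(Paths.get(in))) {
            ContentHandler handler = factory.createHandler(os, StandardCharsets.UTF_8);
            parser.parse(tis, handler, new Metadata(), new ParseContext());
        }
    }
}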
    diff --git a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
    index 361b7817c72..2612ec8650b 100644
    --- a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
    +++ b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
    @@ -26,19 +26,30 @@
     import org.xml.sax.ContentHandler;
     import org.xml.sax.helpers.DefaultHandler;
     
    +import org.apache.tika.config.TikaComponent;
     import org.apache.tika.parser.ParseContext;
     
     /**
    - * Basic factory for creating common types of ContentHandlers
    + * Basic factory for creating common types of ContentHandlers.
    + * 

    + * Implements {@link StreamingContentHandlerFactory} to support both in-memory + * content extraction and streaming output to an OutputStream. */ -public class BasicContentHandlerFactory implements ContentHandlerFactory, WriteLimiter { +@TikaComponent(contextKey = ContentHandlerFactory.class) +public class BasicContentHandlerFactory implements StreamingContentHandlerFactory, WriteLimiter { - private final HANDLER_TYPE type; - private final int writeLimit; + private HANDLER_TYPE type = HANDLER_TYPE.TEXT; + private int writeLimit = -1; + private boolean throwOnWriteLimitReached = true; + private int maxEmbeddedResources = -1; + private transient ParseContext parseContext; - private final boolean throwOnWriteLimitReached; - - private final ParseContext parseContext; + /** + * No-arg constructor for bean-style configuration (e.g., Jackson deserialization). + * Creates a factory with TEXT handler type, unlimited write, and throwOnWriteLimitReached=true. + */ + public BasicContentHandlerFactory() { + } /** * Create a BasicContentHandlerFactory with {@link #throwOnWriteLimitReached} is true @@ -70,7 +81,29 @@ public BasicContentHandlerFactory(HANDLER_TYPE type, int writeLimit, throw new IllegalArgumentException("parse context must not be null if " + "throwOnWriteLimitReached is false"); } + } + /** + * Full constructor with all parameters including maxEmbeddedResources. + * + * @param type basic type of handler + * @param writeLimit maximum number of characters to store; -1 for unlimited + * @param throwOnWriteLimitReached whether to throw when write limit is reached + * @param maxEmbeddedResources maximum number of embedded resources to process; -1 for unlimited + * @param parseContext to store warnings if throwOnWriteLimitReached is false + */ + public BasicContentHandlerFactory(HANDLER_TYPE type, int writeLimit, + boolean throwOnWriteLimitReached, int maxEmbeddedResources, + ParseContext parseContext) { + this.type = type; + this.writeLimit = writeLimit; + this.throwOnWriteLimitReached = throwOnWriteLimitReached; + this.maxEmbeddedResources = maxEmbeddedResources; + this.parseContext = parseContext; + if (throwOnWriteLimitReached == false && parseContext == null) { + throw new IllegalArgumentException("parse context must not be null if " + + "throwOnWriteLimitReached is false"); + } } /** @@ -108,7 +141,7 @@ public static HANDLER_TYPE parseHandlerType(String handlerTypeName, HANDLER_TYPE } @Override - public ContentHandler getNewContentHandler() { + public ContentHandler createHandler() { if (type == HANDLER_TYPE.BODY) { return new BodyContentHandler( @@ -139,7 +172,7 @@ private ContentHandler getFormatHandler() { } @Override - public ContentHandler getNewContentHandler(OutputStream os, Charset charset) { + public ContentHandler createHandler(OutputStream os, Charset charset) { if (type == HANDLER_TYPE.IGNORE) { return new DefaultHandler(); @@ -191,6 +224,14 @@ public HANDLER_TYPE getType() { return type; } + /** + * Sets the handler type. + * @param type the handler type + */ + public void setType(HANDLER_TYPE type) { + this.type = type; + } + /** * Common handler types for content. */ @@ -203,8 +244,72 @@ public int getWriteLimit() { return writeLimit; } + /** + * Sets the write limit. 
+ * @param writeLimit max characters to extract; -1 for unlimited + */ + public void setWriteLimit(int writeLimit) { + this.writeLimit = writeLimit; + } + @Override public boolean isThrowOnWriteLimitReached() { return throwOnWriteLimitReached; } + + /** + * Sets whether to throw an exception when write limit is reached. + * @param throwOnWriteLimitReached true to throw, false to silently stop + */ + public void setThrowOnWriteLimitReached(boolean throwOnWriteLimitReached) { + this.throwOnWriteLimitReached = throwOnWriteLimitReached; + } + + /** + * Gets the maximum number of embedded resources to process. + * @return max embedded resources; -1 for unlimited + */ + public int getMaxEmbeddedResources() { + return maxEmbeddedResources; + } + + /** + * Sets the maximum number of embedded resources to process. + * @param maxEmbeddedResources max embedded resources; -1 for unlimited + */ + public void setMaxEmbeddedResources(int maxEmbeddedResources) { + this.maxEmbeddedResources = maxEmbeddedResources; + } + + /** + * Sets the parse context for storing warnings when throwOnWriteLimitReached is false. + * @param parseContext the parse context + */ + public void setParseContext(ParseContext parseContext) { + this.parseContext = parseContext; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + BasicContentHandlerFactory that = (BasicContentHandlerFactory) o; + return writeLimit == that.writeLimit && + throwOnWriteLimitReached == that.throwOnWriteLimitReached && + maxEmbeddedResources == that.maxEmbeddedResources && + type == that.type; + } + + @Override + public int hashCode() { + int result = type != null ? type.hashCode() : 0; + result = 31 * result + writeLimit; + result = 31 * result + (throwOnWriteLimitReached ? 1 : 0); + result = 31 * result + maxEmbeddedResources; + return result; + } } diff --git a/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java index dc2f3384fcf..4c7efd7231f 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java +++ b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java @@ -16,19 +16,27 @@ */ package org.apache.tika.sax; - -import java.io.OutputStream; import java.io.Serializable; -import java.nio.charset.Charset; import org.xml.sax.ContentHandler; /** - * Interface to allow easier injection of code for getting a new ContentHandler + * Factory interface for creating ContentHandler instances. + *

    + * This is the base interface used by tika-pipes, RecursiveParserWrapper, and other + * components that need to create content handlers for in-memory content extraction. + *

    + * For streaming output to an OutputStream, see {@link StreamingContentHandlerFactory}. + * + * @see StreamingContentHandlerFactory + * @see BasicContentHandlerFactory */ public interface ContentHandlerFactory extends Serializable { - ContentHandler getNewContentHandler(); - - ContentHandler getNewContentHandler(OutputStream os, Charset charset); + /** + * Creates a new ContentHandler for extracting content. + * + * @return a new ContentHandler instance + */ + ContentHandler createHandler(); } diff --git a/tika-core/src/main/java/org/apache/tika/sax/StreamingContentHandlerFactory.java b/tika-core/src/main/java/org/apache/tika/sax/StreamingContentHandlerFactory.java new file mode 100644 index 00000000000..02279c16972 --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/sax/StreamingContentHandlerFactory.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.sax; + +import java.io.OutputStream; +import java.nio.charset.Charset; + +import org.xml.sax.ContentHandler; + +/** + * Extended factory interface for creating ContentHandler instances that write + * directly to an OutputStream. + *

    + * This interface extends {@link ContentHandlerFactory} to add streaming output + * capability, primarily used by tika-server's /tika endpoint for streaming + * responses back to clients. + * + * @see ContentHandlerFactory + * @see BasicContentHandlerFactory + */ +public interface StreamingContentHandlerFactory extends ContentHandlerFactory { + + /** + * Creates a new ContentHandler that writes output directly to the given OutputStream. + * + * @param os the output stream to write to + * @param charset the character encoding to use + * @return a new ContentHandler instance that writes to the stream + */ + ContentHandler createHandler(OutputStream os, Charset charset); +} diff --git a/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java b/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java index 8a177c12ed4..bc6260d0a4a 100644 --- a/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java +++ b/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java @@ -73,7 +73,7 @@ public void testIgnore() throws Exception { Parser p = new MockParser(OVER_DEFAULT); ContentHandler handler = new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1) - .getNewContentHandler(); + .createHandler(); assertTrue(handler instanceof DefaultHandler); p.parse(null, handler, null, null); //unfortunatley, the DefaultHandler does not return "", @@ -82,7 +82,7 @@ public void testIgnore() throws Exception { //tests that no write limit exception is thrown p = new MockParser(100); handler = new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, 5) - .getNewContentHandler(); + .createHandler(); assertTrue(handler instanceof DefaultHandler); p.parse(null, handler, null, null); assertContains("org.xml.sax.helpers.DefaultHandler", handler.toString()); @@ -92,7 +92,7 @@ public void testIgnore() throws Exception { public void testText() throws Exception { Parser p = new MockParser(OVER_DEFAULT); BasicContentHandlerFactory.HANDLER_TYPE type = BasicContentHandlerFactory.HANDLER_TYPE.TEXT; - ContentHandler handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(); + ContentHandler handler = new BasicContentHandlerFactory(type, -1).createHandler(); assertTrue(handler instanceof ToTextContentHandler); p.parse(null, handler, null, null); @@ -104,7 +104,7 @@ public void testText() throws Exception { assertTrue(extracted.length() > 110000); //now test write limit p = new MockParser(10); - handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(); + handler = new BasicContentHandlerFactory(type, 5).createHandler(); assertTrue(handler instanceof WriteOutContentHandler); assertWriteLimitReached(p, (WriteOutContentHandler) handler); extracted = handler.toString(); @@ -114,7 +114,7 @@ public void testText() throws Exception { //now test outputstream call p = new MockParser(OVER_DEFAULT); ByteArrayOutputStream os = new ByteArrayOutputStream(); - handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, UTF_8); + handler = new BasicContentHandlerFactory(type, -1).createHandler(os, UTF_8); assertTrue(handler instanceof ToTextContentHandler); p.parse(null, handler, null, null); assertContains("This is the title", os.toByteArray()); @@ -125,7 +125,7 @@ public void testText() throws Exception { p = new MockParser(10); os = new ByteArrayOutputStream(); - handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(os, UTF_8); + handler = 
new BasicContentHandlerFactory(type, 5).createHandler(os, UTF_8); assertTrue(handler instanceof WriteOutContentHandler); assertWriteLimitReached(p, (WriteOutContentHandler) handler); //When writing to an OutputStream and a write limit is reached, @@ -137,7 +137,7 @@ public void testText() throws Exception { public void testHTML() throws Exception { Parser p = new MockParser(OVER_DEFAULT); BasicContentHandlerFactory.HANDLER_TYPE type = BasicContentHandlerFactory.HANDLER_TYPE.HTML; - ContentHandler handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(); + ContentHandler handler = new BasicContentHandlerFactory(type, -1).createHandler(); assertTrue(handler instanceof ToHTMLContentHandler); p.parse(null, handler, null, null); @@ -148,7 +148,7 @@ public void testHTML() throws Exception { //now test write limit p = new MockParser(10); - handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(); + handler = new BasicContentHandlerFactory(type, 5).createHandler(); assertTrue(handler instanceof WriteOutContentHandler); assertWriteLimitReached(p, (WriteOutContentHandler) handler); extracted = handler.toString(); @@ -158,7 +158,7 @@ public void testHTML() throws Exception { //now test outputstream call p = new MockParser(OVER_DEFAULT); ByteArrayOutputStream os = new ByteArrayOutputStream(); - handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, UTF_8); + handler = new BasicContentHandlerFactory(type, -1).createHandler(os, UTF_8); assertTrue(handler instanceof ToHTMLContentHandler); p.parse(null, handler, null, null); assertContains("This is the title", os.toByteArray()); @@ -170,7 +170,7 @@ public void testHTML() throws Exception { p = new MockParser(10); os = new ByteArrayOutputStream(); - handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(os, UTF_8); + handler = new BasicContentHandlerFactory(type, 5).createHandler(os, UTF_8); assertTrue(handler instanceof WriteOutContentHandler); assertWriteLimitReached(p, (WriteOutContentHandler) handler); assertEquals(0, os.toByteArray().length); @@ -180,7 +180,7 @@ public void testHTML() throws Exception { public void testXML() throws Exception { Parser p = new MockParser(OVER_DEFAULT); BasicContentHandlerFactory.HANDLER_TYPE type = BasicContentHandlerFactory.HANDLER_TYPE.HTML; - ContentHandler handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(); + ContentHandler handler = new BasicContentHandlerFactory(type, -1).createHandler(); assertTrue(handler instanceof ToXMLContentHandler); p.parse(null, handler, new Metadata(), null); @@ -191,7 +191,7 @@ public void testXML() throws Exception { //now test write limit p = new MockParser(10); - handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(); + handler = new BasicContentHandlerFactory(type, 5).createHandler(); assertTrue(handler instanceof WriteOutContentHandler); assertWriteLimitReached(p, (WriteOutContentHandler) handler); extracted = handler.toString(); @@ -201,7 +201,7 @@ public void testXML() throws Exception { //now test outputstream call p = new MockParser(OVER_DEFAULT); ByteArrayOutputStream os = new ByteArrayOutputStream(); - handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, UTF_8); + handler = new BasicContentHandlerFactory(type, -1).createHandler(os, UTF_8); assertTrue(handler instanceof ToXMLContentHandler); p.parse(null, handler, null, null); @@ -214,7 +214,7 @@ public void testXML() throws Exception { p = new MockParser(10); os = new ByteArrayOutputStream(); - 
handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(os, UTF_8); + handler = new BasicContentHandlerFactory(type, 5).createHandler(os, UTF_8); assertTrue(handler instanceof WriteOutContentHandler); assertWriteLimitReached(p, (WriteOutContentHandler) handler); assertEquals(0, os.toByteArray().length); @@ -224,7 +224,7 @@ public void testXML() throws Exception { public void testBody() throws Exception { Parser p = new MockParser(OVER_DEFAULT); BasicContentHandlerFactory.HANDLER_TYPE type = BasicContentHandlerFactory.HANDLER_TYPE.BODY; - ContentHandler handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(); + ContentHandler handler = new BasicContentHandlerFactory(type, -1).createHandler(); assertTrue(handler instanceof BodyContentHandler); @@ -236,7 +236,7 @@ public void testBody() throws Exception { //now test write limit p = new MockParser(10); - handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(); + handler = new BasicContentHandlerFactory(type, 5).createHandler(); assertTrue(handler instanceof BodyContentHandler); assertWriteLimitReached(p, (BodyContentHandler) handler); extracted = handler.toString(); @@ -246,7 +246,7 @@ public void testBody() throws Exception { //now test outputstream call p = new MockParser(OVER_DEFAULT); ByteArrayOutputStream os = new ByteArrayOutputStream(); - handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, UTF_8); + handler = new BasicContentHandlerFactory(type, -1).createHandler(os, UTF_8); assertTrue(handler instanceof BodyContentHandler); p.parse(null, handler, null, null); assertNotContains("title", os.toByteArray()); @@ -257,7 +257,7 @@ public void testBody() throws Exception { p = new MockParser(10); os = new ByteArrayOutputStream(); - handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(os, UTF_8); + handler = new BasicContentHandlerFactory(type, 5).createHandler(os, UTF_8); assertTrue(handler instanceof WriteOutContentHandler); assertWriteLimitReached(p, (WriteOutContentHandler) handler); assertEquals(0, os.toByteArray().length); diff --git a/tika-eval/tika-eval-app/src/main/resources/pipes-iterator-template.json b/tika-eval/tika-eval-app/src/main/resources/pipes-iterator-template.json index a5a7ddfad37..4ae623d6065 100644 --- a/tika-eval/tika-eval-app/src/main/resources/pipes-iterator-template.json +++ b/tika-eval/tika-eval-app/src/main/resources/pipes-iterator-template.json @@ -1,18 +1,6 @@ { "basePath": "FETCHER_BASE_PATH", "countTotal": true, - "baseConfig": { - "fetcherId": "fsf", - "emitterId": "", - "handlerConfig": { - "type": "TEXT", - "parseMode": "RMETA", - "writeLimit": -1, - "maxEmbeddedResources": -1, - "throwOnWriteLimitReached": true - }, - "onParseException": "EMIT", - "maxWaitMs": 600000, - "queueSize": 10000 - } -} \ No newline at end of file + "fetcherId": "fsf", + "emitterId": "" +} diff --git a/tika-example/src/main/java/org/apache/tika/example/PickBestTextEncodingParser.java b/tika-example/src/main/java/org/apache/tika/example/PickBestTextEncodingParser.java index 4796401ebf8..c42a562c898 100644 --- a/tika-example/src/main/java/org/apache/tika/example/PickBestTextEncodingParser.java +++ b/tika-example/src/main/java/org/apache/tika/example/PickBestTextEncodingParser.java @@ -17,7 +17,6 @@ package org.apache.tika.example; import java.io.IOException; -import java.io.OutputStream; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.HashMap; @@ -147,7 +146,7 @@ public void parse(TikaInputStream tis, 
ContentHandler handler, Metadata original public void parse(TikaInputStream tis, ContentHandlerFactory handlers, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { // We only work with one ContentHandler as far as the user is // concerned, any others are purely internal! - parse(tis, handlers.getNewContentHandler(), metadata, context); + parse(tis, handlers.createHandler(), metadata, context); } protected class CharsetContentHandlerFactory implements ContentHandlerFactory { @@ -157,18 +156,13 @@ protected class CharsetContentHandlerFactory implements ContentHandlerFactory { private ContentHandler handler; @Override - public ContentHandler getNewContentHandler() { + public ContentHandler createHandler() { index++; if (index < charsetsToTry.length) { return new BodyContentHandler(); } return handler; } - - @Override - public ContentHandler getNewContentHandler(OutputStream os, Charset charset) { - return getNewContentHandler(); - } } protected class CharsetTester { diff --git a/tika-example/src/main/java/org/apache/tika/example/PipesForkParserExample.java b/tika-example/src/main/java/org/apache/tika/example/PipesForkParserExample.java index e4439b801f6..4b69d10afa0 100644 --- a/tika-example/src/main/java/org/apache/tika/example/PipesForkParserExample.java +++ b/tika-example/src/main/java/org/apache/tika/example/PipesForkParserExample.java @@ -26,7 +26,7 @@ import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; -import org.apache.tika.pipes.api.HandlerConfig; +import org.apache.tika.pipes.api.ParseMode; import org.apache.tika.pipes.core.PipesException; import org.apache.tika.pipes.fork.PipesForkParser; import org.apache.tika.pipes.fork.PipesForkParserConfig; @@ -277,7 +277,7 @@ public void parseWithMetadata(Path filePath) public void parseEmbeddedDocumentsRmeta(Path filePath) throws IOException, InterruptedException, TikaException, PipesException { PipesForkParserConfig config = new PipesForkParserConfig() - .setParseMode(HandlerConfig.PARSE_MODE.RMETA); + .setParseMode(ParseMode.RMETA); try (PipesForkParser parser = new PipesForkParser(config); TikaInputStream tis = TikaInputStream.get(filePath)) { @@ -334,7 +334,7 @@ public void parseEmbeddedDocumentsRmeta(Path filePath) public void parseEmbeddedDocumentsConcatenate(Path filePath) throws IOException, InterruptedException, TikaException, PipesException { PipesForkParserConfig config = new PipesForkParserConfig() - .setParseMode(HandlerConfig.PARSE_MODE.CONCATENATE); + .setParseMode(ParseMode.CONCATENATE); try (PipesForkParser parser = new PipesForkParser(config); TikaInputStream tis = TikaInputStream.get(filePath)) { diff --git a/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/java/org/apache/tika/pipes/kafka/tests/TikaPipesKafkaTest.java b/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/java/org/apache/tika/pipes/kafka/tests/TikaPipesKafkaTest.java index e1b32ceb259..cdfb7391b99 100644 --- a/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/java/org/apache/tika/pipes/kafka/tests/TikaPipesKafkaTest.java +++ b/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/java/org/apache/tika/pipes/kafka/tests/TikaPipesKafkaTest.java @@ -64,7 +64,7 @@ import org.apache.tika.cli.TikaCLI; import org.apache.tika.config.JsonConfigHelper; -import org.apache.tika.pipes.api.HandlerConfig; +import org.apache.tika.pipes.api.ParseMode; import 
org.apache.tika.utils.SystemUtils; /** @@ -220,7 +220,7 @@ private Path getTikaConfig(Path pipesDirectory, Path testFileFolderPath) throws replacements.put("EMITTER_TOPIC", EMITTER_TOPIC); replacements.put("BOOTSTRAP_SERVERS", kafka.getBootstrapServers()); replacements.put("FETCHER_BASE_PATH", testFileFolderPath); - replacements.put("PARSE_MODE", HandlerConfig.PARSE_MODE.RMETA.name()); + replacements.put("PARSE_MODE", ParseMode.RMETA.name()); replacements.put("LOG4J_JVM_ARG", "-Dlog4j.configurationFile=" + log4jPropFile.toAbsolutePath()); JsonConfigHelper.writeConfigFromResource("/kafka/plugins-template.json", diff --git a/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/kafka/plugins-template.json b/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/kafka/plugins-template.json index 7dc28288517..128a1a8b441 100644 --- a/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/kafka/plugins-template.json +++ b/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/kafka/plugins-template.json @@ -1,4 +1,12 @@ { + "content-handler-factory": { + "basic-content-handler-factory": { + "type": "TEXT", + "writeLimit": -1, + "maxEmbeddedResources": -1, + "throwOnWriteLimitReached": true + } + }, "parsers": [ { "default-parser": {} @@ -77,23 +85,13 @@ "groupId": "grpid", "autoOffsetReset": "earliest", "pollDelayMs": 1000, - "baseConfig": { - "fetcherId": "fsf", - "emitterId": "ke", - "handlerConfig": { - "type": "TEXT", - "parseMode": "PARSE_MODE", - "writeLimit": -1, - "maxEmbeddedResources": -1, - "throwOnWriteLimitReached": true - }, - "onParseException": "EMIT", - "maxWaitMs": 600000, - "queueSize": 10000 - } + "fetcherId": "fsf", + "emitterId": "ke" } }, "pipes": { + "parseMode": "PARSE_MODE", + "onParseException": "EMIT", "emitMaxEstimatedBytes": 100000, "emitWithinMillis": 10, "numEmitters": 1, diff --git a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpenSearchTest.java b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpenSearchTest.java index f105be64acf..ee5145f2840 100644 --- a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpenSearchTest.java +++ b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpenSearchTest.java @@ -53,7 +53,7 @@ import org.apache.tika.exception.TikaConfigException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; -import org.apache.tika.pipes.api.HandlerConfig; +import org.apache.tika.pipes.api.ParseMode; import org.apache.tika.pipes.api.emitter.Emitter; import org.apache.tika.pipes.core.emitter.EmitterManager; import org.apache.tika.pipes.emitter.opensearch.HttpClientConfig; @@ -98,7 +98,7 @@ public void testPluginsConfig(@TempDir Path pipesDirectory) throws Exception { Path pluginsConfg = getPluginsConfig( pipesDirectory, OpenSearchEmitterConfig.AttachmentStrategy.PARENT_CHILD, OpenSearchEmitterConfig.UpdateStrategy.OVERWRITE, - HandlerConfig.PARSE_MODE.RMETA, "https://opensearch", Paths.get("testDocs")); + ParseMode.RMETA, "https://opensearch", Paths.get("testDocs")); // PipesReporter reporter = ReporterManager.load(pluginsConfg); // System.out.println(reporter); // PipesIterator pipesIterator = PipesIteratorManager.load(pluginsConfg); @@ -115,7 +115,7 @@ public 
void testBasicFSToOpenSearch(@TempDir Path pipesDirectory, @TempDir Path sendMappings(client, endpoint, TEST_INDEX, "opensearch-mappings.json"); runPipes(client, OpenSearchEmitterConfig.AttachmentStrategy.SEPARATE_DOCUMENTS, - OpenSearchEmitterConfig.UpdateStrategy.UPSERT, HandlerConfig.PARSE_MODE.CONCATENATE, endpoint, + OpenSearchEmitterConfig.UpdateStrategy.UPSERT, ParseMode.CONCATENATE, endpoint, pipesDirectory, testDocDirectory); String query = "{ \"track_total_hits\": true, \"query\": { \"match\": { \"content\": { " + @@ -184,7 +184,7 @@ public void testParentChildFSToOpenSearch(@TempDir Path pipesDirectory, @TempDir runPipes(client, OpenSearchEmitterConfig.AttachmentStrategy.PARENT_CHILD, OpenSearchEmitterConfig.UpdateStrategy.OVERWRITE, - HandlerConfig.PARSE_MODE.RMETA, endpoint, pipesDirectory, testDocDirectory); + ParseMode.RMETA, endpoint, pipesDirectory, testDocDirectory); String query = "{ \"track_total_hits\": true, \"from\":0, \"size\": 10000, \"query\": { \"match\": { \"content\": { " + "\"query\": \"happiness\" } } } }"; @@ -252,7 +252,7 @@ public void testSeparateDocsFSToOpenSearch(@TempDir Path pipesDirectory, @TempDi runPipes(client, OpenSearchEmitterConfig.AttachmentStrategy.SEPARATE_DOCUMENTS, OpenSearchEmitterConfig.UpdateStrategy.OVERWRITE, - HandlerConfig.PARSE_MODE.RMETA, endpoint, + ParseMode.RMETA, endpoint, pipesDirectory, testDocDirectory); String query = "{ \"track_total_hits\": true, \"query\": { \"match\": { \"content\": { " + @@ -318,7 +318,7 @@ public void testUpsertSeparateDocsFSToOpenSearch(@TempDir Path pipesDirectory, @ runPipes(client, OpenSearchEmitterConfig.AttachmentStrategy.SEPARATE_DOCUMENTS, OpenSearchEmitterConfig.UpdateStrategy.UPSERT, - HandlerConfig.PARSE_MODE.RMETA, endpoint, pipesDirectory, testDocDirectory); + ParseMode.RMETA, endpoint, pipesDirectory, testDocDirectory); String query = "{ \"track_total_hits\": true, \"query\": { \"match\": { \"content\": { " + "\"query\": \"happiness\" } } } }"; @@ -378,7 +378,7 @@ public void testUpsert(@TempDir Path pipesDirectory, @TempDir Path testDocDirect String endpoint = CONTAINER.getHttpHostAddress() + "/" + TEST_INDEX; sendMappings(client, endpoint, TEST_INDEX, "opensearch-mappings.json"); Path pluginsConfigFile = getPluginsConfig(pipesDirectory, OpenSearchEmitterConfig.AttachmentStrategy.SEPARATE_DOCUMENTS, - OpenSearchEmitterConfig.UpdateStrategy.UPSERT, HandlerConfig.PARSE_MODE.RMETA, + OpenSearchEmitterConfig.UpdateStrategy.UPSERT, ParseMode.RMETA, endpoint, testDocDirectory); TikaJsonConfig tikaJsonConfig = TikaJsonConfig.load(pluginsConfigFile); @@ -450,7 +450,7 @@ protected void sendMappings(OpensearchTestClient client, String endpoint, String private void runPipes(OpensearchTestClient client, OpenSearchEmitterConfig.AttachmentStrategy attachmentStrategy, OpenSearchEmitterConfig.UpdateStrategy updateStrategy, - HandlerConfig.PARSE_MODE parseMode, String endpoint, Path pipesDirectory, Path testDocDirectory) throws Exception { + ParseMode parseMode, String endpoint, Path pipesDirectory, Path testDocDirectory) throws Exception { Path pluginsConfig = getPluginsConfig(pipesDirectory, attachmentStrategy, updateStrategy, parseMode, endpoint, testDocDirectory); @@ -466,7 +466,7 @@ private void runPipes(OpensearchTestClient client, OpenSearchEmitterConfig.Attac @NotNull private Path getPluginsConfig(Path pipesDirectory, OpenSearchEmitterConfig.AttachmentStrategy attachmentStrategy, OpenSearchEmitterConfig.UpdateStrategy updateStrategy, - HandlerConfig.PARSE_MODE parseMode, String endpoint, Path 
testDocDirectory) throws IOException { + ParseMode parseMode, String endpoint, Path testDocDirectory) throws IOException { Path tikaConfig = pipesDirectory.resolve("plugins-config.json"); Path log4jPropFile = pipesDirectory.resolve("log4j2.xml"); diff --git a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/plugins-template.json b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/plugins-template.json index 16e2a4fc968..2b4f98f92e3 100644 --- a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/plugins-template.json +++ b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/plugins-template.json @@ -1,4 +1,12 @@ { + "content-handler-factory": { + "basic-content-handler-factory": { + "type": "TEXT", + "writeLimit": -1, + "maxEmbeddedResources": -1, + "throwOnWriteLimitReached": true + } + }, "fetchers": { "fsf": { "file-system-fetcher": { @@ -29,20 +37,8 @@ "file-system-pipes-iterator": { "basePath": "FETCHER_BASE_PATH", "countTotal": true, - "baseConfig": { - "fetcherId": "fsf", - "emitterId": "ose", - "handlerConfig": { - "type": "TEXT", - "parseMode": "PARSE_MODE", - "writeLimit": -1, - "maxEmbeddedResources": -1, - "throwOnWriteLimitReached": true - }, - "onParseException": "EMIT", - "maxWaitMs": 600000, - "queueSize": 10000 - } + "fetcherId": "fsf", + "emitterId": "ose" } }, "pipes-reporters": { @@ -60,6 +56,8 @@ } }, "pipes": { + "parseMode": "PARSE_MODE", + "onParseException": "EMIT", "emitStrategy": { "type": "DYNAMIC", "thresholdBytes": 10000 @@ -93,6 +91,5 @@ } } ], - "plugin-roots": "target/plugins" -} \ No newline at end of file +} diff --git a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/tika-config-opensearch.json b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/tika-config-opensearch.json index a6a0c512679..172a0c1c0ec 100644 --- a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/tika-config-opensearch.json +++ b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/tika-config-opensearch.json @@ -1,4 +1,12 @@ { + "content-handler-factory": { + "basic-content-handler-factory": { + "type": "TEXT", + "writeLimit": -1, + "maxEmbeddedResources": -1, + "throwOnWriteLimitReached": true + } + }, "parsers": [ { "default-parser": {} @@ -70,20 +78,8 @@ "file-system-pipes-iterator": { "basePath": "FETCHER_BASE_PATH", "countTotal": true, - "baseConfig": { - "fetcherId": "fsf", - "emitterId": "ose", - "handlerConfig": { - "type": "TEXT", - "parseMode": "PARSE_MODE", - "writeLimit": -1, - "maxEmbeddedResources": -1, - "throwOnWriteLimitReached": true - }, - "onParseException": "EMIT", - "maxWaitMs": 600000, - "queueSize": 10000 - } + "fetcherId": "fsf", + "emitterId": "ose" } }, "pipes-reporters": { @@ -101,6 +97,8 @@ } }, "pipes": { + "parseMode": "PARSE_MODE", + "onParseException": "EMIT", "emitStrategy": { "type": "DYNAMIC", "thresholdBytes": 10000 diff --git a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/S3PipeIntegrationTest.java b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/S3PipeIntegrationTest.java index 92b3c6b2479..888396343fa 100644 --- 
a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/S3PipeIntegrationTest.java +++ b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/S3PipeIntegrationTest.java @@ -55,6 +55,7 @@ import org.apache.tika.cli.TikaCLI; import org.apache.tika.config.JsonConfigHelper; +import org.apache.tika.pipes.api.ParseMode; @TestInstance(TestInstance.Lifecycle.PER_CLASS) @Testcontainers(disabledWithoutDocker = true) @@ -140,7 +141,7 @@ void s3PipelineIteratorS3FetcherAndS3Emitter() throws Exception { // Create plugins config JSON Map replacements = new HashMap<>(); replacements.put("LOG4J_JVM_ARG", "-Dlog4j.configurationFile=" + log4jPropFile.toAbsolutePath()); - replacements.put("PARSE_MODE", org.apache.tika.pipes.api.HandlerConfig.PARSE_MODE.RMETA.name()); + replacements.put("PARSE_MODE", ParseMode.RMETA.name()); replacements.put("PIPE_ITERATOR_BUCKET", FETCH_BUCKET); replacements.put("EMIT_BUCKET", EMIT_BUCKET); replacements.put("FETCH_BUCKET", FETCH_BUCKET); diff --git a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/s3/plugins-template.json b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/s3/plugins-template.json index 1efc929ce35..816d5c49e58 100644 --- a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/s3/plugins-template.json +++ b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/s3/plugins-template.json @@ -1,4 +1,12 @@ { + "content-handler-factory": { + "basic-content-handler-factory": { + "type": "TEXT", + "writeLimit": -1, + "maxEmbeddedResources": -1, + "throwOnWriteLimitReached": true + } + }, "fetchers": { "s3f": { "s3-fetcher": { @@ -44,23 +52,13 @@ "secretKey": "SECRET_KEY", "endpointConfigurationService": "ENDPOINT_CONFIGURATION_SERVICE", "pathStyleAccessEnabled": true, - "baseConfig": { - "fetcherId": "s3f", - "emitterId": "s3e", - "handlerConfig": { - "type": "TEXT", - "parseMode": "PARSE_MODE", - "writeLimit": -1, - "maxEmbeddedResources": -1, - "throwOnWriteLimitReached": true - }, - "onParseException": "EMIT", - "maxWaitMs": 600000, - "queueSize": 10000 - } + "fetcherId": "s3f", + "emitterId": "s3e" } }, "pipes": { + "parseMode": "PARSE_MODE", + "onParseException": "EMIT", "emitMaxEstimatedBytes": 100000, "emitWithinMillis": 10, "numEmitters": 1, diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java index fb195df8562..0fea4b0cd07 100644 --- a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java +++ b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java @@ -48,7 +48,7 @@ import org.apache.tika.cli.TikaCLI; import org.apache.tika.config.JsonConfigHelper; -import org.apache.tika.pipes.api.HandlerConfig; +import org.apache.tika.pipes.api.ParseMode; import org.apache.tika.pipes.emitter.solr.SolrEmitterConfig; import org.apache.tika.utils.SystemUtils; @@ -210,7 +210,7 @@ protected void runTikaAsyncSolrPipeIteratorFileFetcherSolrEmitter(Path pipesDire Path tikaConfigFile = getTikaConfig(pipesDirectory, SolrEmitterConfig.UpdateStrategy.ADD, SolrEmitterConfig.AttachmentStrategy.PARENT_CHILD, - 
HandlerConfig.PARSE_MODE.RMETA); + ParseMode.RMETA); TikaCLI.main(new String[]{"-a", "-c", tikaConfigFile.toAbsolutePath().toString()}); @@ -244,7 +244,7 @@ protected void runTikaAsyncSolrPipeIteratorFileFetcherSolrEmitter(Path pipesDire tikaConfigFile = getTikaConfig(pipesDirectory, SolrEmitterConfig.UpdateStrategy.UPDATE_MUST_EXIST, SolrEmitterConfig.AttachmentStrategy.PARENT_CHILD, - HandlerConfig.PARSE_MODE.RMETA); + ParseMode.RMETA); TikaCLI.main(new String[]{"-a", "-c", tikaConfigFile.toAbsolutePath().toString()}); @@ -263,7 +263,7 @@ protected void runTikaAsyncSolrPipeIteratorFileFetcherSolrEmitter(Path pipesDire private Path getTikaConfig(Path pipesDirectory, SolrEmitterConfig.UpdateStrategy updateStrategy, SolrEmitterConfig.AttachmentStrategy attachmentStrategy, - HandlerConfig.PARSE_MODE parseMode) throws IOException { + ParseMode parseMode) throws IOException { Path tikaConfig = pipesDirectory.resolve("plugins-config.json"); Path log4jPropFile = pipesDirectory.resolve("log4j2.xml"); diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/solr/plugins-template.json b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/solr/plugins-template.json index 366be952746..63cf5d73b50 100644 --- a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/solr/plugins-template.json +++ b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/solr/plugins-template.json @@ -1,4 +1,12 @@ { + "content-handler-factory": { + "basic-content-handler-factory": { + "type": "TEXT", + "writeLimit": -1, + "maxEmbeddedResources": -1, + "throwOnWriteLimitReached": true + } + }, "parsers": [ { "default-parser": {} @@ -74,23 +82,13 @@ "rows": 100, "connectionTimeout": 10000, "socketTimeout": 60000, - "baseConfig": { - "fetcherId": "fsf", - "emitterId": "se", - "handlerConfig": { - "type": "TEXT", - "parseMode": "PARSE_MODE", - "writeLimit": -1, - "maxEmbeddedResources": -1, - "throwOnWriteLimitReached": true - }, - "onParseException": "EMIT", - "maxWaitMs": 600000, - "queueSize": 10000 - } + "fetcherId": "fsf", + "emitterId": "se" } }, "pipes": { + "parseMode": "PARSE_MODE", + "onParseException": "EMIT", "emitStrategy": { "type": "DYNAMIC", "thresholdBytes": 10000 diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index ffaa2cac246..195da525caa 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -1518,7 +1518,7 @@ private Metadata testWriteLimit(String fileName, int limit) throws Exception { BasicContentHandlerFactory factory = new BasicContentHandlerFactory( BasicContentHandlerFactory.HANDLER_TYPE.TEXT, limit ); - ContentHandler contentHandler = factory.getNewContentHandler(); + ContentHandler contentHandler = factory.createHandler(); Metadata metadata = new Metadata(); ParseContext parseContext = new ParseContext(); try (TikaInputStream tis = getResourceAsStream("/test-documents/" + fileName)) { diff --git a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java 
b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java index 6576c904ea2..15586c526cf 100644 --- a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java +++ b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java @@ -37,7 +37,6 @@ import org.apache.tika.exception.TikaConfigException; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; -import org.apache.tika.pipes.api.HandlerConfig; import org.apache.tika.pipes.api.emitter.EmitKey; import org.apache.tika.pipes.api.fetcher.FetchKey; import org.apache.tika.pipes.api.pipesiterator.PipesIterator; @@ -47,6 +46,7 @@ import org.apache.tika.plugins.ExtensionConfig; import org.apache.tika.plugins.TikaPluginManager; import org.apache.tika.sax.BasicContentHandlerFactory; +import org.apache.tika.sax.ContentHandlerFactory; import org.apache.tika.utils.StringUtils; public class TikaAsyncCLI { @@ -290,9 +290,8 @@ private static void configureHandler(FetchEmitTuple t, SimpleAsyncConfig asyncCo if (asyncConfig.getHandlerType() == BasicContentHandlerFactory.HANDLER_TYPE.TEXT) { return; } - HandlerConfig handlerConfig = new HandlerConfig(asyncConfig.getHandlerType(), HandlerConfig.PARSE_MODE.RMETA, - -1, -1, false); - t.getParseContext().set(HandlerConfig.class, handlerConfig); + ContentHandlerFactory factory = new BasicContentHandlerFactory(asyncConfig.getHandlerType(), -1); + t.getParseContext().set(ContentHandlerFactory.class, factory); } private static void configureExtractBytes(FetchEmitTuple t, SimpleAsyncConfig asyncConfig) { diff --git a/tika-pipes/tika-async-cli/src/main/resources/config-template.json b/tika-pipes/tika-async-cli/src/main/resources/config-template.json index e295290dd4b..d4c70d5d731 100644 --- a/tika-pipes/tika-async-cli/src/main/resources/config-template.json +++ b/tika-pipes/tika-async-cli/src/main/resources/config-template.json @@ -1,4 +1,12 @@ { + "content-handler-factory": { + "basic-content-handler-factory": { + "type": "TEXT", + "writeLimit": -1, + "maxEmbeddedResources": -1, + "throwOnWriteLimitReached": true + } + }, "parsers": [ { "default-parser": {} @@ -45,21 +53,15 @@ "file-system-pipes-iterator": { "basePath": "FETCHER_BASE_PATH", "countTotal": true, - "baseConfig": { - "fetcherId": "fsf", - "emitterId": "fse", - "handlerConfig": { - "type": "TEXT", - "parseMode": "RMETA", - "writeLimit": -1, - "maxEmbeddedResources": -1, - "throwOnWriteLimitReached": true - }, - "onParseException": "EMIT", - "maxWaitMs": 600000, - "queueSize": 10000 - } + "fetcherId": "fsf", + "emitterId": "fse", + "onParseException": "EMIT", + "maxWaitMs": 600000, + "queueSize": 10000 } }, + "pipes": { + "parseMode": "RMETA" + }, "plugin-roots": "PLUGIN_ROOTS" } diff --git a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java index 4bd181699e6..6d26b6dd0fa 100644 --- a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java +++ b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java @@ -17,7 +17,6 @@ package org.apache.tika.async.cli; -import static org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig.DEFAULT_HANDLER_CONFIG; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -44,7 
+43,6 @@ import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; -import org.apache.tika.pipes.api.HandlerConfig; import org.apache.tika.pipes.api.emitter.EmitKey; import org.apache.tika.pipes.api.fetcher.FetchKey; import org.apache.tika.pipes.api.pipesiterator.PipesIterator; @@ -112,8 +110,6 @@ public void setUp() throws Exception { @Test public void testRecursiveUnpacking() throws Exception { -// TikaAsyncCLI cli = new TikaAsyncCLI(); - // cli.main(new String[]{ configDir.resolve("tika-config.xml").toAbsolutePath().toString()}); AsyncProcessor processor = AsyncProcessor.load(configDir.resolve("tika-config.json")); EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig = new EmbeddedDocumentBytesConfig(true); @@ -122,7 +118,6 @@ public void testRecursiveUnpacking() throws Exception { embeddedDocumentBytesConfig.setSuffixStrategy(EmbeddedDocumentBytesConfig.SUFFIX_STRATEGY.NONE); embeddedDocumentBytesConfig.setEmbeddedIdPrefix("-"); ParseContext parseContext = new ParseContext(); - parseContext.set(HandlerConfig.class, DEFAULT_HANDLER_CONFIG); parseContext.set(EmbeddedDocumentBytesConfig.class, embeddedDocumentBytesConfig); FetchEmitTuple t = new FetchEmitTuple("myId-1", new FetchKey("fsf", "mock.xml"), @@ -133,7 +128,6 @@ public void testRecursiveUnpacking() throws Exception { for (int i = 0; i < 10; i++) { processor.offer(PipesIterator.COMPLETED_SEMAPHORE, 1000); } - //TODO clean this up while (processor.checkActive()) { Thread.sleep(100); } @@ -161,14 +155,9 @@ public void testRecursiveUnpacking() throws Exception { @Test public void testStopsOnApplicationError() throws Exception { - // Test that AsyncProcessor stops processing when an application error occurs - // (TIKA-4570) AsyncProcessor processor = AsyncProcessor.load(configDir.resolve("tika-config.json")); - // Create a tuple with a non-existent fetcher - this will cause FETCHER_NOT_FOUND - // which is a TASK_EXCEPTION but will stop processing in CLI mode (default) ParseContext parseContext = new ParseContext(); - parseContext.set(HandlerConfig.class, DEFAULT_HANDLER_CONFIG); FetchEmitTuple badTuple = new FetchEmitTuple( "bad-tuple-1", new FetchKey("non-existent-fetcher", "some-file.txt"), @@ -177,10 +166,8 @@ public void testStopsOnApplicationError() throws Exception { parseContext, FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT); - // Offer the bad tuple processor.offer(badTuple, 1000); - // Wait for the error to be detected int maxWaitMs = 30000; int waited = 0; while (!processor.hasApplicationError() && waited < maxWaitMs) { @@ -188,11 +175,9 @@ public void testStopsOnApplicationError() throws Exception { waited += 100; } - // Verify that the application error was detected assertTrue(processor.hasApplicationError(), "AsyncProcessor should detect application error from bad fetcher"); - // Verify that subsequent offers throw PipesException FetchEmitTuple anotherTuple = new FetchEmitTuple( "another-tuple", new FetchKey("fsf", "mock.xml"), diff --git a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/HandlerConfig.java b/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/HandlerConfig.java deleted file mode 100644 index b336f1a4fcc..00000000000 --- a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/HandlerConfig.java +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.pipes.api; - -import java.io.Serializable; -import java.util.Locale; -import java.util.Objects; - -import org.apache.tika.sax.BasicContentHandlerFactory; - -/** - * Configuration for content handler behavior during parsing. - */ -public class HandlerConfig implements Serializable { - - /** - * {@link PARSE_MODE#RMETA} "recursive metadata" is the same as the -J option - * in tika-app and the /rmeta endpoint in tika-server. Each embedded file is represented as - * its own metadata object. - * - * {@link PARSE_MODE#CONCATENATE} is similar - * to the legacy tika-app behavior and the /tika endpoint (accept: application/json) in - * tika-server. This concatenates the - * contents of embedded files and returns a single metadata object for the file no - * matter how many embedded objects there are; this option throws away metadata from - * embedded objects and silently skips exceptions in embedded objects. - */ - public enum PARSE_MODE { - RMETA, - CONCATENATE; - - public static PARSE_MODE parseMode(String modeString) { - for (PARSE_MODE m : PARSE_MODE.values()) { - if (m.name().equalsIgnoreCase(modeString)) { - return m; - } - } - StringBuilder sb = new StringBuilder(); - int i = 0; - for (PARSE_MODE m : PARSE_MODE.values()) { - if (i++ > 0) { - sb.append(", "); - } - sb.append(m.name().toLowerCase(Locale.US)); - } - throw new IllegalArgumentException("mode must be one of: (" + sb + - "). 
I regret I do not understand: " + modeString); - } - } - BasicContentHandlerFactory.HANDLER_TYPE type = BasicContentHandlerFactory.HANDLER_TYPE.TEXT; - PARSE_MODE parseMode = PARSE_MODE.RMETA; - int writeLimit = -1; - int maxEmbeddedResources = -1; - boolean throwOnWriteLimitReached = true; - - public HandlerConfig() { - - } - - public HandlerConfig(BasicContentHandlerFactory.HANDLER_TYPE type, PARSE_MODE parseMode, int writeLimit, int maxEmbeddedResources, boolean throwOnWriteLimitReached) { - this.type = type; - this.parseMode = parseMode; - this.writeLimit = writeLimit; - this.maxEmbeddedResources = maxEmbeddedResources; - this.throwOnWriteLimitReached = throwOnWriteLimitReached; - } - - public BasicContentHandlerFactory.HANDLER_TYPE getType() { - return type; - } - - public void setType(BasicContentHandlerFactory.HANDLER_TYPE type) { - this.type = type; - } - - public void setType(String typeString) { - this.type = BasicContentHandlerFactory.HANDLER_TYPE.valueOf(typeString); - } - - public PARSE_MODE getParseMode() { - return parseMode; - } - - public void setParseMode(PARSE_MODE parseMode) { - this.parseMode = parseMode; - } - - public void setParseMode(String parseMode) { - this.parseMode = PARSE_MODE.valueOf(parseMode); - } - - public int getWriteLimit() { - return writeLimit; - } - - public void setWriteLimit(int writeLimit) { - this.writeLimit = writeLimit; - } - - public int getMaxEmbeddedResources() { - return maxEmbeddedResources; - } - - public void setMaxEmbeddedResources(int maxEmbeddedResources) { - this.maxEmbeddedResources = maxEmbeddedResources; - } - - public boolean isThrowOnWriteLimitReached() { - return throwOnWriteLimitReached; - } - - public void setThrowOnWriteLimitReached(boolean throwOnWriteLimitReached) { - this.throwOnWriteLimitReached = throwOnWriteLimitReached; - } - - @Override - public final boolean equals(Object o) { - if (!(o instanceof HandlerConfig that)) { - return false; - } - - return writeLimit == that.writeLimit && maxEmbeddedResources == that.maxEmbeddedResources && throwOnWriteLimitReached == that.throwOnWriteLimitReached && - type == that.type && parseMode == that.parseMode; - } - - @Override - public int hashCode() { - int result = Objects.hashCode(type); - result = 31 * result + Objects.hashCode(parseMode); - result = 31 * result + writeLimit; - result = 31 * result + maxEmbeddedResources; - result = 31 * result + Boolean.hashCode(throwOnWriteLimitReached); - return result; - } -} diff --git a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/ParseMode.java b/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/ParseMode.java new file mode 100644 index 00000000000..edd82729dad --- /dev/null +++ b/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/ParseMode.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.api;
+
+import java.util.Locale;
+
+/**
+ * Controls how embedded documents are handled during parsing.
+ * <p>
+ * This can be set as a default in PipesConfig (loaded from tika-config.json)
+ * or overridden per-file via ParseContext.
+ */
+public enum ParseMode {
+
+    /**
+     * Each embedded file gets its own metadata object in a list.
+     * <p>
+     * This is equivalent to the -J option in tika-app and the /rmeta endpoint
+     * in tika-server. The result is a list of metadata objects, one for each
+     * document (container + all embedded documents).
+     */
+    RMETA,
+
+    /**
+     * Concatenates content from all embedded files into a single document.
+     * <p>
+     * This is equivalent to the legacy tika-app behavior and the /tika endpoint
+     * in tika-server. The result is a single metadata object with concatenated
+     * content from all documents.
+     */
+    CONCATENATE;
+
+    /**
+     * Parses a string to a ParseMode enum value.
+     *
+     * @param modeString the string to parse (case-insensitive)
+     * @return the corresponding ParseMode
+     * @throws IllegalArgumentException if the string doesn't match any mode
+     */
+    public static ParseMode parse(String modeString) {
+        if (modeString == null) {
+            throw new IllegalArgumentException("Parse mode cannot be null");
+        }
+        String normalized = modeString.toUpperCase(Locale.ROOT).trim();
+        try {
+            return ParseMode.valueOf(normalized);
+        } catch (IllegalArgumentException e) {
+            throw new IllegalArgumentException(
+                    "Invalid parse mode: '" + modeString + "'. " +
+                    "Must be one of: RMETA, CONCATENATE");
+        }
+    }
+}
diff --git a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/pipesiterator/PipesIteratorBaseConfig.java b/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/pipesiterator/PipesIteratorBaseConfig.java
deleted file mode 100644
index 021d62e400a..00000000000
--- a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/pipesiterator/PipesIteratorBaseConfig.java
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ -package org.apache.tika.pipes.api.pipesiterator; - -import org.apache.tika.pipes.api.FetchEmitTuple; -import org.apache.tika.pipes.api.HandlerConfig; -import org.apache.tika.sax.BasicContentHandlerFactory; - - -public record PipesIteratorBaseConfig(String fetcherId, String emitterId, HandlerConfig handlerConfig, - FetchEmitTuple.ON_PARSE_EXCEPTION onParseException, long maxWaitMs, int queueSize) { - - public static final HandlerConfig DEFAULT_HANDLER_CONFIG = new HandlerConfig(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, HandlerConfig.PARSE_MODE.RMETA, - -1, -1, true); - private static final FetchEmitTuple.ON_PARSE_EXCEPTION DEFAULT_ON_PARSE_EXCEPTION = FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT; - private static final long DEFAULT_MAX_WAIT_MS = 600_000; - private static final int DEFAULT_QUEUE_SIZE = 10000; - - public PipesIteratorBaseConfig(String fetcherId, String emitterId) { - this(fetcherId, emitterId, DEFAULT_HANDLER_CONFIG, DEFAULT_ON_PARSE_EXCEPTION, DEFAULT_MAX_WAIT_MS, DEFAULT_QUEUE_SIZE); - } - -} diff --git a/tika-pipes/tika-pipes-api/src/main/resources/META-INF/tika/other-configs.idx b/tika-pipes/tika-pipes-api/src/main/resources/META-INF/tika/other-configs.idx deleted file mode 100644 index 4b66482790f..00000000000 --- a/tika-pipes/tika-pipes-api/src/main/resources/META-INF/tika/other-configs.idx +++ /dev/null @@ -1,20 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Component registry for tika-pipes-api -# Format: friendly-name=fully.qualified.ClassName -# this has to be manually generated for now because of the dependency graph - -handler-config=org.apache.tika.pipes.api.HandlerConfig diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PipesConfig.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PipesConfig.java index 8daae166ff0..74cd509a0a7 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PipesConfig.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PipesConfig.java @@ -21,6 +21,8 @@ import org.apache.tika.config.loader.TikaJsonConfig; import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.pipes.api.FetchEmitTuple; +import org.apache.tika.pipes.api.ParseMode; public class PipesConfig { @@ -85,6 +87,17 @@ public class PipesConfig { */ private boolean stopOnlyOnFatal = false; + /** + * Default parse mode for how embedded documents are handled. + * Can be overridden per-file via ParseContext. + */ + private ParseMode parseMode = ParseMode.RMETA; + + /** + * Default behavior when a parse exception occurs. 
+ */ + private FetchEmitTuple.ON_PARSE_EXCEPTION onParseException = FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT; + private ArrayList forkedJvmArgs = new ArrayList<>(); private String javaPath = "java"; @@ -361,6 +374,52 @@ public void setStopOnlyOnFatal(boolean stopOnlyOnFatal) { this.stopOnlyOnFatal = stopOnlyOnFatal; } + /** + * Gets the default parse mode for how embedded documents are handled. + * + * @return the default parse mode + */ + public ParseMode getParseMode() { + return parseMode; + } + + /** + * Sets the default parse mode for how embedded documents are handled. + * This can be overridden per-file via ParseContext. + * + * @param parseMode the parse mode (RMETA or CONCATENATE) + */ + public void setParseMode(ParseMode parseMode) { + this.parseMode = parseMode; + } + + /** + * Sets the default parse mode from a string. + * + * @param parseMode the parse mode name (rmeta or concatenate) + */ + public void setParseMode(String parseMode) { + this.parseMode = ParseMode.parse(parseMode); + } + + /** + * Gets the default behavior when a parse exception occurs. + * + * @return the parse exception behavior + */ + public FetchEmitTuple.ON_PARSE_EXCEPTION getOnParseException() { + return onParseException; + } + + /** + * Sets the default behavior when a parse exception occurs. + * + * @param onParseException the parse exception behavior + */ + public void setOnParseException(FetchEmitTuple.ON_PARSE_EXCEPTION onParseException) { + this.onParseException = onParseException; + } + public String getConfigStoreType() { return configStoreType; } diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTuple.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTuple.java index cdf89be42d8..278410e7dbf 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTuple.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTuple.java @@ -24,25 +24,22 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.module.SimpleModule; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.parser.ParseContext; +import org.apache.tika.config.loader.TikaObjectMapperFactory; import org.apache.tika.pipes.api.FetchEmitTuple; -import org.apache.tika.serialization.serdes.MetadataSerializer; -import org.apache.tika.serialization.serdes.ParseContextDeserializer; -import org.apache.tika.serialization.serdes.ParseContextSerializer; public class JsonFetchEmitTuple { - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER; static { - SimpleModule module = new SimpleModule(); - module.addDeserializer(FetchEmitTuple.class, new FetchEmitTupleDeserializer()); - module.addSerializer(FetchEmitTuple.class, new FetchEmitTupleSerializer()); - module.addSerializer(Metadata.class, new MetadataSerializer()); - module.addSerializer(ParseContext.class, new ParseContextSerializer()); - module.addDeserializer(ParseContext.class, new ParseContextDeserializer()); - OBJECT_MAPPER.registerModule(module); + // Use TikaObjectMapperFactory which provides TikaModule with Metadata/ParseContext serializers + OBJECT_MAPPER = TikaObjectMapperFactory.createMapper(); + + // Add FetchEmitTuple-specific serializers + SimpleModule fetchEmitModule = new SimpleModule(); + fetchEmitModule.addDeserializer(FetchEmitTuple.class, new 
FetchEmitTupleDeserializer()); + fetchEmitModule.addSerializer(FetchEmitTuple.class, new FetchEmitTupleSerializer()); + OBJECT_MAPPER.registerModule(fetchEmitModule); } public static FetchEmitTuple fromJson(Reader reader) throws IOException { diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java index a395677c96a..af3e75f50af 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java @@ -44,7 +44,7 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.RecursiveParserWrapper; import org.apache.tika.pipes.api.FetchEmitTuple; -import org.apache.tika.pipes.api.HandlerConfig; +import org.apache.tika.pipes.api.ParseMode; import org.apache.tika.pipes.core.extractor.EmbeddedDocumentBytesConfig; import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.ContentHandlerFactory; @@ -61,17 +61,22 @@ class ParseHandler { private final CountDownLatch countDownLatch; private final AutoDetectParser autoDetectParser; private final RecursiveParserWrapper recursiveParserWrapper; + private final ContentHandlerFactory defaultContentHandlerFactory; + private final ParseMode defaultParseMode; ParseHandler(Detector detector, Digester digester, ArrayBlockingQueue intermediateResult, CountDownLatch countDownLatch, AutoDetectParser autoDetectParser, - RecursiveParserWrapper recursiveParserWrapper) { + RecursiveParserWrapper recursiveParserWrapper, ContentHandlerFactory defaultContentHandlerFactory, + ParseMode defaultParseMode) { this.detector = detector; this.digester = digester; this.intermediateResult = intermediateResult; this.countDownLatch = countDownLatch; this.autoDetectParser = autoDetectParser; this.recursiveParserWrapper = recursiveParserWrapper; + this.defaultContentHandlerFactory = defaultContentHandlerFactory; + this.defaultParseMode = defaultParseMode; } PipesWorker.ParseDataOrPipesResult parseWithStream(FetchEmitTuple fetchEmitTuple, TikaInputStream stream, Metadata metadata, ParseContext parseContext) @@ -79,12 +84,13 @@ PipesWorker.ParseDataOrPipesResult parseWithStream(FetchEmitTuple fetchEmitTuple List metadataList; //this adds the EmbeddedDocumentByteStore to the parsecontext - HandlerConfig handlerConfig = parseContext.get(HandlerConfig.class); - if (handlerConfig.getParseMode() == HandlerConfig.PARSE_MODE.RMETA) { + ParseMode parseMode = getParseMode(parseContext); + ContentHandlerFactory contentHandlerFactory = getContentHandlerFactory(parseContext); + if (parseMode == ParseMode.RMETA) { metadataList = - parseRecursive(fetchEmitTuple, handlerConfig, stream, metadata, parseContext); + parseRecursive(fetchEmitTuple, contentHandlerFactory, stream, metadata, parseContext); } else { - metadataList = parseConcatenated(fetchEmitTuple, handlerConfig, stream, metadata, + metadataList = parseConcatenated(fetchEmitTuple, contentHandlerFactory, stream, metadata, parseContext); } @@ -92,6 +98,24 @@ PipesWorker.ParseDataOrPipesResult parseWithStream(FetchEmitTuple fetchEmitTuple parseContext.get(EmbeddedDocumentBytesHandler.class)), null); } + private ParseMode getParseMode(ParseContext parseContext) { + ParseMode mode = parseContext.get(ParseMode.class); + if (mode != null) { + return mode; + } + // Fall back to default loaded from TikaLoader + return defaultParseMode; + } 
+ + private ContentHandlerFactory getContentHandlerFactory(ParseContext parseContext) { + ContentHandlerFactory factory = parseContext.get(ContentHandlerFactory.class); + if (factory != null) { + return factory; + } + // Fall back to default loaded from TikaLoader + return defaultContentHandlerFactory; + } + private void _preParse(FetchEmitTuple t, TikaInputStream tis, Metadata metadata, @@ -133,14 +157,16 @@ private Metadata preParse(FetchEmitTuple t, TikaInputStream tis, Metadata metada } public List parseRecursive(FetchEmitTuple fetchEmitTuple, - HandlerConfig handlerConfig, TikaInputStream stream, + ContentHandlerFactory contentHandlerFactory, TikaInputStream stream, Metadata metadata, ParseContext parseContext) throws InterruptedException { //Intentionally do not add the metadata filter here! //We need to let stacktraces percolate + int maxEmbeddedResources = -1; + if (contentHandlerFactory instanceof BasicContentHandlerFactory) { + maxEmbeddedResources = ((BasicContentHandlerFactory) contentHandlerFactory).getMaxEmbeddedResources(); + } RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( - new BasicContentHandlerFactory(handlerConfig.getType(), - handlerConfig.getWriteLimit(), handlerConfig.isThrowOnWriteLimitReached(), - parseContext), handlerConfig.getMaxEmbeddedResources()); + contentHandlerFactory, maxEmbeddedResources); long start = System.currentTimeMillis(); @@ -168,25 +194,24 @@ public List parseRecursive(FetchEmitTuple fetchEmitTuple, } public List parseConcatenated(FetchEmitTuple fetchEmitTuple, - HandlerConfig handlerConfig, TikaInputStream stream, + ContentHandlerFactory contentHandlerFactory, TikaInputStream stream, Metadata metadata, ParseContext parseContext) throws InterruptedException { - ContentHandlerFactory contentHandlerFactory = - new BasicContentHandlerFactory(handlerConfig.getType(), - handlerConfig.getWriteLimit(), handlerConfig.isThrowOnWriteLimitReached(), - parseContext); - - ContentHandler handler = contentHandlerFactory.getNewContentHandler(); + ContentHandler handler = contentHandlerFactory.createHandler(); + int maxEmbedded = -1; + if (contentHandlerFactory instanceof BasicContentHandlerFactory) { + maxEmbedded = ((BasicContentHandlerFactory) contentHandlerFactory).getMaxEmbeddedResources(); + } + final int finalMaxEmbedded = maxEmbedded; parseContext.set(DocumentSelector.class, new DocumentSelector() { - final int maxEmbedded = handlerConfig.getMaxEmbeddedResources(); int embedded = 0; @Override public boolean select(Metadata metadata) { - if (maxEmbedded < 0) { + if (finalMaxEmbedded < 0) { return true; } - return embedded++ < maxEmbedded; + return embedded++ < finalMaxEmbedded; } }); diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java index 8a90b4d89c3..8c04a110c05 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java @@ -77,6 +77,7 @@ import org.apache.tika.pipes.core.fetcher.FetcherManager; import org.apache.tika.plugins.ExtensionConfig; import org.apache.tika.plugins.TikaPluginManager; +import org.apache.tika.sax.ContentHandlerFactory; import org.apache.tika.serialization.ParseContextUtils; import org.apache.tika.utils.ExceptionUtils; @@ -150,6 +151,7 @@ public byte getByte() { private final PipesConfig pipesConfig; private final 
Socket socket; private final MetadataFilter defaultMetadataFilter; + private final ContentHandlerFactory defaultContentHandlerFactory; private AutoDetectParser autoDetectParser; private RecursiveParserWrapper rMetaParser; private FetcherManager fetcherManager; @@ -176,7 +178,8 @@ public static PipesServer load(int port, Path tikaConfigPath) throws Exception { socket.setSoTimeout((int) pipesConfig.getSocketTimeoutMs()); MetadataFilter metadataFilter = tikaLoader.loadMetadataFilters(); - PipesServer pipesServer = new PipesServer(pipesClientId, tikaLoader, pipesConfig, socket, dis, dos, metadataFilter); + ContentHandlerFactory contentHandlerFactory = tikaLoader.loadContentHandlerFactory(); + PipesServer pipesServer = new PipesServer(pipesClientId, tikaLoader, pipesConfig, socket, dis, dos, metadataFilter, contentHandlerFactory); pipesServer.initializeResources(); LOG.debug("pipesClientId={}: PipesServer loaded and ready", pipesClientId); return pipesServer; @@ -209,7 +212,7 @@ public static PipesServer load(int port, Path tikaConfigPath) throws Exception { } public PipesServer(String pipesClientId, TikaLoader tikaLoader, PipesConfig pipesConfig, Socket socket, DataInputStream in, - DataOutputStream out, MetadataFilter metadataFilter) throws TikaConfigException, + DataOutputStream out, MetadataFilter metadataFilter, ContentHandlerFactory contentHandlerFactory) throws TikaConfigException, IOException { this.pipesClientId = pipesClientId; @@ -217,6 +220,7 @@ public PipesServer(String pipesClientId, TikaLoader tikaLoader, PipesConfig pipe this.pipesConfig = pipesConfig; this.socket = socket; this.defaultMetadataFilter = metadataFilter; + this.defaultContentHandlerFactory = contentHandlerFactory; this.input = new DataInputStream(in); this.output = new DataOutputStream(out); this.heartbeatIntervalMs = pipesConfig.getHeartbeatIntervalMs(); @@ -357,7 +361,8 @@ public void mainLoop() { private PipesWorker getPipesWorker(ArrayBlockingQueue intermediateResult, FetchEmitTuple fetchEmitTuple, CountDownLatch countDownLatch) { FetchHandler fetchHandler = new FetchHandler(fetcherManager); - ParseHandler parseHandler = new ParseHandler(detector, digester, intermediateResult, countDownLatch, autoDetectParser, rMetaParser); + ParseHandler parseHandler = new ParseHandler(detector, digester, intermediateResult, countDownLatch, autoDetectParser, + rMetaParser, defaultContentHandlerFactory, pipesConfig.getParseMode()); Long thresholdBytes = pipesConfig.getEmitStrategy().getThresholdBytes(); long threshold = (thresholdBytes != null) ? 
thresholdBytes : EmitStrategyConfig.DEFAULT_DIRECT_EMIT_THRESHOLD_BYTES; EmitHandler emitHandler = new EmitHandler(defaultMetadataFilter, emitStrategy, emitterManager, threshold); diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java index 8d15c92a0a0..b7793881274 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java @@ -16,8 +16,6 @@ */ package org.apache.tika.pipes.core.server; -import static org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig.DEFAULT_HANDLER_CONFIG; - import java.io.Closeable; import java.io.IOException; import java.time.Duration; @@ -41,7 +39,6 @@ import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; -import org.apache.tika.pipes.api.HandlerConfig; import org.apache.tika.pipes.api.PipesResult; import org.apache.tika.pipes.core.PipesResults; import org.apache.tika.pipes.core.emitter.EmitterManager; @@ -149,9 +146,8 @@ protected ParseDataOrPipesResult parseFromTuple() throws TikaException, Interrup private ParseContext setupParseContext(FetchEmitTuple fetchEmitTuple) throws TikaException, IOException { ParseContext parseContext = fetchEmitTuple.getParseContext(); - if (parseContext.get(HandlerConfig.class) == null) { - parseContext.set(HandlerConfig.class, DEFAULT_HANDLER_CONFIG); - } + // ContentHandlerFactory and ParseMode are retrieved from ParseContext in ParseHandler. + // They are set in ParseContext from PipesConfig loaded via TikaLoader at startup. EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig = parseContext.get(EmbeddedDocumentBytesConfig.class); if (embeddedDocumentBytesConfig == null) { //make sure there's one here -- or do we make this default in fetchemit tuple? 
diff --git a/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTupleTest.java b/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTupleTest.java index 4168d37a6f2..1650e7d00ad 100644 --- a/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTupleTest.java +++ b/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTupleTest.java @@ -27,10 +27,11 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; -import org.apache.tika.pipes.api.HandlerConfig; +import org.apache.tika.pipes.api.ParseMode; import org.apache.tika.pipes.api.emitter.EmitKey; import org.apache.tika.pipes.api.fetcher.FetchKey; import org.apache.tika.sax.BasicContentHandlerFactory; +import org.apache.tika.sax.ContentHandlerFactory; public class JsonFetchEmitTupleTest { @@ -45,8 +46,11 @@ public void testBasic() throws Exception { ParseContext parseContext = new ParseContext(); - HandlerConfig h = new HandlerConfig(BasicContentHandlerFactory.HANDLER_TYPE.XML, HandlerConfig.PARSE_MODE.CONCATENATE, 10000, 10, true); - parseContext.set(HandlerConfig.class, h); + // Set ContentHandlerFactory and ParseMode in ParseContext + ContentHandlerFactory factory = new BasicContentHandlerFactory( + BasicContentHandlerFactory.HANDLER_TYPE.XML, 10000); + parseContext.set(ContentHandlerFactory.class, factory); + parseContext.set(ParseMode.class, ParseMode.CONCATENATE); FetchEmitTuple t = new FetchEmitTuple("my_id", new FetchKey("my_fetcher", "fetchKey1"), new EmitKey("my_emitter", "emitKey1"), m, parseContext, FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP); @@ -66,12 +70,10 @@ public void testFetchRange() throws Exception { m.add("m2", "v3"); m.add("m3", "v4"); - /** - * TODO -- add this to the ParseContext - * new HandlerConfig(BasicContentHandlerFactory.HANDLER_TYPE.XML, - * HandlerConfig.PARSE_MODE.CONCATENATE, - * 10000,10, true), - */ + // TODO -- add this to the ParseContext: + // parseContext.set(ContentHandlerFactory.class, new BasicContentHandlerFactory( + // BasicContentHandlerFactory.HANDLER_TYPE.XML, 10000)); + // parseContext.set(ParseMode.class, ParseMode.CONCATENATE); FetchEmitTuple t = new FetchEmitTuple("my_id", new FetchKey("my_fetcher", "fetchKey1", 10, 1000), new EmitKey("my_emitter", "emitKey1"), m, new ParseContext(), FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP); StringWriter writer = new StringWriter(); @@ -83,14 +85,12 @@ public void testFetchRange() throws Exception { @Test public void testBytes() throws Exception { - /** - * TODO -- add these to the ParseContext - EmbeddedDocumentBytesConfig bytesConfig = new EmbeddedDocumentBytesConfig(true); - bytesConfig.setEmitter("emitter"); - * new HandlerConfig(BasicContentHandlerFactory.HANDLER_TYPE.XML, - * HandlerConfig.PARSE_MODE.CONCATENATE, - * 10000,10, true) - */ + // TODO -- add these to the ParseContext: + // EmbeddedDocumentBytesConfig bytesConfig = new EmbeddedDocumentBytesConfig(true); + // bytesConfig.setEmitter("emitter"); + // parseContext.set(ContentHandlerFactory.class, new BasicContentHandlerFactory( + // BasicContentHandlerFactory.HANDLER_TYPE.XML, 10000)); + // parseContext.set(ParseMode.class, ParseMode.CONCATENATE); FetchEmitTuple t = new FetchEmitTuple("my_id", new FetchKey("my_fetcher", "fetchKey1", 10, 1000), new EmitKey("my_emitter", "emitKey1"), new Metadata(), new ParseContext(), 
FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP); StringWriter writer = new StringWriter(); diff --git a/tika-pipes/tika-pipes-fork-parser/src/main/java/org/apache/tika/pipes/fork/PipesForkParser.java b/tika-pipes/tika-pipes-fork-parser/src/main/java/org/apache/tika/pipes/fork/PipesForkParser.java index 0420596d58c..cfb9251e30c 100644 --- a/tika-pipes/tika-pipes-fork-parser/src/main/java/org/apache/tika/pipes/fork/PipesForkParser.java +++ b/tika-pipes/tika-pipes-fork-parser/src/main/java/org/apache/tika/pipes/fork/PipesForkParser.java @@ -33,13 +33,14 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; -import org.apache.tika.pipes.api.HandlerConfig; +import org.apache.tika.pipes.api.ParseMode; import org.apache.tika.pipes.api.PipesResult; import org.apache.tika.pipes.api.emitter.EmitKey; import org.apache.tika.pipes.api.fetcher.FetchKey; import org.apache.tika.pipes.core.PipesConfig; import org.apache.tika.pipes.core.PipesException; import org.apache.tika.pipes.core.PipesParser; +import org.apache.tika.sax.ContentHandlerFactory; /** * A ForkParser implementation backed by {@link PipesParser}. @@ -86,7 +87,8 @@ * Example usage: *

      * PipesForkParserConfig config = new PipesForkParserConfig();
    - * config.setHandlerConfig(new HandlerConfig(HANDLER_TYPE.TEXT, PARSE_MODE.RMETA, -1, -1, true));
    + * config.setHandlerType(HANDLER_TYPE.TEXT);
    + * config.setParseMode(ParseMode.RMETA);
      *
      * try (PipesForkParser parser = new PipesForkParser(config)) {
      *     // Parse from a file
    @@ -204,8 +206,9 @@ public PipesForkResult parse(TikaInputStream tis, Metadata metadata, ParseContex
             FetchKey fetchKey = new FetchKey(config.getFetcherName(), absolutePath);
             EmitKey emitKey = new EmitKey("", id); // Empty emitter name since we're using PASSBACK_ALL
     
    -        // Add handler config to parse context so server knows how to handle content
    -        parseContext.set(HandlerConfig.class, config.getHandlerConfig());
    +        // Add content handler factory and parse mode to parse context
    +        parseContext.set(ContentHandlerFactory.class, config.getContentHandlerFactory());
    +        parseContext.set(ParseMode.class, config.getParseMode());
     
             FetchEmitTuple tuple = new FetchEmitTuple(id, fetchKey, emitKey, metadata, parseContext);
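Outside of PipesForkParser, the same per-file override can be applied by callers that build FetchEmitTuples directly, as JsonFetchEmitTupleTest does earlier in this patch; anything set on the tuple's ParseContext takes precedence over the config-wide defaults loaded by PipesServer. A minimal sketch (the "fsf"/"fse" ids and the XML handler settings are illustrative):

    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.pipes.api.FetchEmitTuple;
    import org.apache.tika.pipes.api.ParseMode;
    import org.apache.tika.pipes.api.emitter.EmitKey;
    import org.apache.tika.pipes.api.fetcher.FetchKey;
    import org.apache.tika.sax.BasicContentHandlerFactory;
    import org.apache.tika.sax.ContentHandlerFactory;

    public final class PerFileOverrideExample {

        public static FetchEmitTuple newTuple(String id, String key) {
            ParseContext parseContext = new ParseContext();
            // Per-file overrides: these win over the defaults from tika-config.json.
            parseContext.set(ContentHandlerFactory.class,
                    new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, 10000));
            parseContext.set(ParseMode.class, ParseMode.CONCATENATE);
            return new FetchEmitTuple(id, new FetchKey("fsf", key), new EmitKey("fse", key),
                    new Metadata(), parseContext, FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT);
        }
    }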
     
    diff --git a/tika-pipes/tika-pipes-fork-parser/src/main/java/org/apache/tika/pipes/fork/PipesForkParserConfig.java b/tika-pipes/tika-pipes-fork-parser/src/main/java/org/apache/tika/pipes/fork/PipesForkParserConfig.java
    index 8ffa0b555f1..467a2189730 100644
    --- a/tika-pipes/tika-pipes-fork-parser/src/main/java/org/apache/tika/pipes/fork/PipesForkParserConfig.java
    +++ b/tika-pipes/tika-pipes-fork-parser/src/main/java/org/apache/tika/pipes/fork/PipesForkParserConfig.java
    @@ -20,9 +20,10 @@
     import java.util.ArrayList;
     import java.util.List;
     
    -import org.apache.tika.pipes.api.HandlerConfig;
    +import org.apache.tika.pipes.api.ParseMode;
     import org.apache.tika.pipes.core.PipesConfig;
     import org.apache.tika.sax.BasicContentHandlerFactory;
    +import org.apache.tika.sax.ContentHandlerFactory;
     
     /**
      * Configuration for {@link PipesForkParser}.
    @@ -33,13 +34,15 @@
     public class PipesForkParserConfig {
     
         private final PipesConfig pipesConfig;
    -    private HandlerConfig handlerConfig;
    +    private ContentHandlerFactory contentHandlerFactory;
    +    private ParseMode parseMode = ParseMode.RMETA;
         private String fetcherName = PipesForkParser.DEFAULT_FETCHER_NAME;
         private Path pluginsDir;
     
         public PipesForkParserConfig() {
             this.pipesConfig = new PipesConfig();
    -        this.handlerConfig = new HandlerConfig();
    +        this.contentHandlerFactory = new BasicContentHandlerFactory(
    +                BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1);
             // Default to single client for simple fork parser use case
             this.pipesConfig.setNumClients(1);
         }
    @@ -54,25 +57,34 @@ public PipesConfig getPipesConfig() {
         }
     
         /**
    -     * Get the handler configuration that specifies how content should be handled.
    +     * Get the content handler factory that specifies how content should be handled.
          *
    -     * @return the handler configuration
    +     * @return the content handler factory
          */
    -    public HandlerConfig getHandlerConfig() {
    -        return handlerConfig;
    +    public ContentHandlerFactory getContentHandlerFactory() {
    +        return contentHandlerFactory;
         }
     
         /**
    -     * Set the handler configuration.
    +     * Set the content handler factory.
          *
    -     * @param handlerConfig the handler configuration
    +     * @param contentHandlerFactory the content handler factory
          * @return this config for chaining
          */
    -    public PipesForkParserConfig setHandlerConfig(HandlerConfig handlerConfig) {
    -        this.handlerConfig = handlerConfig;
    +    public PipesForkParserConfig setContentHandlerFactory(ContentHandlerFactory contentHandlerFactory) {
    +        this.contentHandlerFactory = contentHandlerFactory;
             return this;
         }
     
    +    /**
    +     * Get the parse mode.
    +     *
    +     * @return the parse mode
    +     */
    +    public ParseMode getParseMode() {
    +        return parseMode;
    +    }
    +
         /**
          * Set the handler type (TEXT, HTML, XML, etc.).
          *
    @@ -80,7 +92,7 @@ public PipesForkParserConfig setHandlerConfig(HandlerConfig handlerConfig) {
          * @return this config for chaining
          */
         public PipesForkParserConfig setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE type) {
    -        this.handlerConfig.setType(type);
    +        this.contentHandlerFactory = new BasicContentHandlerFactory(type, -1);
             return this;
         }
     
    @@ -90,8 +102,8 @@ public PipesForkParserConfig setHandlerType(BasicContentHandlerFactory.HANDLER_T
          * @param parseMode the parse mode
          * @return this config for chaining
          */
    -    public PipesForkParserConfig setParseMode(HandlerConfig.PARSE_MODE parseMode) {
    -        this.handlerConfig.setParseMode(parseMode);
    +    public PipesForkParserConfig setParseMode(ParseMode parseMode) {
    +        this.parseMode = parseMode;
             return this;
         }
     
    @@ -102,7 +114,9 @@ public PipesForkParserConfig setParseMode(HandlerConfig.PARSE_MODE parseMode) {
          * @return this config for chaining
          */
         public PipesForkParserConfig setWriteLimit(int writeLimit) {
    -        this.handlerConfig.setWriteLimit(writeLimit);
    +        if (contentHandlerFactory instanceof BasicContentHandlerFactory bcf) {
    +            this.contentHandlerFactory = new BasicContentHandlerFactory(bcf.getType(), writeLimit);
    +        }
             return this;
         }
     
    @@ -113,7 +127,9 @@ public PipesForkParserConfig setWriteLimit(int writeLimit) {
          * @return this config for chaining
          */
         public PipesForkParserConfig setMaxEmbeddedResources(int maxEmbeddedResources) {
    -        this.handlerConfig.setMaxEmbeddedResources(maxEmbeddedResources);
    +        if (contentHandlerFactory instanceof BasicContentHandlerFactory bcf) {
    +            bcf.setMaxEmbeddedResources(maxEmbeddedResources);
    +        }
             return this;
         }
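The builder now composes a ContentHandlerFactory plus a ParseMode instead of a single HandlerConfig. A usage sketch with illustrative values:

    PipesForkParserConfig config = new PipesForkParserConfig()
            .setContentHandlerFactory(new BasicContentHandlerFactory(
                    BasicContentHandlerFactory.HANDLER_TYPE.XML, 100_000))
            .setParseMode(ParseMode.CONCATENATE)
            .setMaxEmbeddedResources(10)
            .setTimeoutMillis(60_000);

Note that setHandlerType and setWriteLimit each install a fresh BasicContentHandlerFactory (discarding a previously set maxEmbeddedResources), and that setWriteLimit and setMaxEmbeddedResources are silently ignored once a non-Basic factory has been supplied; configure the factory fully before, or pass one directly to, setContentHandlerFactory.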
     
    diff --git a/tika-pipes/tika-pipes-fork-parser/src/test/java/org/apache/tika/pipes/fork/PipesForkParserTest.java b/tika-pipes/tika-pipes-fork-parser/src/test/java/org/apache/tika/pipes/fork/PipesForkParserTest.java
    index 30fc322dcef..34e56552b33 100644
    --- a/tika-pipes/tika-pipes-fork-parser/src/test/java/org/apache/tika/pipes/fork/PipesForkParserTest.java
    +++ b/tika-pipes/tika-pipes-fork-parser/src/test/java/org/apache/tika/pipes/fork/PipesForkParserTest.java
    @@ -38,7 +38,7 @@
     
     import org.apache.tika.io.TikaInputStream;
     import org.apache.tika.metadata.Metadata;
    -import org.apache.tika.pipes.api.HandlerConfig;
    +import org.apache.tika.pipes.api.ParseMode;
     import org.apache.tika.pipes.api.PipesResult;
     import org.apache.tika.sax.BasicContentHandlerFactory;
     
    @@ -80,7 +80,7 @@ public void testParseTextFile() throws Exception {
             PipesForkParserConfig config = new PipesForkParserConfig()
                     .setPluginsDir(PLUGINS_DIR)
                     .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
    -                .setParseMode(HandlerConfig.PARSE_MODE.RMETA)
    +                .setParseMode(ParseMode.RMETA)
                     .setTimeoutMillis(60000)
                     .addJvmArg("-Xmx256m");
     
    @@ -114,7 +114,7 @@ public void testParseWithMetadata() throws Exception {
             PipesForkParserConfig config = new PipesForkParserConfig()
                     .setPluginsDir(PLUGINS_DIR)
                     .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
    -                .setParseMode(HandlerConfig.PARSE_MODE.RMETA)
    +                .setParseMode(ParseMode.RMETA)
                     .setTimeoutMillis(60000);
     
             try (PipesForkParser parser = new PipesForkParser(config);
    @@ -144,7 +144,7 @@ public void testParseMultipleFiles() throws Exception {
             PipesForkParserConfig config = new PipesForkParserConfig()
                     .setPluginsDir(PLUGINS_DIR)
                     .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
    -                .setParseMode(HandlerConfig.PARSE_MODE.RMETA)
    +                .setParseMode(ParseMode.RMETA)
                     .setTimeoutMillis(60000);
     
             try (PipesForkParser parser = new PipesForkParser(config)) {
    @@ -171,7 +171,7 @@ public void testConcatenateMode() throws Exception {
             PipesForkParserConfig config = new PipesForkParserConfig()
                     .setPluginsDir(PLUGINS_DIR)
                     .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
    -                .setParseMode(HandlerConfig.PARSE_MODE.CONCATENATE)
    +                .setParseMode(ParseMode.CONCATENATE)
                     .setTimeoutMillis(60000);
     
             try (PipesForkParser parser = new PipesForkParser(config);
    @@ -204,7 +204,7 @@ public void testRmetaModeWithEmbedded() throws Exception {
             PipesForkParserConfig config = new PipesForkParserConfig()
                     .setPluginsDir(PLUGINS_DIR)
                     .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
    -                .setParseMode(HandlerConfig.PARSE_MODE.RMETA)
    +                .setParseMode(ParseMode.RMETA)
                     .setTimeoutMillis(60000);
     
             try (PipesForkParser parser = new PipesForkParser(config);
    @@ -232,7 +232,7 @@ public void testDefaultConfigMatchesExplicitRmeta() throws Exception {
             PipesForkParserConfig explicitConfig = new PipesForkParserConfig()
                     .setPluginsDir(PLUGINS_DIR)
                     .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
    -                .setParseMode(HandlerConfig.PARSE_MODE.RMETA)
    +                .setParseMode(ParseMode.RMETA)
                     .setTimeoutMillis(60000);
     
             int explicitMetadataCount;
    @@ -268,7 +268,7 @@ public void testTextVsXhtmlHandlerType() throws Exception {
             PipesForkParserConfig textConfig = new PipesForkParserConfig()
                     .setPluginsDir(PLUGINS_DIR)
                     .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
    -                .setParseMode(HandlerConfig.PARSE_MODE.RMETA)
    +                .setParseMode(ParseMode.RMETA)
                     .setTimeoutMillis(60000);
     
             String textContent;
    @@ -288,7 +288,7 @@ public void testTextVsXhtmlHandlerType() throws Exception {
             PipesForkParserConfig xmlConfig = new PipesForkParserConfig()
                     .setPluginsDir(PLUGINS_DIR)
                     .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.XML)
    -                .setParseMode(HandlerConfig.PARSE_MODE.RMETA)
    +                .setParseMode(ParseMode.RMETA)
                     .setTimeoutMillis(60000);
     
             String xmlContent;
    @@ -322,7 +322,7 @@ public void testWriteLimit() throws Exception {
             PipesForkParserConfig config = new PipesForkParserConfig()
                     .setPluginsDir(PLUGINS_DIR)
                     .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
    -                .setParseMode(HandlerConfig.PARSE_MODE.RMETA)
    +                .setParseMode(ParseMode.RMETA)
                     .setWriteLimit(100)  // Limit to 100 characters
                     .setTimeoutMillis(60000);
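These tests exercise the enum constants directly; string values, such as the "parseMode" entries in the JSON configs further down, go through ParseMode.parse (presumably via PipesConfig.setParseMode(String)), which trims and upper-cases its input. A small illustrative check:

    assert ParseMode.parse("rmeta") == ParseMode.RMETA;
    assert ParseMode.parse(" Concatenate ") == ParseMode.CONCATENATE;
    try {
        ParseMode.parse("both"); // not a valid mode
    } catch (IllegalArgumentException expected) {
        // "Invalid parse mode: 'both'. Must be one of: RMETA, CONCATENATE"
    }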
     
    diff --git a/tika-pipes/tika-pipes-integration-tests/pom.xml b/tika-pipes/tika-pipes-integration-tests/pom.xml
    index d1833aa5002..56bb2d1225b 100644
    --- a/tika-pipes/tika-pipes-integration-tests/pom.xml
    +++ b/tika-pipes/tika-pipes-integration-tests/pom.xml
    @@ -141,6 +141,14 @@
               
             
           
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-surefire-plugin</artifactId>
+        <configuration>
+          
+          false
+        </configuration>
+      </plugin>
     
         
       
    diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java
    index b85bcdf516a..1cba2622ac2 100644
    --- a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java
    +++ b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java
    @@ -617,6 +617,54 @@ public void testEmitterNotFound(@TempDir Path tmp) throws Exception {
             }
         }
     
    +    @Test
    +    public void testCustomContentHandlerFactory(@TempDir Path tmp) throws Exception {
    +        // Test that a custom ContentHandlerFactory configured in tika-config.json
    +        // is properly used during parsing. The UppercasingContentHandlerFactory
    +        // converts all extracted text to uppercase.
    +        Path inputDir = tmp.resolve("input");
    +        Files.createDirectories(inputDir);
    +
    +        // Create a simple mock XML file with known content
    +        String mockContent = "" + "" +
    +                "Test Author" +
    +                "Hello World from Tika" +
    +                "";
    +        String testFile = "test-uppercase.xml";
    +        Files.write(inputDir.resolve(testFile), mockContent.getBytes(StandardCharsets.UTF_8));
    +
    +        // Use the uppercasing config
    +        Path tikaConfigPath = PluginsTestHelper.getFileSystemFetcherConfig(
    +                "tika-config-uppercasing.json", tmp, inputDir, tmp.resolve("output"), false);
    +        TikaJsonConfig tikaJsonConfig = TikaJsonConfig.load(tikaConfigPath);
    +        PipesConfig pipesConfig = PipesConfig.load(tikaJsonConfig);
    +
    +        try (PipesClient pipesClient = new PipesClient(pipesConfig, tikaConfigPath)) {
    +            FetchEmitTuple tuple = new FetchEmitTuple(testFile,
    +                    new FetchKey(fetcherName, testFile),
    +                    new EmitKey(), new Metadata(), new ParseContext(),
    +                    FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP);
    +
    +            PipesResult pipesResult = pipesClient.process(tuple);
    +
    +            // Should succeed
    +            assertTrue(pipesResult.isSuccess(),
    +                    "Processing should succeed. Got status: " + pipesResult.status() +
    +                            ", message: " + pipesResult.message());
    +
    +            Assertions.assertNotNull(pipesResult.emitData().getMetadataList());
    +            assertEquals(1, pipesResult.emitData().getMetadataList().size());
    +
    +            Metadata metadata = pipesResult.emitData().getMetadataList().get(0);
    +
    +            // The content should be uppercased due to UppercasingContentHandlerFactory
    +            String content = metadata.get(TikaCoreProperties.TIKA_CONTENT);
    +            Assertions.assertNotNull(content, "Content should not be null");
    +            assertTrue(content.contains("HELLO WORLD FROM TIKA"),
    +                    "Content should be uppercased. Actual content: " + content);
    +        }
    +    }
    +
         @Test
         public void testHeartbeatProtocol(@TempDir Path tmp) throws Exception {
             // Test that heartbeat protocol works correctly and doesn't cause protocol errors
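The UppercasingContentHandlerFactory referenced by this test and by tika-config-uppercasing.json further down is a test fixture that is not part of this diff. A rough sketch of what such a factory could look like, assuming ContentHandlerFactory only requires the createHandler() method that ParseHandler calls in this patch, and that the class is registered as a @TikaComponent so the "uppercasing-content-handler-factory" key can resolve to it:

    import java.util.Locale;

    import org.xml.sax.ContentHandler;
    import org.xml.sax.SAXException;

    import org.apache.tika.config.TikaComponent;
    import org.apache.tika.sax.ContentHandlerFactory;
    import org.apache.tika.sax.ToTextContentHandler;

    @TikaComponent
    public class UppercasingContentHandlerFactory implements ContentHandlerFactory {

        @Override
        public ContentHandler createHandler() {
            // Buffer plain text, upper-casing every character event on the way in.
            return new ToTextContentHandler() {
                @Override
                public void characters(char[] ch, int start, int length) throws SAXException {
                    char[] upper = new String(ch, start, length)
                            .toUpperCase(Locale.ROOT).toCharArray();
                    super.characters(upper, 0, upper.length);
                }
            };
        }
    }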
    diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesServerTest.java b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesServerTest.java
    index 7c137084c79..621822fd236 100644
    --- a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesServerTest.java
    +++ b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesServerTest.java
    @@ -16,9 +16,6 @@
      */
     package org.apache.tika.pipes.core;
     
    -
    -
    -
     import org.apache.tika.TikaTest;
     
     public class PipesServerTest extends TikaTest {
    diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/async/MockDigesterFactory.java b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/async/MockDigesterFactory.java
    deleted file mode 100644
    index 9df3e9866f3..00000000000
    --- a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/async/MockDigesterFactory.java
    +++ /dev/null
    @@ -1,49 +0,0 @@
    -/*
    - * Licensed to the Apache Software Foundation (ASF) under one or more
    - * contributor license agreements.  See the NOTICE file distributed with
    - * this work for additional information regarding copyright ownership.
    - * The ASF licenses this file to You under the Apache License, Version 2.0
    - * (the "License"); you may not use this file except in compliance with
    - * the License.  You may obtain a copy of the License at
    - *
    - *     http://www.apache.org/licenses/LICENSE-2.0
    - *
    - * Unless required by applicable law or agreed to in writing, software
    - * distributed under the License is distributed on an "AS IS" BASIS,
    - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    - * See the License for the specific language governing permissions and
    - * limitations under the License.
    - */
    -package org.apache.tika.pipes.core.async;
    -
    -import org.apache.tika.config.TikaComponent;
    -import org.apache.tika.digest.Digester;
    -import org.apache.tika.digest.DigesterFactory;
    -import org.apache.tika.digest.Encoder;
    -import org.apache.tika.digest.InputStreamDigester;
    -
    -@TikaComponent
    -public class MockDigesterFactory implements DigesterFactory {
    -
    -    @Override
    -    public Digester build() {
    -        return new InputStreamDigester(1000000, "SHA-256", "X-TIKA:digest:SHA-256", new MockEncoder());
    -    }
    -
    -    private static class MockEncoder implements Encoder {
    -
    -        @Override
    -        public String encode(byte[] bytes) {
    -            StringBuilder hexString = new StringBuilder(2 * bytes.length);
    -            for (int i = 0; i < bytes.length; i++) {
    -                String hex = Integer.toHexString(0xff & bytes[i]);
    -                if (hex.length() == 1) {
    -                    hexString.append('0');
    -                }
    -                hexString.append(hex);
    -            }
    -            return hexString.toString();
    -        }
    -    }
    -
    -}
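In the config diffs that follow, the nested baseConfig/handlerConfig block disappears: the pipes-iterator keeps only fetcherId/emitterId, the "pipes" block picks up parseMode and onParseException, and a top-level "content-handler-factory" block configures the default handler (loaded separately via TikaLoader.loadContentHandlerFactory(), as in PipesServer earlier in this patch). A sketch of reading the pipes defaults back, using the load calls that PipesClientTest uses above (the path is illustrative):

    import java.nio.file.Path;
    import java.nio.file.Paths;

    import org.apache.tika.config.loader.TikaJsonConfig;
    import org.apache.tika.pipes.api.FetchEmitTuple;
    import org.apache.tika.pipes.api.ParseMode;
    import org.apache.tika.pipes.core.PipesConfig;

    public final class PipesDefaultsExample {

        public static void main(String[] args) throws Exception {
            Path tikaConfigPath = Paths.get("tika-config-basic.json");
            TikaJsonConfig tikaJsonConfig = TikaJsonConfig.load(tikaConfigPath);
            PipesConfig pipesConfig = PipesConfig.load(tikaJsonConfig);

            // Defaults applied to any tuple whose ParseContext does not override them.
            ParseMode parseMode = pipesConfig.getParseMode();                           // RMETA
            FetchEmitTuple.ON_PARSE_EXCEPTION onEx = pipesConfig.getOnParseException(); // EMIT
            System.out.println(parseMode + " / " + onEx);
        }
    }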
    diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json
    index f0283182078..5873c39a87b 100644
    --- a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json
    +++ b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json
    @@ -1,4 +1,12 @@
     {
    +  "content-handler-factory": {
    +    "basic-content-handler-factory": {
    +      "type": "TEXT",
    +      "writeLimit": -1,
    +      "maxEmbeddedResources": -1,
    +      "throwOnWriteLimitReached": true
    +    }
    +  },
       "fetchers": {
         "fsf": {
           "file-system-fetcher": {
    @@ -18,27 +26,15 @@
       },
       "pipes-iterator": {
         "file-system-pipes-iterator": {
    -      "fspi": {
    -        "basePath": "FETCHER_BASE_PATH",
    -        "countTotal": true,
    -        "baseConfig": {
    -          "fetcherId": "fsf",
    -          "emitterId": "fse",
    -          "handlerConfig": {
    -            "type": "TEXT",
    -            "parseMode": "RMETA",
    -            "writeLimit": -1,
    -            "maxEmbeddedResources": -1,
    -            "throwOnWriteLimitReached": true
    -          },
    -          "onParseException": "EMIT",
    -          "maxWaitMs": 600000,
    -          "queueSize": 10000
    -        }
    -      }
    +      "basePath": "FETCHER_BASE_PATH",
    +      "countTotal": true,
    +      "fetcherId": "fsf",
    +      "emitterId": "fse"
         }
       },
       "pipes": {
    +    "parseMode": "RMETA",
    +    "onParseException": "EMIT",
         "numClients": 4,
         "timeoutMillis": 5000,
         "emitIntermediateResults": "EMIT_INTERMEDIATE_RESULTS",
    diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-passback.json b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-passback.json
    index 153a68796dc..529e878cb60 100644
    --- a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-passback.json
    +++ b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-passback.json
    @@ -1,4 +1,12 @@
     {
    +  "content-handler-factory": {
    +    "basic-content-handler-factory": {
    +      "type": "TEXT",
    +      "writeLimit": -1,
    +      "maxEmbeddedResources": -1,
    +      "throwOnWriteLimitReached": true
    +    }
    +  },
       "fetchers": {
         "fsf": {
           "file-system-fetcher": {
    @@ -18,27 +26,15 @@
       },
       "pipes-iterator": {
         "file-system-pipes-iterator": {
    -      "fspi": {
    -        "basePath": "FETCHER_BASE_PATH",
    -        "countTotal": true,
    -        "baseConfig": {
    -          "fetcherId": "fsf",
    -          "emitterId": "fse",
    -          "handlerConfig": {
    -            "type": "TEXT",
    -            "parseMode": "RMETA",
    -            "writeLimit": -1,
    -            "maxEmbeddedResources": -1,
    -            "throwOnWriteLimitReached": true
    -          },
    -          "onParseException": "EMIT",
    -          "maxWaitMs": 600000,
    -          "queueSize": 10000
    -        }
    -      }
    +      "basePath": "FETCHER_BASE_PATH",
    +      "countTotal": true,
    +      "fetcherId": "fsf",
    +      "emitterId": "fse"
         }
       },
       "pipes": {
    +    "parseMode": "RMETA",
    +    "onParseException": "EMIT",
         "numClients": 4,
         "timeoutMillis": 5000,
         "emitIntermediateResults": "EMIT_INTERMEDIATE_RESULTS",
    diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json
    index 873ce685a52..b58bfe269c6 100644
    --- a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json
    +++ b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json
    @@ -1,4 +1,12 @@
     {
    +  "content-handler-factory": {
    +    "basic-content-handler-factory": {
    +      "type": "TEXT",
    +      "writeLimit": -1,
    +      "maxEmbeddedResources": -1,
    +      "throwOnWriteLimitReached": true
    +    }
    +  },
       "fetchers": {
         "fsf": {
           "file-system-fetcher": {
    @@ -18,30 +26,18 @@
       },
       "pipes-iterator": {
         "file-system-pipes-iterator": {
    -      "fspi": {
    -        "basePath": "FETCHER_BASE_PATH",
    -        "countTotal": true,
    -        "baseConfig": {
    -          "fetcherId": "fsf",
    -          "emitterId": "fse",
    -          "handlerConfig": {
    -            "type": "TEXT",
    -            "parseMode": "RMETA",
    -            "writeLimit": -1,
    -            "maxEmbeddedResources": -1,
    -            "throwOnWriteLimitReached": true
    -          },
    -          "onParseException": "EMIT",
    -          "maxWaitMs": 600000,
    -          "queueSize": 10000
    -        }
    -      }
    +      "basePath": "FETCHER_BASE_PATH",
    +      "countTotal": true,
    +      "fetcherId": "fsf",
    +      "emitterId": "fse"
         }
       },
       "pipes": {
    +    "parseMode": "RMETA",
    +    "onParseException": "EMIT",
         "numClients": 4,
         "timeoutMillis": 5000,
    -    "emitIntermediateResults": EMIT_INTERMEDIATE_RESULTS,
    +    "emitIntermediateResults": "EMIT_INTERMEDIATE_RESULTS",
         "forkedJvmArgs": ["-Xmx512m"],
         "emitStrategy": {
           "type": "DYNAMIC",
    diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-uppercasing.json b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-uppercasing.json
    new file mode 100644
    index 00000000000..e7d8a21c028
    --- /dev/null
    +++ b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-uppercasing.json
    @@ -0,0 +1,52 @@
    +{
    +  "content-handler-factory": {
    +    "uppercasing-content-handler-factory": {}
    +  },
    +  "fetchers": {
    +    "fsf": {
    +      "file-system-fetcher": {
    +        "basePath": "FETCHER_BASE_PATH",
    +        "extractFileSystemMetadata": false
    +      }
    +    }
    +  },
    +  "emitters": {
    +    "fse": {
    +      "file-system-emitter": {
    +        "basePath": "EMITTER_BASE_PATH",
    +        "fileExtension": "json",
    +        "onExists": "EXCEPTION"
    +      }
    +    }
    +  },
    +  "pipes-iterator": {
    +    "file-system-pipes-iterator": {
    +      "basePath": "FETCHER_BASE_PATH",
    +      "countTotal": true,
    +      "fetcherId": "fsf",
    +      "emitterId": "fse"
    +    }
    +  },
    +  "pipes": {
    +    "parseMode": "RMETA",
    +    "onParseException": "EMIT",
    +    "numClients": 4,
    +    "timeoutMillis": 5000,
    +    "emitIntermediateResults": "EMIT_INTERMEDIATE_RESULTS",
    +    "forkedJvmArgs": ["-Xmx512m"],
    +    "emitStrategy": {
    +      "type": "DYNAMIC",
    +      "thresholdBytes": 1000000
    +    }
    +  },
    +  "auto-detect-parser": {
    +    "spoolToDisk": 1000000,
    +    "outputThreshold": 1000000,
    +    "skipContainerDocumentDigest": false,
    +    "digesterFactory": {
    +      "mock-digester-factory": {}
    +    },
    +    "throwOnZeroBytes": false
    +  },
    +  "plugin-roots": "PLUGINS_PATHS"
    +}
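For comparison with the configs above, switching such a config to concatenated output with a write limit would look roughly like the following fragment; the values are illustrative and the remaining keys stay as shown:

    "content-handler-factory": {
      "basic-content-handler-factory": {
        "type": "TEXT",
        "writeLimit": 100000,
        "maxEmbeddedResources": 10,
        "throwOnWriteLimitReached": false
      }
    },
    "pipes": {
      "parseMode": "CONCATENATE",
      "onParseException": "SKIP"
    }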
    diff --git a/tika-pipes/tika-pipes-iterator-commons/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIteratorBase.java b/tika-pipes/tika-pipes-iterator-commons/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIteratorBase.java
    index 4fd11352da1..8a4622dcb8e 100644
    --- a/tika-pipes/tika-pipes-iterator-commons/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIteratorBase.java
    +++ b/tika-pipes/tika-pipes-iterator-commons/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIteratorBase.java
    @@ -54,6 +54,16 @@ public abstract class PipesIteratorBase extends AbstractTikaExtension implements
         private int added = 0;
         private FutureTask futureTask;
     
    +    /**
    +     * The fetcher ID to use for fetching documents.
    +     */
    +    private String fetcherId;
    +
    +    /**
    +     * The emitter ID to use for emitting results.
    +     */
    +    private String emitterId;
    +
         public PipesIteratorBase(ExtensionConfig pluginConfig) {
             super(pluginConfig);
         }
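
With fetcherId/emitterId carried by the iterator config rather than a nested baseConfig, concrete iterators build FetchEmitTuples as sketched below. This is illustrative only: ListPipesIterator and its inline config are hypothetical, it assumes enqueue() is the only abstract method a subclass must implement here, and it mirrors the pattern the iterator implementations later in this patch switch to (handler settings no longer travel through the ParseContext):

import java.io.IOException;
import java.util.List;
import java.util.concurrent.TimeoutException;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.api.FetchEmitTuple;
import org.apache.tika.pipes.api.emitter.EmitKey;
import org.apache.tika.pipes.api.fetcher.FetchKey;
import org.apache.tika.pipes.pipesiterator.PipesIteratorBase;
import org.apache.tika.pipes.pipesiterator.PipesIteratorConfig;
import org.apache.tika.plugins.ExtensionConfig;

// Hypothetical iterator, not part of this patch.
public class ListPipesIterator extends PipesIteratorBase {

    // Minimal concrete config: fetcherId/emitterId come from the base class below.
    public static class ListPipesIteratorConfig extends PipesIteratorConfig {
    }

    private final ListPipesIteratorConfig config;

    public ListPipesIterator(ListPipesIteratorConfig config, ExtensionConfig extensionConfig) {
        super(extensionConfig);
        this.config = config;
    }

    @Override
    protected void enqueue() throws InterruptedException, IOException, TimeoutException {
        String fetcherId = config.getFetcherId();
        String emitterId = config.getEmitterId();
        for (String key : List.of("0.xml", "1.xml")) {
            // Handler settings are no longer pushed through the ParseContext here;
            // they come from the tika-config.json loaded elsewhere.
            tryToAdd(new FetchEmitTuple(key, new FetchKey(fetcherId, key),
                    new EmitKey(emitterId, key), new Metadata(), new ParseContext(),
                    FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT));
        }
    }
}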
    diff --git a/tika-pipes/tika-pipes-iterator-commons/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIteratorConfig.java b/tika-pipes/tika-pipes-iterator-commons/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIteratorConfig.java
    new file mode 100644
    index 00000000000..e8356a64a86
    --- /dev/null
    +++ b/tika-pipes/tika-pipes-iterator-commons/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIteratorConfig.java
    @@ -0,0 +1,61 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements.  See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License.  You may obtain a copy of the License at
    + *
    + *     http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +package org.apache.tika.pipes.pipesiterator;
    +
    +import java.util.Objects;
    +
    +/**
    + * Abstract base class for pipes iterator configurations.
    + * Provides the common fetcherId and emitterId fields that all iterators need.
+ * <p>
+ * ContentHandlerFactory, ParseMode, and other parsing settings should be loaded
+ * from tika-config.json via TikaLoader and set in PipesConfig.
+ */
+public abstract class PipesIteratorConfig {
+
+    private String fetcherId;
+    private String emitterId;
+
+    public String getFetcherId() {
+        return fetcherId;
+    }
+
+    public void setFetcherId(String fetcherId) {
+        this.fetcherId = fetcherId;
+    }
+
+    public String getEmitterId() {
+        return emitterId;
+    }
+
+    public void setEmitterId(String emitterId) {
+        this.emitterId = emitterId;
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) return true;
+        if (!(o instanceof PipesIteratorConfig that)) return false;
+        return Objects.equals(fetcherId, that.fetcherId) &&
+                Objects.equals(emitterId, that.emitterId);
+    }
+
+    @Override
+    public int hashCode() {
+        return Objects.hash(fetcherId, emitterId);
+    }
+}
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/main/java/org/apache/tika/pipes/iterator/azblob/AZBlobPipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/main/java/org/apache/tika/pipes/iterator/azblob/AZBlobPipesIterator.java
index 8d56f2e87d7..855059914c6 100644
--- a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/main/java/org/apache/tika/pipes/iterator/azblob/AZBlobPipesIterator.java
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/main/java/org/apache/tika/pipes/iterator/azblob/AZBlobPipesIterator.java
@@ -36,10 +36,8 @@
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.pipes.api.FetchEmitTuple;
-import org.apache.tika.pipes.api.HandlerConfig;
 import org.apache.tika.pipes.api.emitter.EmitKey;
 import org.apache.tika.pipes.api.fetcher.FetchKey;
-import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig;
 import org.apache.tika.pipes.pipesiterator.PipesIteratorBase;
 import org.apache.tika.plugins.ExtensionConfig;
 import org.apache.tika.utils.StringUtils;
@@ -81,10 +79,8 @@ private void checkConfig(AZBlobPipesIteratorConfig config) throws TikaConfigExce
 
     @Override
     protected void enqueue() throws InterruptedException, IOException, TimeoutException {
-        PipesIteratorBaseConfig baseConfig = config.getBaseConfig();
-        String fetcherId = baseConfig.fetcherId();
-        String emitterId = baseConfig.emitterId();
-        HandlerConfig handlerConfig = baseConfig.handlerConfig();
+        String fetcherId = config.getFetcherId();
+        String emitterId = config.getEmitterId();
         long start = System.currentTimeMillis();
         int count = 0;
 
@@ -125,10 +121,9 @@ protected void enqueue() throws InterruptedException, IOException, TimeoutExcept
             }
             //TODO -- extract metadata about content length etc from properties
             ParseContext parseContext = new ParseContext();
-            parseContext.set(HandlerConfig.class, handlerConfig);
             tryToAdd(new FetchEmitTuple(blob.getName(), new FetchKey(fetcherId, blob.getName()),
                     new EmitKey(emitterId, blob.getName()), new Metadata(), parseContext,
-                    baseConfig.onParseException()));
+                    FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT));
             count++;
         }
         long elapsed = System.currentTimeMillis() - start;
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/main/java/org/apache/tika/pipes/iterator/azblob/AZBlobPipesIteratorConfig.java b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/main/java/org/apache/tika/pipes/iterator/azblob/AZBlobPipesIteratorConfig.java
index 068ff346044..ef3d78a49a8 100644
--- a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/main/java/org/apache/tika/pipes/iterator/azblob/AZBlobPipesIteratorConfig.java
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/main/java/org/apache/tika/pipes/iterator/azblob/AZBlobPipesIteratorConfig.java
@@ -22,10 +22,9 @@
 import com.fasterxml.jackson.databind.ObjectMapper;
 
 import org.apache.tika.exception.TikaConfigException;
-import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig;
-import org.apache.tika.pipes.api.pipesiterator.PipesIteratorConfig;
+import org.apache.tika.pipes.pipesiterator.PipesIteratorConfig;
 
-public class AZBlobPipesIteratorConfig implements PipesIteratorConfig {
+public class AZBlobPipesIteratorConfig extends PipesIteratorConfig {
 
     private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
 
@@ -45,7 +44,6 @@ public static AZBlobPipesIteratorConfig load(final String json)
     private String container;
     private String prefix = "";
     private long timeoutMillis = 360000;
-    private PipesIteratorBaseConfig baseConfig = null;
 
     public String getSasToken() {
         return sasToken;
@@ -68,32 +66,28 @@ public long getTimeoutMillis() {
     }
 
     @Override
-    public PipesIteratorBaseConfig getBaseConfig() {
-        return baseConfig;
-    }
-
-    @Override
-    public final boolean equals(Object o) {
+    public boolean equals(Object o) {
         if (!(o instanceof AZBlobPipesIteratorConfig that)) {
             return false;
         }
-
+        if (!super.equals(o)) {
+            return false;
+        }
         return timeoutMillis == that.timeoutMillis &&
                 Objects.equals(sasToken, that.sasToken) &&
                 Objects.equals(endpoint, that.endpoint) &&
                 Objects.equals(container, that.container) &&
-                Objects.equals(prefix, that.prefix) &&
-                Objects.equals(baseConfig, that.baseConfig);
+                Objects.equals(prefix, that.prefix);
     }
 
     @Override
     public int hashCode() {
-        int result = Objects.hashCode(sasToken);
+        int result = super.hashCode();
+        result = 31 * result + Objects.hashCode(sasToken);
         result = 31 * result + Objects.hashCode(endpoint);
         result = 31 * result + Objects.hashCode(container);
         result = 31 * result + Objects.hashCode(prefix);
         result = 31 * result + Long.hashCode(timeoutMillis);
-        result = 31 * result + Objects.hashCode(baseConfig);
         return result;
     }
 }
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/java/org/apache/tika/pipes/iterator/azblob/TestAZBlobPipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/java/org/apache/tika/pipes/iterator/azblob/TestAZBlobPipesIterator.java
index 298b16ebc84..4c81e4ae8df 100644
--- a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/java/org/apache/tika/pipes/iterator/azblob/TestAZBlobPipesIterator.java
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/java/org/apache/tika/pipes/iterator/azblob/TestAZBlobPipesIterator.java
@@ -48,10 +48,9 @@ public void testSimple() throws Exception {
         configNode.put("endpoint", ""); // use one
         configNode.put("sasToken", ""); // find one
 
-        ObjectNode baseConfigNode = MAPPER.createObjectNode();
-        baseConfigNode.put("fetcherId", "az-blob");
-        baseConfigNode.put("emitterId", "test-emitter");
-        configNode.set("baseConfig", baseConfigNode);
+        // Add fetcherId and emitterId at root level (not nested in baseConfig)
+        configNode.put("fetcherId", "az-blob");
+        configNode.put("emitterId", "test-emitter");
 
         ExtensionConfig extensionConfig = new ExtensionConfig("test-az-blob", "az-blob-pipes-iterator",
                 MAPPER.writeValueAsString(configNode));
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/main/java/org/apache/tika/pipes/iterator/csv/CSVPipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/main/java/org/apache/tika/pipes/iterator/csv/CSVPipesIterator.java
index 7ca24c03e15..317db26e132 100644
--- a/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/main/java/org/apache/tika/pipes/iterator/csv/CSVPipesIterator.java
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/main/java/org/apache/tika/pipes/iterator/csv/CSVPipesIterator.java
@@ -34,7 +34,6 @@
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.pipes.api.FetchEmitTuple;
-import org.apache.tika.pipes.api.HandlerConfig;
 import org.apache.tika.pipes.api.emitter.EmitKey;
 import org.apache.tika.pipes.api.fetcher.FetchKey;
 import org.apache.tika.pipes.pipesiterator.PipesIteratorBase;
@@ -91,8 +90,8 @@ public static CSVPipesIterator build(ExtensionConfig extensionConfig) throws IOE
 
     @Override
     protected void enqueue() throws InterruptedException, IOException, TimeoutException {
-        String fetcherPluginId = config.getBaseConfig().fetcherId();
-        String emitterName = config.getBaseConfig().emitterId();
+        String fetcherId = config.getFetcherId();
+        String emitterId = config.getEmitterId();
         try (Reader reader = Files.newBufferedReader(config.getCsvPath(), charset)) {
             Iterable<CSVRecord> records = CSVFormat.EXCEL.parse(reader);
             List<String> headers = new ArrayList<>();
@@ -103,17 +102,16 @@ protected void enqueue() throws InterruptedException, IOException, TimeoutExcept
             }
 
             try {
-                checkFetchEmitValidity(fetcherPluginId, emitterName, fetchEmitKeyIndices, headers);
+                checkFetchEmitValidity(fetcherId, emitterId, fetchEmitKeyIndices, headers);
             } catch (TikaConfigException e) {
                 throw new IOException(e);
             }
-            HandlerConfig handlerConfig = config.getBaseConfig().handlerConfig();
             for (CSVRecord record : records) {
                 String id = record.get(fetchEmitKeyIndices.idIndex);
                 String fetchKey = record.get(fetchEmitKeyIndices.fetchKeyIndex);
                 String emitKey = record.get(fetchEmitKeyIndices.emitKeyIndex);
-                if (StringUtils.isBlank(fetchKey) && !StringUtils.isBlank(fetcherPluginId)) {
-                    LOGGER.debug("Fetcher specified ({}), but no fetchkey was found in ({})", fetcherPluginId, record);
+                if (StringUtils.isBlank(fetchKey) && !StringUtils.isBlank(fetcherId)) {
+                    LOGGER.debug("Fetcher specified ({}), but no fetchkey was found in ({})", fetcherId, record);
                 }
                 if (StringUtils.isBlank(emitKey)) {
                     throw new IOException("emitKey must not be blank in :" + record);
@@ -121,27 +119,26 @@ protected void enqueue() throws InterruptedException, IOException, TimeoutExcept
                 Metadata metadata = loadMetadata(fetchEmitKeyIndices, headers, record);
                 ParseContext parseContext = new ParseContext();
-                parseContext.set(HandlerConfig.class, handlerConfig);
-                tryToAdd(new FetchEmitTuple(id, new FetchKey(fetcherPluginId, fetchKey), new EmitKey(emitterName, emitKey), metadata, parseContext,
-                        config.getBaseConfig().onParseException()));
+                tryToAdd(new FetchEmitTuple(id, new FetchKey(fetcherId, fetchKey), new EmitKey(emitterId, emitKey), metadata, parseContext,
+                        FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT));
             }
         }
     }
 
-    private void checkFetchEmitValidity(String fetcherPluginId, String emitterName, FetchEmitKeyIndices fetchEmitKeyIndices, List<String> headers) throws TikaConfigException {
+    private void checkFetchEmitValidity(String fetcherId, String emitterId, FetchEmitKeyIndices fetchEmitKeyIndices, List<String> headers) throws TikaConfigException {
         String fetchKeyColumn = config.getFetchKeyColumn();
         String emitKeyColumn = config.getEmitKeyColumn();
         String idColumn = config.getIdColumn();
-        if (StringUtils.isBlank(emitterName)) {
-            throw new TikaConfigException("must specify at least an emitterName");
+        if (StringUtils.isBlank(emitterId)) {
+            throw new TikaConfigException("must specify at least an emitterId");
         }
-        if (StringUtils.isBlank(fetcherPluginId) && !StringUtils.isBlank(fetchKeyColumn)) {
-            throw new TikaConfigException("If specifying a 'fetchKeyColumn', " + "you must also specify a 'fetcherPluginId'");
+        if (StringUtils.isBlank(fetcherId) && !StringUtils.isBlank(fetchKeyColumn)) {
+            throw new TikaConfigException("If specifying a 'fetchKeyColumn', " + "you must also specify a 'fetcherId'");
         }
-        if (StringUtils.isBlank(fetcherPluginId)) {
+        if (StringUtils.isBlank(fetcherId)) {
             LOGGER.info("No fetcher specified. This will be metadata only");
         }
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/main/java/org/apache/tika/pipes/iterator/csv/CSVPipesIteratorConfig.java b/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/main/java/org/apache/tika/pipes/iterator/csv/CSVPipesIteratorConfig.java
index 46bee035e9a..3a5231821d8 100644
--- a/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/main/java/org/apache/tika/pipes/iterator/csv/CSVPipesIteratorConfig.java
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/main/java/org/apache/tika/pipes/iterator/csv/CSVPipesIteratorConfig.java
@@ -23,10 +23,9 @@
 import com.fasterxml.jackson.databind.ObjectMapper;
 
 import org.apache.tika.exception.TikaConfigException;
-import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig;
-import org.apache.tika.pipes.api.pipesiterator.PipesIteratorConfig;
+import org.apache.tika.pipes.pipesiterator.PipesIteratorConfig;
 
-public class CSVPipesIteratorConfig implements PipesIteratorConfig {
+public class CSVPipesIteratorConfig extends PipesIteratorConfig {
 
     private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
 
@@ -45,7 +44,6 @@ public static CSVPipesIteratorConfig load(final String json)
     private String fetchKeyColumn;
     private String emitKeyColumn;
     private String idColumn;
-    private PipesIteratorBaseConfig baseConfig = null;
 
     public Path getCsvPath() {
         return csvPath;
@@ -64,30 +62,26 @@ public String getIdColumn() {
     }
 
     @Override
-    public PipesIteratorBaseConfig getBaseConfig() {
-        return baseConfig;
-    }
-
-    @Override
-    public final boolean equals(Object o) {
+    public boolean equals(Object o) {
         if (!(o instanceof CSVPipesIteratorConfig that)) {
             return false;
         }
-
+        if (!super.equals(o)) {
+            return false;
+        }
         return Objects.equals(csvPath, that.csvPath) &&
                 Objects.equals(fetchKeyColumn, that.fetchKeyColumn) &&
                 Objects.equals(emitKeyColumn, that.emitKeyColumn) &&
-                Objects.equals(idColumn, that.idColumn) &&
-                Objects.equals(baseConfig, that.baseConfig);
+                Objects.equals(idColumn, that.idColumn);
     }
 
     @Override
     public int hashCode() {
-        int result = Objects.hashCode(csvPath);
+        int result = super.hashCode();
+        result = 31 * result + Objects.hashCode(csvPath);
         result = 31 * result + Objects.hashCode(fetchKeyColumn);
         result = 31 * result + Objects.hashCode(emitKeyColumn);
         result = 31 * result + Objects.hashCode(idColumn);
-        result = 31 * result + Objects.hashCode(baseConfig);
         return result;
     }
 }
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/test/java/org/apache/tika/pipes/iterator/csv/TestCSVPipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/test/java/org/apache/tika/pipes/iterator/csv/TestCSVPipesIterator.java
index d423119e9d9..b2f090231b3 100644
--- a/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/test/java/org/apache/tika/pipes/iterator/csv/TestCSVPipesIterator.java
+++
b/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/test/java/org/apache/tika/pipes/iterator/csv/TestCSVPipesIterator.java @@ -114,11 +114,9 @@ private CSVPipesIterator createIterator(Path csvPath, String fetcherName, String jsonConfig.put("idColumn", idColumn); } - // Add baseConfig - ObjectNode baseConfig = OBJECT_MAPPER.createObjectNode(); - baseConfig.put("fetcherId", fetcherName); - baseConfig.put("emitterId", emitterName); - jsonConfig.set("baseConfig", baseConfig); + // Add fetcherId and emitterId at root level (not nested in baseConfig) + jsonConfig.put("fetcherId", fetcherName); + jsonConfig.put("emitterId", emitterName); ExtensionConfig extensionConfig = new ExtensionConfig("test-csv-iterator", "csv-pipes-iterator", OBJECT_MAPPER.writeValueAsString(jsonConfig)); diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/iterator/fs/FileSystemPipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/iterator/fs/FileSystemPipesIterator.java index 4dedfaf478f..bb4b1fc0dcd 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/iterator/fs/FileSystemPipesIterator.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/iterator/fs/FileSystemPipesIterator.java @@ -34,10 +34,8 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; -import org.apache.tika.pipes.api.HandlerConfig; import org.apache.tika.pipes.api.emitter.EmitKey; import org.apache.tika.pipes.api.fetcher.FetchKey; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig; import org.apache.tika.pipes.api.pipesiterator.TotalCountResult; import org.apache.tika.pipes.api.pipesiterator.TotalCounter; import org.apache.tika.pipes.pipesiterator.PipesIteratorBase; @@ -79,9 +77,8 @@ protected void enqueue() throws InterruptedException, IOException, TimeoutExcept "\"basePath\" directory does not exist: " + config .getBasePath().toAbsolutePath()); } - PipesIteratorBaseConfig config = this.config.getBaseConfig(); try { - Files.walkFileTree(this.config.getBasePath(), new FSFileVisitor(config.fetcherId(), config.emitterId())); + Files.walkFileTree(config.getBasePath(), new FSFileVisitor(config.getFetcherId(), config.getEmitterId())); } catch (IOException e) { Throwable cause = e.getCause(); if (cause != null && cause instanceof TimeoutException) { @@ -139,15 +136,14 @@ public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) @Override public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { - String relPath = config + String relPath = FileSystemPipesIterator.this.config .getBasePath().relativize(file).toString(); - PipesIteratorBaseConfig config = FileSystemPipesIterator.this.config.getBaseConfig(); try { ParseContext parseContext = new ParseContext(); - parseContext.set(HandlerConfig.class, config.handlerConfig()); + // ContentHandlerFactory, ParseMode, and onParseException come from PipesConfig loaded via TikaLoader tryToAdd(new FetchEmitTuple(relPath, new FetchKey(fetcherId, relPath), new EmitKey(emitterId, relPath), new Metadata(), parseContext, - config.onParseException())); + FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT)); } catch (TimeoutException e) { throw new IOException(e); } catch (InterruptedException e) { diff --git 
a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/iterator/fs/FileSystemPipesIteratorConfig.java b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/iterator/fs/FileSystemPipesIteratorConfig.java index 0648620fc4c..61eeeb66a65 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/iterator/fs/FileSystemPipesIteratorConfig.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/iterator/fs/FileSystemPipesIteratorConfig.java @@ -23,10 +23,9 @@ import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.tika.exception.TikaConfigException; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorConfig; +import org.apache.tika.pipes.pipesiterator.PipesIteratorConfig; -public class FileSystemPipesIteratorConfig implements PipesIteratorConfig { +public class FileSystemPipesIteratorConfig extends PipesIteratorConfig { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); @@ -44,7 +43,6 @@ public static FileSystemPipesIteratorConfig load(final String json) private Path basePath = null; private boolean countTotal = true; - private PipesIteratorBaseConfig baseConfig = null; public Path getBasePath() { return basePath; @@ -55,24 +53,21 @@ public boolean isCountTotal() { } @Override - public PipesIteratorBaseConfig getBaseConfig() { - return baseConfig; - } - - @Override - public final boolean equals(Object o) { + public boolean equals(Object o) { if (!(o instanceof FileSystemPipesIteratorConfig that)) { return false; } - - return countTotal == that.countTotal && Objects.equals(basePath, that.basePath) && Objects.equals(baseConfig, that.baseConfig); + if (!super.equals(o)) { + return false; + } + return countTotal == that.countTotal && Objects.equals(basePath, that.basePath); } @Override public int hashCode() { - int result = Objects.hashCode(basePath); + int result = super.hashCode(); + result = 31 * result + Objects.hashCode(basePath); result = 31 * result + Boolean.hashCode(countTotal); - result = 31 * result + Objects.hashCode(baseConfig); return result; } } diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/main/java/org/apache/tika/pipes/iterator/gcs/GCSPipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/main/java/org/apache/tika/pipes/iterator/gcs/GCSPipesIterator.java index f25fd696af1..0b64c18812c 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/main/java/org/apache/tika/pipes/iterator/gcs/GCSPipesIterator.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/main/java/org/apache/tika/pipes/iterator/gcs/GCSPipesIterator.java @@ -30,10 +30,8 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; -import org.apache.tika.pipes.api.HandlerConfig; import org.apache.tika.pipes.api.emitter.EmitKey; import org.apache.tika.pipes.api.fetcher.FetchKey; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig; import org.apache.tika.pipes.pipesiterator.PipesIteratorBase; import org.apache.tika.plugins.ExtensionConfig; import org.apache.tika.utils.StringUtils; @@ -71,12 +69,10 @@ public static GCSPipesIterator build(ExtensionConfig extensionConfig) throws IOE @Override protected void enqueue() throws InterruptedException, IOException, TimeoutException { - PipesIteratorBaseConfig baseConfig = 
config.getBaseConfig(); - String fetcherPluginId = baseConfig.fetcherId(); - String emitterName = baseConfig.emitterId(); + String fetcherId = config.getFetcherId(); + String emitterId = config.getEmitterId(); long start = System.currentTimeMillis(); int count = 0; - HandlerConfig handlerConfig = baseConfig.handlerConfig(); Page blobs = null; String prefix = config.getPrefix(); @@ -96,9 +92,8 @@ protected void enqueue() throws InterruptedException, IOException, TimeoutExcept LOGGER.debug("adding ({}) {} in {} ms", count, blob.getName(), elapsed); //TODO -- allow user specified metadata as the "id"? ParseContext parseContext = new ParseContext(); - parseContext.set(HandlerConfig.class, handlerConfig); - tryToAdd(new FetchEmitTuple(blob.getName(), new FetchKey(fetcherPluginId, blob.getName()), new EmitKey(emitterName, blob.getName()), new Metadata(), parseContext, - baseConfig.onParseException())); + tryToAdd(new FetchEmitTuple(blob.getName(), new FetchKey(fetcherId, blob.getName()), new EmitKey(emitterId, blob.getName()), new Metadata(), parseContext, + FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT)); count++; } long elapsed = System.currentTimeMillis() - start; diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/main/java/org/apache/tika/pipes/iterator/gcs/GCSPipesIteratorConfig.java b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/main/java/org/apache/tika/pipes/iterator/gcs/GCSPipesIteratorConfig.java index f4c4f1690e3..d87fea102a1 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/main/java/org/apache/tika/pipes/iterator/gcs/GCSPipesIteratorConfig.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/main/java/org/apache/tika/pipes/iterator/gcs/GCSPipesIteratorConfig.java @@ -22,10 +22,9 @@ import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.tika.exception.TikaConfigException; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorConfig; +import org.apache.tika.pipes.pipesiterator.PipesIteratorConfig; -public class GCSPipesIteratorConfig implements PipesIteratorConfig { +public class GCSPipesIteratorConfig extends PipesIteratorConfig { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); @@ -42,7 +41,6 @@ public static GCSPipesIteratorConfig load(final String json) private String bucket; private String prefix = ""; private String projectId = ""; - private PipesIteratorBaseConfig baseConfig = null; public String getBucket() { return bucket; @@ -57,28 +55,24 @@ public String getProjectId() { } @Override - public PipesIteratorBaseConfig getBaseConfig() { - return baseConfig; - } - - @Override - public final boolean equals(Object o) { + public boolean equals(Object o) { if (!(o instanceof GCSPipesIteratorConfig that)) { return false; } - + if (!super.equals(o)) { + return false; + } return Objects.equals(bucket, that.bucket) && Objects.equals(prefix, that.prefix) && - Objects.equals(projectId, that.projectId) && - Objects.equals(baseConfig, that.baseConfig); + Objects.equals(projectId, that.projectId); } @Override public int hashCode() { - int result = Objects.hashCode(bucket); + int result = super.hashCode(); + result = 31 * result + Objects.hashCode(bucket); result = 31 * result + Objects.hashCode(prefix); result = 31 * result + Objects.hashCode(projectId); - result = 31 * result + Objects.hashCode(baseConfig); return result; } } diff --git 
a/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/java/org/apache/tika/pipes/iterator/gcs/TestGCSPipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/java/org/apache/tika/pipes/iterator/gcs/TestGCSPipesIterator.java index a8f2310b71e..3af3c4342c2 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/java/org/apache/tika/pipes/iterator/gcs/TestGCSPipesIterator.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/java/org/apache/tika/pipes/iterator/gcs/TestGCSPipesIterator.java @@ -86,11 +86,9 @@ private GCSPipesIterator createIterator(String bucket, String projectId, String jsonConfig.put("prefix", prefix); } - // Add baseConfig - ObjectNode baseConfig = OBJECT_MAPPER.createObjectNode(); - baseConfig.put("fetcherId", fetcherName); - baseConfig.put("emitterId", emitterName); - jsonConfig.set("baseConfig", baseConfig); + // Add fetcherId and emitterId at root level (not nested in baseConfig) + jsonConfig.put("fetcherId", fetcherName); + jsonConfig.put("emitterId", emitterName); ExtensionConfig extensionConfig = new ExtensionConfig("test-gcs-iterator", "gcs-pipes-iterator", OBJECT_MAPPER.writeValueAsString(jsonConfig)); diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/main/java/org/apache/tika/pipes/iterator/jdbc/JDBCPipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/main/java/org/apache/tika/pipes/iterator/jdbc/JDBCPipesIterator.java index fbf86f4fe63..be0fccfdfa1 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/main/java/org/apache/tika/pipes/iterator/jdbc/JDBCPipesIterator.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/main/java/org/apache/tika/pipes/iterator/jdbc/JDBCPipesIterator.java @@ -34,10 +34,8 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; -import org.apache.tika.pipes.api.HandlerConfig; import org.apache.tika.pipes.api.emitter.EmitKey; import org.apache.tika.pipes.api.fetcher.FetchKey; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig; import org.apache.tika.pipes.pipesiterator.PipesIteratorBase; import org.apache.tika.plugins.ExtensionConfig; import org.apache.tika.utils.StringUtils; @@ -78,16 +76,15 @@ private JDBCPipesIterator(JDBCPipesIteratorConfig config, ExtensionConfig extens throw new TikaConfigException("select must not be empty"); } - PipesIteratorBaseConfig baseConfig = config.getBaseConfig(); - String fetcherName = baseConfig.fetcherId(); - String emitterName = baseConfig.emitterId(); + String fetcherName = config.getFetcherId(); + String emitterName = config.getEmitterId(); if (StringUtils.isBlank(fetcherName) && !StringUtils.isBlank(config.getFetchKeyColumn())) { - throw new TikaConfigException("If you specify a 'fetchKeyColumn', you must specify a 'fetcherPluginId'"); + throw new TikaConfigException("If you specify a 'fetchKeyColumn', you must specify a 'fetcherId'"); } if (StringUtils.isBlank(emitterName) && !StringUtils.isBlank(config.getEmitKeyColumn())) { - throw new TikaConfigException("If you specify an 'emitKeyColumn', you must specify an 'emitterPluginId'"); + throw new TikaConfigException("If you specify an 'emitKeyColumn', you must specify an 'emitterId'"); } if (StringUtils.isBlank(emitterName) && StringUtils.isBlank(fetcherName)) { @@ -120,13 +117,11 @@ public static JDBCPipesIterator build(ExtensionConfig extensionConfig) throws IO @Override protected void enqueue() throws InterruptedException, IOException, TimeoutException { - 
PipesIteratorBaseConfig baseConfig = config.getBaseConfig(); - String fetcherPluginId = baseConfig.fetcherId(); - String emitterName = baseConfig.emitterId(); + String fetcherId = config.getFetcherId(); + String emitterId = config.getEmitterId(); FetchEmitKeyIndices fetchEmitKeyIndices = null; List headers = new ArrayList<>(); int rowCount = 0; - HandlerConfig handlerConfig = baseConfig.handlerConfig(); LOGGER.debug("select: {}", config.getSelect()); try (Statement st = db.createStatement()) { if (config.getFetchSize() > 0) { @@ -139,10 +134,10 @@ protected void enqueue() throws InterruptedException, IOException, TimeoutExcept while (rs.next()) { if (headers.size() == 0) { fetchEmitKeyIndices = loadHeaders(rs.getMetaData(), headers); - checkFetchEmitValidity(fetcherPluginId, emitterName, fetchEmitKeyIndices, headers); + checkFetchEmitValidity(fetcherId, emitterId, fetchEmitKeyIndices, headers); } try { - processRow(fetcherPluginId, emitterName, headers, fetchEmitKeyIndices, rs, handlerConfig, baseConfig); + processRow(fetcherId, emitterId, headers, fetchEmitKeyIndices, rs); } catch (SQLException e) { LOGGER.warn("Failed to insert: " + rs, e); } @@ -164,7 +159,7 @@ protected void enqueue() throws InterruptedException, IOException, TimeoutExcept } } - private void checkFetchEmitValidity(String fetcherPluginId, String emitterName, FetchEmitKeyIndices fetchEmitKeyIndices, List headers) throws IOException { + private void checkFetchEmitValidity(String fetcherId, String emitterId, FetchEmitKeyIndices fetchEmitKeyIndices, List headers) throws IOException { if (!StringUtils.isBlank(config.getFetchKeyColumn()) && fetchEmitKeyIndices.fetchKeyIndex < 0) { throw new IOException(new TikaConfigException("Couldn't find fetchkey column: " + config.getFetchKeyColumn())); } @@ -180,9 +175,8 @@ private void checkFetchEmitValidity(String fetcherPluginId, String emitterName, } } - private void processRow(String fetcherPluginId, String emitterName, List headers, - FetchEmitKeyIndices fetchEmitKeyIndices, ResultSet rs, - HandlerConfig handlerConfig, PipesIteratorBaseConfig baseConfig) + private void processRow(String fetcherId, String emitterId, List headers, + FetchEmitKeyIndices fetchEmitKeyIndices, ResultSet rs) throws SQLException, TimeoutException, InterruptedException { Metadata metadata = new Metadata(); String fetchKey = ""; @@ -233,9 +227,8 @@ private void processRow(String fetcherPluginId, String emitterName, List } } ParseContext parseContext = new ParseContext(); - parseContext.set(HandlerConfig.class, handlerConfig); - tryToAdd(new FetchEmitTuple(id, new FetchKey(fetcherPluginId, fetchKey, fetchStartRange, fetchEndRange), new EmitKey(emitterName, emitKey), metadata, parseContext, - baseConfig.onParseException())); + tryToAdd(new FetchEmitTuple(id, new FetchKey(fetcherId, fetchKey, fetchStartRange, fetchEndRange), new EmitKey(emitterId, emitKey), metadata, parseContext, + FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT)); } private String toString(ResultSet rs) throws SQLException { diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/main/java/org/apache/tika/pipes/iterator/jdbc/JDBCPipesIteratorConfig.java b/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/main/java/org/apache/tika/pipes/iterator/jdbc/JDBCPipesIteratorConfig.java index 5cdfa0a7076..ff6b68d229e 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/main/java/org/apache/tika/pipes/iterator/jdbc/JDBCPipesIteratorConfig.java +++ 
b/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/main/java/org/apache/tika/pipes/iterator/jdbc/JDBCPipesIteratorConfig.java @@ -22,10 +22,9 @@ import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.tika.exception.TikaConfigException; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorConfig; +import org.apache.tika.pipes.pipesiterator.PipesIteratorConfig; -public class JDBCPipesIteratorConfig implements PipesIteratorConfig { +public class JDBCPipesIteratorConfig extends PipesIteratorConfig { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); @@ -49,7 +48,6 @@ public static JDBCPipesIteratorConfig load(final String json) private String select; private int fetchSize = -1; private int queryTimeoutSeconds = -1; - private PipesIteratorBaseConfig baseConfig = null; public String getIdColumn() { return idColumn; @@ -88,16 +86,13 @@ public int getQueryTimeoutSeconds() { } @Override - public PipesIteratorBaseConfig getBaseConfig() { - return baseConfig; - } - - @Override - public final boolean equals(Object o) { + public boolean equals(Object o) { if (!(o instanceof JDBCPipesIteratorConfig that)) { return false; } - + if (!super.equals(o)) { + return false; + } return fetchSize == that.fetchSize && queryTimeoutSeconds == that.queryTimeoutSeconds && Objects.equals(idColumn, that.idColumn) && @@ -106,13 +101,13 @@ public final boolean equals(Object o) { Objects.equals(fetchKeyRangeEndColumn, that.fetchKeyRangeEndColumn) && Objects.equals(emitKeyColumn, that.emitKeyColumn) && Objects.equals(connection, that.connection) && - Objects.equals(select, that.select) && - Objects.equals(baseConfig, that.baseConfig); + Objects.equals(select, that.select); } @Override public int hashCode() { - int result = Objects.hashCode(idColumn); + int result = super.hashCode(); + result = 31 * result + Objects.hashCode(idColumn); result = 31 * result + Objects.hashCode(fetchKeyColumn); result = 31 * result + Objects.hashCode(fetchKeyRangeStartColumn); result = 31 * result + Objects.hashCode(fetchKeyRangeEndColumn); @@ -121,7 +116,6 @@ public int hashCode() { result = 31 * result + Objects.hashCode(select); result = 31 * result + fetchSize; result = 31 * result + queryTimeoutSeconds; - result = 31 * result + Objects.hashCode(baseConfig); return result; } } diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/java/org/apache/tika/pipes/iterator/jdbc/TestJDBCPipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/java/org/apache/tika/pipes/iterator/jdbc/TestJDBCPipesIterator.java index 30aac2d57f8..ce7c5989433 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/java/org/apache/tika/pipes/iterator/jdbc/TestJDBCPipesIterator.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/java/org/apache/tika/pipes/iterator/jdbc/TestJDBCPipesIterator.java @@ -163,12 +163,9 @@ private JDBCPipesIterator createIterator() throws Exception { jsonConfig.put("fetchKeyColumn", "my_fetchkey"); jsonConfig.put("emitKeyColumn", "my_fetchkey"); - // Add baseConfig - ObjectNode baseConfig = OBJECT_MAPPER.createObjectNode(); - baseConfig.put("fetcherId", "s3f"); - baseConfig.put("emitterId", "s3e"); - baseConfig.put("queueSize", 57); - jsonConfig.set("baseConfig", baseConfig); + // Add fetcherId and emitterId at root level (not nested in baseConfig) + jsonConfig.put("fetcherId", "s3f"); + jsonConfig.put("emitterId", "s3e"); ExtensionConfig extensionConfig = new 
ExtensionConfig("test-jdbc-iterator", "jdbc-pipes-iterator", OBJECT_MAPPER.writeValueAsString(jsonConfig)); diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/main/java/org/apache/tika/pipes/pipesiterator/json/JsonPipesIteratorConfig.java b/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/main/java/org/apache/tika/pipes/pipesiterator/json/JsonPipesIteratorConfig.java index a9942a625ca..c3f6f53924b 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/main/java/org/apache/tika/pipes/pipesiterator/json/JsonPipesIteratorConfig.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/main/java/org/apache/tika/pipes/pipesiterator/json/JsonPipesIteratorConfig.java @@ -23,10 +23,9 @@ import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.tika.exception.TikaConfigException; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorConfig; +import org.apache.tika.pipes.pipesiterator.PipesIteratorConfig; -public class JsonPipesIteratorConfig implements PipesIteratorConfig { +public class JsonPipesIteratorConfig extends PipesIteratorConfig { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); @@ -42,31 +41,26 @@ public static JsonPipesIteratorConfig load(final String json) } private Path jsonPath; - private PipesIteratorBaseConfig baseConfig = null; public Path getJsonPath() { return jsonPath; } @Override - public PipesIteratorBaseConfig getBaseConfig() { - return baseConfig; - } - - @Override - public final boolean equals(Object o) { + public boolean equals(Object o) { if (!(o instanceof JsonPipesIteratorConfig that)) { return false; } - - return Objects.equals(jsonPath, that.jsonPath) && - Objects.equals(baseConfig, that.baseConfig); + if (!super.equals(o)) { + return false; + } + return Objects.equals(jsonPath, that.jsonPath); } @Override public int hashCode() { - int result = Objects.hashCode(jsonPath); - result = 31 * result + Objects.hashCode(baseConfig); + int result = super.hashCode(); + result = 31 * result + Objects.hashCode(jsonPath); return result; } } diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/java/org/apache/tika/pipes/pipesiterator/json/TestJsonPipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/java/org/apache/tika/pipes/pipesiterator/json/TestJsonPipesIterator.java index 4211387888f..53aca506ffc 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/java/org/apache/tika/pipes/pipesiterator/json/TestJsonPipesIterator.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/java/org/apache/tika/pipes/pipesiterator/json/TestJsonPipesIterator.java @@ -71,34 +71,4 @@ private JsonPipesIterator createIterator(Path jsonPath) throws Exception { OBJECT_MAPPER.writeValueAsString(jsonConfig)); return JsonPipesIterator.build(extensionConfig); } - - - /* - //use this to generate test files - public static void main(String[] args) throws Exception { - Path p = Paths.get("/home/tallison/Intellij/tika-main/tika-pipes/tika-pipes-iterators" + - "/tika-pipes-iterator-json/src/test/resources/test-documents/test-with-embedded" + - "-bytes.json"); - try (BufferedWriter writer = Files.newBufferedWriter(p, StandardCharsets.UTF_8)) { - HandlerConfig handlerConfig = - new HandlerConfig(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, - HandlerConfig.PARSE_MODE.RMETA, -1, -1, - false); - EmbeddedDocumentBytesConfig config = new EmbeddedDocumentBytesConfig(true); - for (int i = 0; i < 100; i++) { - String id = 
"myid-"+i; - FetchEmitTuple t = new FetchEmitTuple( - id, - new FetchKey("fs", i + ".xml"), - new EmitKey("fs", i + ".xml.json"), - new Metadata(), - handlerConfig, - FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT, - config); - String line = JsonFetchEmitTuple.toJson(t); - writer.write(line); - writer.newLine(); - } - } - }*/ } diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/resources/test-documents/test-with-embedded-bytes.json b/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/resources/test-documents/test-with-embedded-bytes.json index 74883069062..daef89edaa6 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/resources/test-documents/test-with-embedded-bytes.json +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/resources/test-documents/test-with-embedded-bytes.json @@ -4,12 +4,6 @@ "fetchKey": "0.xml", "emitter": "fs", "emitKey": "0.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -25,12 +19,6 @@ "fetchKey": "1.xml", "emitter": "fs", "emitKey": "1.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -46,12 +34,6 @@ "fetchKey": "2.xml", "emitter": "fs", "emitKey": "2.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -67,12 +49,6 @@ "fetchKey": "3.xml", "emitter": "fs", "emitKey": "3.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -88,12 +64,6 @@ "fetchKey": "4.xml", "emitter": "fs", "emitKey": "4.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -109,12 +79,6 @@ "fetchKey": "5.xml", "emitter": "fs", "emitKey": "5.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -130,12 +94,6 @@ "fetchKey": "6.xml", "emitter": "fs", "emitKey": "6.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -151,12 +109,6 @@ "fetchKey": "7.xml", "emitter": "fs", "emitKey": "7.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -172,12 +124,6 @@ "fetchKey": "8.xml", "emitter": "fs", "emitKey": "8.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -193,12 +139,6 @@ "fetchKey": "9.xml", 
"emitter": "fs", "emitKey": "9.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -214,12 +154,6 @@ "fetchKey": "10.xml", "emitter": "fs", "emitKey": "10.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -235,12 +169,6 @@ "fetchKey": "11.xml", "emitter": "fs", "emitKey": "11.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -256,12 +184,6 @@ "fetchKey": "12.xml", "emitter": "fs", "emitKey": "12.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -277,12 +199,6 @@ "fetchKey": "13.xml", "emitter": "fs", "emitKey": "13.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -298,12 +214,6 @@ "fetchKey": "14.xml", "emitter": "fs", "emitKey": "14.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -319,12 +229,6 @@ "fetchKey": "15.xml", "emitter": "fs", "emitKey": "15.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -340,12 +244,6 @@ "fetchKey": "16.xml", "emitter": "fs", "emitKey": "16.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -361,12 +259,6 @@ "fetchKey": "17.xml", "emitter": "fs", "emitKey": "17.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -382,12 +274,6 @@ "fetchKey": "18.xml", "emitter": "fs", "emitKey": "18.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -403,12 +289,6 @@ "fetchKey": "19.xml", "emitter": "fs", "emitKey": "19.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -424,12 +304,6 @@ "fetchKey": "20.xml", "emitter": "fs", "emitKey": "20.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { 
"extractEmbeddedDocumentBytes": true, @@ -445,12 +319,6 @@ "fetchKey": "21.xml", "emitter": "fs", "emitKey": "21.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -466,12 +334,6 @@ "fetchKey": "22.xml", "emitter": "fs", "emitKey": "22.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -487,12 +349,6 @@ "fetchKey": "23.xml", "emitter": "fs", "emitKey": "23.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -508,12 +364,6 @@ "fetchKey": "24.xml", "emitter": "fs", "emitKey": "24.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -529,12 +379,6 @@ "fetchKey": "25.xml", "emitter": "fs", "emitKey": "25.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -550,12 +394,6 @@ "fetchKey": "26.xml", "emitter": "fs", "emitKey": "26.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -571,12 +409,6 @@ "fetchKey": "27.xml", "emitter": "fs", "emitKey": "27.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -592,12 +424,6 @@ "fetchKey": "28.xml", "emitter": "fs", "emitKey": "28.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -613,12 +439,6 @@ "fetchKey": "29.xml", "emitter": "fs", "emitKey": "29.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -634,12 +454,6 @@ "fetchKey": "30.xml", "emitter": "fs", "emitKey": "30.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -655,12 +469,6 @@ "fetchKey": "31.xml", "emitter": "fs", "emitKey": "31.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -676,12 +484,6 @@ "fetchKey": "32.xml", "emitter": "fs", "emitKey": "32.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": 
-1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -697,12 +499,6 @@ "fetchKey": "33.xml", "emitter": "fs", "emitKey": "33.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -718,12 +514,6 @@ "fetchKey": "34.xml", "emitter": "fs", "emitKey": "34.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -739,12 +529,6 @@ "fetchKey": "35.xml", "emitter": "fs", "emitKey": "35.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -760,12 +544,6 @@ "fetchKey": "36.xml", "emitter": "fs", "emitKey": "36.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -781,12 +559,6 @@ "fetchKey": "37.xml", "emitter": "fs", "emitKey": "37.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -802,12 +574,6 @@ "fetchKey": "38.xml", "emitter": "fs", "emitKey": "38.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -823,12 +589,6 @@ "fetchKey": "39.xml", "emitter": "fs", "emitKey": "39.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -844,12 +604,6 @@ "fetchKey": "40.xml", "emitter": "fs", "emitKey": "40.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -865,12 +619,6 @@ "fetchKey": "41.xml", "emitter": "fs", "emitKey": "41.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -886,12 +634,6 @@ "fetchKey": "42.xml", "emitter": "fs", "emitKey": "42.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -907,12 +649,6 @@ "fetchKey": "43.xml", "emitter": "fs", "emitKey": "43.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -928,12 +664,6 @@ "fetchKey": "44.xml", "emitter": "fs", "emitKey": "44.xml.json", - "handlerConfig": { - "type": "text", 
- "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -949,12 +679,6 @@ "fetchKey": "45.xml", "emitter": "fs", "emitKey": "45.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -970,12 +694,6 @@ "fetchKey": "46.xml", "emitter": "fs", "emitKey": "46.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -991,12 +709,6 @@ "fetchKey": "47.xml", "emitter": "fs", "emitKey": "47.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1012,12 +724,6 @@ "fetchKey": "48.xml", "emitter": "fs", "emitKey": "48.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1033,12 +739,6 @@ "fetchKey": "49.xml", "emitter": "fs", "emitKey": "49.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1054,12 +754,6 @@ "fetchKey": "50.xml", "emitter": "fs", "emitKey": "50.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1075,12 +769,6 @@ "fetchKey": "51.xml", "emitter": "fs", "emitKey": "51.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1096,12 +784,6 @@ "fetchKey": "52.xml", "emitter": "fs", "emitKey": "52.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1117,12 +799,6 @@ "fetchKey": "53.xml", "emitter": "fs", "emitKey": "53.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1138,12 +814,6 @@ "fetchKey": "54.xml", "emitter": "fs", "emitKey": "54.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1159,12 +829,6 @@ "fetchKey": "55.xml", "emitter": "fs", "emitKey": "55.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1180,12 +844,6 @@ "fetchKey": "56.xml", 
"emitter": "fs", "emitKey": "56.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1201,12 +859,6 @@ "fetchKey": "57.xml", "emitter": "fs", "emitKey": "57.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1222,12 +874,6 @@ "fetchKey": "58.xml", "emitter": "fs", "emitKey": "58.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1243,12 +889,6 @@ "fetchKey": "59.xml", "emitter": "fs", "emitKey": "59.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1264,12 +904,6 @@ "fetchKey": "60.xml", "emitter": "fs", "emitKey": "60.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1285,12 +919,6 @@ "fetchKey": "61.xml", "emitter": "fs", "emitKey": "61.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1306,12 +934,6 @@ "fetchKey": "62.xml", "emitter": "fs", "emitKey": "62.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1327,12 +949,6 @@ "fetchKey": "63.xml", "emitter": "fs", "emitKey": "63.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1348,12 +964,6 @@ "fetchKey": "64.xml", "emitter": "fs", "emitKey": "64.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1369,12 +979,6 @@ "fetchKey": "65.xml", "emitter": "fs", "emitKey": "65.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1390,12 +994,6 @@ "fetchKey": "66.xml", "emitter": "fs", "emitKey": "66.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1411,12 +1009,6 @@ "fetchKey": "67.xml", "emitter": "fs", "emitKey": "67.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { 
"extractEmbeddedDocumentBytes": true, @@ -1432,12 +1024,6 @@ "fetchKey": "68.xml", "emitter": "fs", "emitKey": "68.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1453,12 +1039,6 @@ "fetchKey": "69.xml", "emitter": "fs", "emitKey": "69.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1474,12 +1054,6 @@ "fetchKey": "70.xml", "emitter": "fs", "emitKey": "70.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1495,12 +1069,6 @@ "fetchKey": "71.xml", "emitter": "fs", "emitKey": "71.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1516,12 +1084,6 @@ "fetchKey": "72.xml", "emitter": "fs", "emitKey": "72.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1537,12 +1099,6 @@ "fetchKey": "73.xml", "emitter": "fs", "emitKey": "73.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1558,12 +1114,6 @@ "fetchKey": "74.xml", "emitter": "fs", "emitKey": "74.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1579,12 +1129,6 @@ "fetchKey": "75.xml", "emitter": "fs", "emitKey": "75.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1600,12 +1144,6 @@ "fetchKey": "76.xml", "emitter": "fs", "emitKey": "76.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1621,12 +1159,6 @@ "fetchKey": "77.xml", "emitter": "fs", "emitKey": "77.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1642,12 +1174,6 @@ "fetchKey": "78.xml", "emitter": "fs", "emitKey": "78.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1663,12 +1189,6 @@ "fetchKey": "79.xml", "emitter": "fs", "emitKey": "79.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - 
"maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1684,12 +1204,6 @@ "fetchKey": "80.xml", "emitter": "fs", "emitKey": "80.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1705,12 +1219,6 @@ "fetchKey": "81.xml", "emitter": "fs", "emitKey": "81.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1726,12 +1234,6 @@ "fetchKey": "82.xml", "emitter": "fs", "emitKey": "82.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1747,12 +1249,6 @@ "fetchKey": "83.xml", "emitter": "fs", "emitKey": "83.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1768,12 +1264,6 @@ "fetchKey": "84.xml", "emitter": "fs", "emitKey": "84.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1789,12 +1279,6 @@ "fetchKey": "85.xml", "emitter": "fs", "emitKey": "85.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1810,12 +1294,6 @@ "fetchKey": "86.xml", "emitter": "fs", "emitKey": "86.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1831,12 +1309,6 @@ "fetchKey": "87.xml", "emitter": "fs", "emitKey": "87.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1852,12 +1324,6 @@ "fetchKey": "88.xml", "emitter": "fs", "emitKey": "88.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1873,12 +1339,6 @@ "fetchKey": "89.xml", "emitter": "fs", "emitKey": "89.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1894,12 +1354,6 @@ "fetchKey": "90.xml", "emitter": "fs", "emitKey": "90.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1915,12 +1369,6 @@ "fetchKey": "91.xml", "emitter": "fs", "emitKey": 
"91.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1936,12 +1384,6 @@ "fetchKey": "92.xml", "emitter": "fs", "emitKey": "92.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1957,12 +1399,6 @@ "fetchKey": "93.xml", "emitter": "fs", "emitKey": "93.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1978,12 +1414,6 @@ "fetchKey": "94.xml", "emitter": "fs", "emitKey": "94.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -1999,12 +1429,6 @@ "fetchKey": "95.xml", "emitter": "fs", "emitKey": "95.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -2020,12 +1444,6 @@ "fetchKey": "96.xml", "emitter": "fs", "emitKey": "96.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -2041,12 +1459,6 @@ "fetchKey": "97.xml", "emitter": "fs", "emitKey": "97.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -2062,12 +1474,6 @@ "fetchKey": "98.xml", "emitter": "fs", "emitKey": "98.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, @@ -2083,12 +1489,6 @@ "fetchKey": "99.xml", "emitter": "fs", "emitKey": "99.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit", "embeddedDocumentBytesConfig": { "extractEmbeddedDocumentBytes": true, diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/resources/test-documents/test.json b/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/resources/test-documents/test.json index 721410fd3a8..e5199c6cbac 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/resources/test-documents/test.json +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/resources/test-documents/test.json @@ -4,12 +4,6 @@ "fetchKey": "0.xml", "emitter": "fs", "emitKey": "0.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -18,12 +12,6 @@ "fetchKey": "1.xml", "emitter": "fs", "emitKey": "1.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ 
-32,12 +20,6 @@ "fetchKey": "2.xml", "emitter": "fs", "emitKey": "2.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -46,12 +28,6 @@ "fetchKey": "3.xml", "emitter": "fs", "emitKey": "3.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -60,12 +36,6 @@ "fetchKey": "4.xml", "emitter": "fs", "emitKey": "4.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -74,12 +44,6 @@ "fetchKey": "5.xml", "emitter": "fs", "emitKey": "5.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -88,12 +52,6 @@ "fetchKey": "6.xml", "emitter": "fs", "emitKey": "6.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -102,12 +60,6 @@ "fetchKey": "7.xml", "emitter": "fs", "emitKey": "7.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -116,12 +68,6 @@ "fetchKey": "8.xml", "emitter": "fs", "emitKey": "8.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -130,12 +76,6 @@ "fetchKey": "9.xml", "emitter": "fs", "emitKey": "9.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -144,12 +84,6 @@ "fetchKey": "10.xml", "emitter": "fs", "emitKey": "10.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -158,12 +92,6 @@ "fetchKey": "11.xml", "emitter": "fs", "emitKey": "11.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -172,12 +100,6 @@ "fetchKey": "12.xml", "emitter": "fs", "emitKey": "12.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -186,12 +108,6 @@ "fetchKey": "13.xml", "emitter": "fs", "emitKey": "13.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -200,12 +116,6 @@ "fetchKey": "14.xml", "emitter": "fs", "emitKey": "14.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -214,12 +124,6 @@ "fetchKey": "15.xml", "emitter": "fs", "emitKey": "15.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -228,12 +132,6 @@ "fetchKey": "16.xml", "emitter": "fs", "emitKey": "16.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -242,12 +140,6 @@ "fetchKey": "17.xml", "emitter": "fs", "emitKey": 
"17.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -256,12 +148,6 @@ "fetchKey": "18.xml", "emitter": "fs", "emitKey": "18.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -270,12 +156,6 @@ "fetchKey": "19.xml", "emitter": "fs", "emitKey": "19.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -284,12 +164,6 @@ "fetchKey": "20.xml", "emitter": "fs", "emitKey": "20.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -298,12 +172,6 @@ "fetchKey": "21.xml", "emitter": "fs", "emitKey": "21.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -312,12 +180,6 @@ "fetchKey": "22.xml", "emitter": "fs", "emitKey": "22.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -326,12 +188,6 @@ "fetchKey": "23.xml", "emitter": "fs", "emitKey": "23.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -340,12 +196,6 @@ "fetchKey": "24.xml", "emitter": "fs", "emitKey": "24.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -354,12 +204,6 @@ "fetchKey": "25.xml", "emitter": "fs", "emitKey": "25.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -368,12 +212,6 @@ "fetchKey": "26.xml", "emitter": "fs", "emitKey": "26.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -382,12 +220,6 @@ "fetchKey": "27.xml", "emitter": "fs", "emitKey": "27.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -396,12 +228,6 @@ "fetchKey": "28.xml", "emitter": "fs", "emitKey": "28.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -410,12 +236,6 @@ "fetchKey": "29.xml", "emitter": "fs", "emitKey": "29.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -424,12 +244,6 @@ "fetchKey": "30.xml", "emitter": "fs", "emitKey": "30.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -438,12 +252,6 @@ "fetchKey": "31.xml", "emitter": "fs", "emitKey": "31.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -452,12 +260,6 @@ "fetchKey": "32.xml", "emitter": "fs", "emitKey": "32.xml.json", - "handlerConfig": { - 
"type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -466,12 +268,6 @@ "fetchKey": "33.xml", "emitter": "fs", "emitKey": "33.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -480,12 +276,6 @@ "fetchKey": "34.xml", "emitter": "fs", "emitKey": "34.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -494,12 +284,6 @@ "fetchKey": "35.xml", "emitter": "fs", "emitKey": "35.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -508,12 +292,6 @@ "fetchKey": "36.xml", "emitter": "fs", "emitKey": "36.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -522,12 +300,6 @@ "fetchKey": "37.xml", "emitter": "fs", "emitKey": "37.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -536,12 +308,6 @@ "fetchKey": "38.xml", "emitter": "fs", "emitKey": "38.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -550,12 +316,6 @@ "fetchKey": "39.xml", "emitter": "fs", "emitKey": "39.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -564,12 +324,6 @@ "fetchKey": "40.xml", "emitter": "fs", "emitKey": "40.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -578,12 +332,6 @@ "fetchKey": "41.xml", "emitter": "fs", "emitKey": "41.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -592,12 +340,6 @@ "fetchKey": "42.xml", "emitter": "fs", "emitKey": "42.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -606,12 +348,6 @@ "fetchKey": "43.xml", "emitter": "fs", "emitKey": "43.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -620,12 +356,6 @@ "fetchKey": "44.xml", "emitter": "fs", "emitKey": "44.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -634,12 +364,6 @@ "fetchKey": "45.xml", "emitter": "fs", "emitKey": "45.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -648,12 +372,6 @@ "fetchKey": "46.xml", "emitter": "fs", "emitKey": "46.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -662,12 +380,6 @@ "fetchKey": "47.xml", "emitter": "fs", "emitKey": "47.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - 
"writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -676,12 +388,6 @@ "fetchKey": "48.xml", "emitter": "fs", "emitKey": "48.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -690,12 +396,6 @@ "fetchKey": "49.xml", "emitter": "fs", "emitKey": "49.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -704,12 +404,6 @@ "fetchKey": "50.xml", "emitter": "fs", "emitKey": "50.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -718,12 +412,6 @@ "fetchKey": "51.xml", "emitter": "fs", "emitKey": "51.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -732,12 +420,6 @@ "fetchKey": "52.xml", "emitter": "fs", "emitKey": "52.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -746,12 +428,6 @@ "fetchKey": "53.xml", "emitter": "fs", "emitKey": "53.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -760,12 +436,6 @@ "fetchKey": "54.xml", "emitter": "fs", "emitKey": "54.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -774,12 +444,6 @@ "fetchKey": "55.xml", "emitter": "fs", "emitKey": "55.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -788,12 +452,6 @@ "fetchKey": "56.xml", "emitter": "fs", "emitKey": "56.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -802,12 +460,6 @@ "fetchKey": "57.xml", "emitter": "fs", "emitKey": "57.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -816,12 +468,6 @@ "fetchKey": "58.xml", "emitter": "fs", "emitKey": "58.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -830,12 +476,6 @@ "fetchKey": "59.xml", "emitter": "fs", "emitKey": "59.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -844,12 +484,6 @@ "fetchKey": "60.xml", "emitter": "fs", "emitKey": "60.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -858,12 +492,6 @@ "fetchKey": "61.xml", "emitter": "fs", "emitKey": "61.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -872,12 +500,6 @@ "fetchKey": "62.xml", "emitter": "fs", "emitKey": "62.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": 
-1 - }, "onParseException": "emit" } { @@ -886,12 +508,6 @@ "fetchKey": "63.xml", "emitter": "fs", "emitKey": "63.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -900,12 +516,6 @@ "fetchKey": "64.xml", "emitter": "fs", "emitKey": "64.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -914,12 +524,6 @@ "fetchKey": "65.xml", "emitter": "fs", "emitKey": "65.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -928,12 +532,6 @@ "fetchKey": "66.xml", "emitter": "fs", "emitKey": "66.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -942,12 +540,6 @@ "fetchKey": "67.xml", "emitter": "fs", "emitKey": "67.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -956,12 +548,6 @@ "fetchKey": "68.xml", "emitter": "fs", "emitKey": "68.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -970,12 +556,6 @@ "fetchKey": "69.xml", "emitter": "fs", "emitKey": "69.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -984,12 +564,6 @@ "fetchKey": "70.xml", "emitter": "fs", "emitKey": "70.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -998,12 +572,6 @@ "fetchKey": "71.xml", "emitter": "fs", "emitKey": "71.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1012,12 +580,6 @@ "fetchKey": "72.xml", "emitter": "fs", "emitKey": "72.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1026,12 +588,6 @@ "fetchKey": "73.xml", "emitter": "fs", "emitKey": "73.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1040,12 +596,6 @@ "fetchKey": "74.xml", "emitter": "fs", "emitKey": "74.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1054,12 +604,6 @@ "fetchKey": "75.xml", "emitter": "fs", "emitKey": "75.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1068,12 +612,6 @@ "fetchKey": "76.xml", "emitter": "fs", "emitKey": "76.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1082,12 +620,6 @@ "fetchKey": "77.xml", "emitter": "fs", "emitKey": "77.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { 
@@ -1096,12 +628,6 @@ "fetchKey": "78.xml", "emitter": "fs", "emitKey": "78.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1110,12 +636,6 @@ "fetchKey": "79.xml", "emitter": "fs", "emitKey": "79.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1124,12 +644,6 @@ "fetchKey": "80.xml", "emitter": "fs", "emitKey": "80.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1138,12 +652,6 @@ "fetchKey": "81.xml", "emitter": "fs", "emitKey": "81.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1152,12 +660,6 @@ "fetchKey": "82.xml", "emitter": "fs", "emitKey": "82.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1166,12 +668,6 @@ "fetchKey": "83.xml", "emitter": "fs", "emitKey": "83.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1180,12 +676,6 @@ "fetchKey": "84.xml", "emitter": "fs", "emitKey": "84.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1194,12 +684,6 @@ "fetchKey": "85.xml", "emitter": "fs", "emitKey": "85.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1208,12 +692,6 @@ "fetchKey": "86.xml", "emitter": "fs", "emitKey": "86.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1222,12 +700,6 @@ "fetchKey": "87.xml", "emitter": "fs", "emitKey": "87.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1236,12 +708,6 @@ "fetchKey": "88.xml", "emitter": "fs", "emitKey": "88.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1250,12 +716,6 @@ "fetchKey": "89.xml", "emitter": "fs", "emitKey": "89.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1264,12 +724,6 @@ "fetchKey": "90.xml", "emitter": "fs", "emitKey": "90.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1278,12 +732,6 @@ "fetchKey": "91.xml", "emitter": "fs", "emitKey": "91.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1292,12 +740,6 @@ "fetchKey": "92.xml", "emitter": "fs", "emitKey": "92.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1306,12 +748,6 @@ 
"fetchKey": "93.xml", "emitter": "fs", "emitKey": "93.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1320,12 +756,6 @@ "fetchKey": "94.xml", "emitter": "fs", "emitKey": "94.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1334,12 +764,6 @@ "fetchKey": "95.xml", "emitter": "fs", "emitKey": "95.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1348,12 +772,6 @@ "fetchKey": "96.xml", "emitter": "fs", "emitKey": "96.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1362,12 +780,6 @@ "fetchKey": "97.xml", "emitter": "fs", "emitKey": "97.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1376,12 +788,6 @@ "fetchKey": "98.xml", "emitter": "fs", "emitKey": "98.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } { @@ -1390,11 +796,5 @@ "fetchKey": "99.xml", "emitter": "fs", "emitKey": "99.xml.json", - "handlerConfig": { - "type": "text", - "parseMode": "rmeta", - "writeLimit": -1, - "maxEmbeddedResources": -1 - }, "onParseException": "emit" } diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/main/java/org/apache/tika/pipes/iterator/kafka/KafkaPipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/main/java/org/apache/tika/pipes/iterator/kafka/KafkaPipesIterator.java index 285bc07188d..ccf9e2037f3 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/main/java/org/apache/tika/pipes/iterator/kafka/KafkaPipesIterator.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/main/java/org/apache/tika/pipes/iterator/kafka/KafkaPipesIterator.java @@ -35,10 +35,8 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; -import org.apache.tika.pipes.api.HandlerConfig; import org.apache.tika.pipes.api.emitter.EmitKey; import org.apache.tika.pipes.api.fetcher.FetchKey; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig; import org.apache.tika.pipes.pipesiterator.PipesIteratorBase; import org.apache.tika.plugins.ExtensionConfig; @@ -99,10 +97,8 @@ private Object serializerClass(String className, Class defaultClass) { @Override protected void enqueue() throws InterruptedException, TimeoutException { - PipesIteratorBaseConfig baseConfig = config.getBaseConfig(); - String fetcherId = baseConfig.fetcherId(); - String emitterId = baseConfig.emitterId(); - HandlerConfig handlerConfig = baseConfig.handlerConfig(); + String fetcherId = config.getFetcherId(); + String emitterId = config.getEmitterId(); long start = System.currentTimeMillis(); int count = 0; @@ -117,10 +113,9 @@ protected void enqueue() throws InterruptedException, TimeoutException { LOGGER.debug("adding ({}) {} in {} ms", count, r.key(), elapsed); } ParseContext parseContext = new ParseContext(); - parseContext.set(HandlerConfig.class, handlerConfig); tryToAdd(new FetchEmitTuple(r.key(), new FetchKey(fetcherId, r.key()), new EmitKey(emitterId, 
r.key()), new Metadata(), parseContext, - baseConfig.onParseException())); + FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT)); ++count; } } while ((emitMax < 0 || count < emitMax) && !records.isEmpty()); diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/main/java/org/apache/tika/pipes/iterator/kafka/KafkaPipesIteratorConfig.java b/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/main/java/org/apache/tika/pipes/iterator/kafka/KafkaPipesIteratorConfig.java index 53675ac233a..63342adbe31 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/main/java/org/apache/tika/pipes/iterator/kafka/KafkaPipesIteratorConfig.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/main/java/org/apache/tika/pipes/iterator/kafka/KafkaPipesIteratorConfig.java @@ -22,10 +22,9 @@ import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.tika.exception.TikaConfigException; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorConfig; +import org.apache.tika.pipes.pipesiterator.PipesIteratorConfig; -public class KafkaPipesIteratorConfig implements PipesIteratorConfig { +public class KafkaPipesIteratorConfig extends PipesIteratorConfig { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); @@ -49,7 +48,6 @@ public static KafkaPipesIteratorConfig load(final String json) private int pollDelayMs = 100; private int emitMax = -1; private int groupInitialRebalanceDelayMs = 3000; - private PipesIteratorBaseConfig baseConfig = null; public String getTopic() { return topic; @@ -88,16 +86,13 @@ public int getGroupInitialRebalanceDelayMs() { } @Override - public PipesIteratorBaseConfig getBaseConfig() { - return baseConfig; - } - - @Override - public final boolean equals(Object o) { + public boolean equals(Object o) { if (!(o instanceof KafkaPipesIteratorConfig that)) { return false; } - + if (!super.equals(o)) { + return false; + } return pollDelayMs == that.pollDelayMs && emitMax == that.emitMax && groupInitialRebalanceDelayMs == that.groupInitialRebalanceDelayMs && @@ -106,13 +101,13 @@ public final boolean equals(Object o) { Objects.equals(keySerializer, that.keySerializer) && Objects.equals(valueSerializer, that.valueSerializer) && Objects.equals(groupId, that.groupId) && - Objects.equals(autoOffsetReset, that.autoOffsetReset) && - Objects.equals(baseConfig, that.baseConfig); + Objects.equals(autoOffsetReset, that.autoOffsetReset); } @Override public int hashCode() { - int result = Objects.hashCode(topic); + int result = super.hashCode(); + result = 31 * result + Objects.hashCode(topic); result = 31 * result + Objects.hashCode(bootstrapServers); result = 31 * result + Objects.hashCode(keySerializer); result = 31 * result + Objects.hashCode(valueSerializer); @@ -121,7 +116,6 @@ public int hashCode() { result = 31 * result + pollDelayMs; result = 31 * result + emitMax; result = 31 * result + groupInitialRebalanceDelayMs; - result = 31 * result + Objects.hashCode(baseConfig); return result; } } diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/test/java/org/apache/tika/pipes/iterator/kafka/TestKafkaPipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/test/java/org/apache/tika/pipes/iterator/kafka/TestKafkaPipesIterator.java index f71c0b4db86..00e8e364203 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/test/java/org/apache/tika/pipes/iterator/kafka/TestKafkaPipesIterator.java +++ 
b/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/test/java/org/apache/tika/pipes/iterator/kafka/TestKafkaPipesIterator.java @@ -49,10 +49,9 @@ public void testSimple() throws Exception { configNode.put("bootstrapServers", ""); // use one configNode.put("groupId", ""); // find one - ObjectNode baseConfigNode = MAPPER.createObjectNode(); - baseConfigNode.put("fetcherId", "kafka"); - baseConfigNode.put("emitterId", "test-emitter"); - configNode.set("baseConfig", baseConfigNode); + // Add fetcherId and emitterId at root level (not nested in baseConfig) + configNode.put("fetcherId", "kafka"); + configNode.put("emitterId", "test-emitter"); ExtensionConfig extensionConfig = new ExtensionConfig("test-kafka", "kafka-pipes-iterator", MAPPER.writeValueAsString(configNode)); diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/main/java/org/apache/tika/pipes/iterator/s3/S3PipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/main/java/org/apache/tika/pipes/iterator/s3/S3PipesIterator.java index 6a9539ca316..6e6daa11526 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/main/java/org/apache/tika/pipes/iterator/s3/S3PipesIterator.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/main/java/org/apache/tika/pipes/iterator/s3/S3PipesIterator.java @@ -46,10 +46,8 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; -import org.apache.tika.pipes.api.HandlerConfig; import org.apache.tika.pipes.api.emitter.EmitKey; import org.apache.tika.pipes.api.fetcher.FetchKey; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig; import org.apache.tika.pipes.pipesiterator.PipesIteratorBase; import org.apache.tika.plugins.ExtensionConfig; import org.apache.tika.utils.StringUtils; @@ -125,12 +123,10 @@ public static S3PipesIterator build(ExtensionConfig extensionConfig) throws IOEx @Override protected void enqueue() throws InterruptedException, IOException, TimeoutException { - PipesIteratorBaseConfig baseConfig = config.getBaseConfig(); - String fetcherPluginId = baseConfig.fetcherId(); - String emitterName = baseConfig.emitterId(); + String fetcherId = config.getFetcherId(); + String emitterId = config.getEmitterId(); long start = System.currentTimeMillis(); int count = 0; - HandlerConfig handlerConfig = baseConfig.handlerConfig(); final Matcher fileNameMatcher; if (fileNamePattern != null) { fileNameMatcher = fileNamePattern.matcher(""); @@ -149,9 +145,8 @@ protected void enqueue() throws InterruptedException, IOException, TimeoutExcept long elapsed = System.currentTimeMillis() - start; LOGGER.debug("adding ({}) {} in {} ms", count, key, elapsed); ParseContext parseContext = new ParseContext(); - parseContext.set(HandlerConfig.class, handlerConfig); - tryToAdd(new FetchEmitTuple(key, new FetchKey(fetcherPluginId, key), new EmitKey(emitterName, key), new Metadata(), parseContext, - baseConfig.onParseException())); + tryToAdd(new FetchEmitTuple(key, new FetchKey(fetcherId, key), new EmitKey(emitterId, key), new Metadata(), parseContext, + FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT)); count++; } long elapsed = System.currentTimeMillis() - start; diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/main/java/org/apache/tika/pipes/iterator/s3/S3PipesIteratorConfig.java b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/main/java/org/apache/tika/pipes/iterator/s3/S3PipesIteratorConfig.java index dc4bd12c2e8..4e8cf3ef20d 100644 --- 
a/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/main/java/org/apache/tika/pipes/iterator/s3/S3PipesIteratorConfig.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/main/java/org/apache/tika/pipes/iterator/s3/S3PipesIteratorConfig.java @@ -22,10 +22,9 @@ import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.tika.exception.TikaConfigException; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorConfig; +import org.apache.tika.pipes.pipesiterator.PipesIteratorConfig; -public class S3PipesIteratorConfig implements PipesIteratorConfig { +public class S3PipesIteratorConfig extends PipesIteratorConfig { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); @@ -50,7 +49,6 @@ public static S3PipesIteratorConfig load(final String json) private String fileNamePattern; private int maxConnections = 50; private boolean pathStyleAccessEnabled = false; - private PipesIteratorBaseConfig baseConfig = null; public String getPrefix() { return prefix; @@ -97,16 +95,13 @@ public boolean isPathStyleAccessEnabled() { } @Override - public PipesIteratorBaseConfig getBaseConfig() { - return baseConfig; - } - - @Override - public final boolean equals(Object o) { + public boolean equals(Object o) { if (!(o instanceof S3PipesIteratorConfig that)) { return false; } - + if (!super.equals(o)) { + return false; + } return maxConnections == that.maxConnections && pathStyleAccessEnabled == that.pathStyleAccessEnabled && Objects.equals(prefix, that.prefix) && @@ -117,13 +112,13 @@ public final boolean equals(Object o) { Objects.equals(credentialsProvider, that.credentialsProvider) && Objects.equals(profile, that.profile) && Objects.equals(bucket, that.bucket) && - Objects.equals(fileNamePattern, that.fileNamePattern) && - Objects.equals(baseConfig, that.baseConfig); + Objects.equals(fileNamePattern, that.fileNamePattern); } @Override public int hashCode() { - int result = Objects.hashCode(prefix); + int result = super.hashCode(); + result = 31 * result + Objects.hashCode(prefix); result = 31 * result + Objects.hashCode(region); result = 31 * result + Objects.hashCode(accessKey); result = 31 * result + Objects.hashCode(secretKey); @@ -134,7 +129,6 @@ public int hashCode() { result = 31 * result + Objects.hashCode(fileNamePattern); result = 31 * result + maxConnections; result = 31 * result + Boolean.hashCode(pathStyleAccessEnabled); - result = 31 * result + Objects.hashCode(baseConfig); return result; } } diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/java/org/apache/tika/pipes/iterator/s3/TestS3PipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/java/org/apache/tika/pipes/iterator/s3/TestS3PipesIterator.java index d840fc29509..4104b54e698 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/java/org/apache/tika/pipes/iterator/s3/TestS3PipesIterator.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/java/org/apache/tika/pipes/iterator/s3/TestS3PipesIterator.java @@ -50,10 +50,9 @@ public void testSimple() throws Exception { jsonConfig.put("profile", ""); // use one jsonConfig.put("credentialsProvider", "profile"); - ObjectNode baseConfig = OBJECT_MAPPER.createObjectNode(); - baseConfig.put("fetcherId", "s3"); - baseConfig.put("emitterId", "fs"); - jsonConfig.set("baseConfig", baseConfig); + // Add fetcherId and emitterId at root level (not nested in baseConfig) + jsonConfig.put("fetcherId", "s3"); + jsonConfig.put("emitterId", "fs"); 
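The Kafka and S3 iterator tests above now build their configs with fetcherId and emitterId directly at the root of the JSON instead of inside a nested baseConfig node. A minimal standalone sketch of assembling such a flattened iterator config with Jackson; the bucket value and the ids used here are illustrative, not taken from the patch:

import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;

public class FlatIteratorConfigSketch {
    public static void main(String[] args) throws Exception {
        ObjectMapper mapper = new ObjectMapper();

        // fetcherId/emitterId are plain root-level properties of the iterator
        // config; there is no longer a nested "baseConfig" wrapper object.
        ObjectNode configNode = mapper.createObjectNode();
        configNode.put("bucket", "my-test-bucket");   // illustrative value
        configNode.put("fetcherId", "s3");
        configNode.put("emitterId", "fs");

        System.out.println(mapper.writerWithDefaultPrettyPrinter()
                .writeValueAsString(configNode));
    }
}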
ExtensionConfig extensionConfig = new ExtensionConfig("test-s3-iterator", "s3-pipes-iterator", OBJECT_MAPPER.writeValueAsString(jsonConfig)); diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/main/java/org/apache/tika/pipes/iterator/solr/SolrPipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/main/java/org/apache/tika/pipes/iterator/solr/SolrPipesIterator.java index 6be72029b8c..02615bf12e6 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/main/java/org/apache/tika/pipes/iterator/solr/SolrPipesIterator.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/main/java/org/apache/tika/pipes/iterator/solr/SolrPipesIterator.java @@ -42,10 +42,8 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; -import org.apache.tika.pipes.api.HandlerConfig; import org.apache.tika.pipes.api.emitter.EmitKey; import org.apache.tika.pipes.api.fetcher.FetchKey; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig; import org.apache.tika.pipes.pipesiterator.PipesIteratorBase; import org.apache.tika.plugins.ExtensionConfig; import org.apache.tika.utils.StringUtils; @@ -119,9 +117,8 @@ private void configure() throws IOException, TikaConfigException { @Override protected void enqueue() throws InterruptedException, IOException, TimeoutException { - PipesIteratorBaseConfig baseConfig = config.getBaseConfig(); - String fetcherId = baseConfig.fetcherId(); - String emitterId = baseConfig.emitterId(); + String fetcherId = config.getFetcherId(); + String emitterId = config.getEmitterId(); try (SolrClient solrClient = createSolrClient()) { int fileCount = 0; @@ -145,8 +142,6 @@ protected void enqueue() throws InterruptedException, IOException, TimeoutExcept List filters = config.getFilters() != null ? 
config.getFilters() : Collections.emptyList(); query.setFilterQueries(filters.toArray(new String[]{})); - HandlerConfig handlerConfig = baseConfig.handlerConfig(); - String cursorMark = CursorMarkParams.CURSOR_MARK_START; boolean done = false; while (!done) { @@ -167,9 +162,8 @@ protected void enqueue() throws InterruptedException, IOException, TimeoutExcept } LOGGER.info("iterator doc: {}, idField={}, fetchKey={}", sd, config.getIdField(), fetchKey); ParseContext parseContext = new ParseContext(); - parseContext.set(HandlerConfig.class, handlerConfig); tryToAdd(new FetchEmitTuple(fetchKey, new FetchKey(fetcherId, fetchKey), new EmitKey(emitterId, emitKey), new Metadata(), parseContext, - baseConfig.onParseException())); + FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT)); } if (cursorMark.equals(nextCursorMark)) { done = true; diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/main/java/org/apache/tika/pipes/iterator/solr/SolrPipesIteratorConfig.java b/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/main/java/org/apache/tika/pipes/iterator/solr/SolrPipesIteratorConfig.java index 60211ed9ac9..9c37a52819c 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/main/java/org/apache/tika/pipes/iterator/solr/SolrPipesIteratorConfig.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/main/java/org/apache/tika/pipes/iterator/solr/SolrPipesIteratorConfig.java @@ -24,10 +24,9 @@ import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.tika.exception.TikaConfigException; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig; -import org.apache.tika.pipes.api.pipesiterator.PipesIteratorConfig; +import org.apache.tika.pipes.pipesiterator.PipesIteratorConfig; -public class SolrPipesIteratorConfig implements PipesIteratorConfig { +public class SolrPipesIteratorConfig extends PipesIteratorConfig { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); @@ -60,7 +59,6 @@ public static SolrPipesIteratorConfig load(final String json) private String authScheme; private String proxyHost; private int proxyPort = 0; - private PipesIteratorBaseConfig baseConfig = null; public String getSolrCollection() { return solrCollection; @@ -135,16 +133,13 @@ public int getProxyPort() { } @Override - public PipesIteratorBaseConfig getBaseConfig() { - return baseConfig; - } - - @Override - public final boolean equals(Object o) { + public boolean equals(Object o) { if (!(o instanceof SolrPipesIteratorConfig that)) { return false; } - + if (!super.equals(o)) { + return false; + } return rows == that.rows && connectionTimeout == that.connectionTimeout && socketTimeout == that.socketTimeout && @@ -162,13 +157,13 @@ public final boolean equals(Object o) { Objects.equals(userName, that.userName) && Objects.equals(password, that.password) && Objects.equals(authScheme, that.authScheme) && - Objects.equals(proxyHost, that.proxyHost) && - Objects.equals(baseConfig, that.baseConfig); + Objects.equals(proxyHost, that.proxyHost); } @Override public int hashCode() { - int result = Objects.hashCode(solrCollection); + int result = super.hashCode(); + result = 31 * result + Objects.hashCode(solrCollection); result = 31 * result + Objects.hashCode(solrUrls); result = 31 * result + Objects.hashCode(solrZkHosts); result = 31 * result + Objects.hashCode(solrZkChroot); @@ -186,7 +181,6 @@ public int hashCode() { result = 31 * result + Objects.hashCode(authScheme); result = 31 * result + Objects.hashCode(proxyHost); result = 31 * result + proxyPort; - result = 31 * result + 
Objects.hashCode(baseConfig); return result; } } diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentRegistry.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentRegistry.java index 1ab7014ed8e..5ecfffecb5f 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentRegistry.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentRegistry.java @@ -48,6 +48,20 @@ */ public class ComponentRegistry { + /** + * Built-in aliases for external dependencies. + * Maps component names to fully qualified class names. + */ + private static final Map BUILTIN_ALIASES = createBuiltinAliases(); + + private static Map createBuiltinAliases() { + Map aliases = new HashMap<>(); + // EmbeddedDocumentBytesConfig is in tika-pipes-core which can't depend on tika-core for @TikaComponent + aliases.put("embedded-document-bytes-config", + "org.apache.tika.pipes.core.extractor.EmbeddedDocumentBytesConfig"); + return Collections.unmodifiableMap(aliases); + } + private final Map components; private final Map classNameToFriendlyName; // Reverse lookup by class name private final ClassLoader classLoader; diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/ConfigLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/ConfigLoader.java index b57aae89ee9..e262bd64129 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/ConfigLoader.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/ConfigLoader.java @@ -40,10 +40,10 @@ * TikaLoader loader = TikaLoader.load(configPath); * * // Load by explicit key - * HandlerConfig config = loader.configs().load("handler-config", HandlerConfig.class); + * MyConfig config = loader.configs().load("my-config", MyConfig.class); * * // Load by class name (auto-converts to kebab-case) - * HandlerConfig config = loader.configs().load(HandlerConfig.class); + * MyConfig config = loader.configs().load(MyConfig.class); *

 *
 *
 * JSON configuration example:
@@ -57,7 +57,7 @@
 *
 * // Custom configs MUST be in "other-configs" (loaded via configs())
 * "other-configs": {
- *   "handler-config": {
+ *   "my-config": {
 *     "timeout": 5000,
 *     "retries": 3
 *   },
@@ -93,7 +93,7 @@ public class ConfigLoader {
    /**
     * Loads a configuration object using the class name converted to kebab-case.
     *
-     * For example, {@code HandlerConfig.class} will look for key "handler-config".
+     * For example, {@code MyAppConfig.class} will look for key "my-app-config".
     * Class name suffixes like "Config", "Configuration", "Settings" are stripped first.
     *
     * For interfaces, the JSON must specify the implementation (see {@link #load(String, Class)}).
@@ -213,7 +213,7 @@ public  T load(String key, Class clazz, T defaultValue) throws TikaConfigE
     *
     * Example:
     *

    -     * HandlerConfig defaults = new HandlerConfig();
    +     * MyConfig defaults = new MyConfig();
          * defaults.setTimeout(30000);
          * defaults.setRetries(2);
          * defaults.setEnabled(false);
    @@ -221,9 +221,9 @@ public  T load(String key, Class clazz, T defaultValue) throws TikaConfigE
          * // JSON: { "enabled": true }
          * // Result: timeout=30000, retries=2, enabled=true (merged!)
          * // Note: 'defaults' object remains unchanged
    -     * HandlerConfig config = loader.configs().loadWithDefaults("handler-config",
    -     *                                                           HandlerConfig.class,
    -     *                                                           defaults);
    +     * MyConfig config = loader.configs().loadWithDefaults("my-config",
    +     *                                                      MyConfig.class,
    +     *                                                      defaults);
          * 
     *
     * @param key The JSON key to load from
diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java
index 2d9243b81a4..3d6a1ba4735 100644
--- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java
+++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java
@@ -111,6 +111,7 @@ public class TikaJsonConfig {
            "detectors",
            "encoding-detectors",
            "metadata-filters",
+            "content-handler-factory",
            "renderers",
            "translator",
            "auto-detect-parser",
diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
index 5bc29d88f99..b527532e5ba 100644
--- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
+++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
@@ -50,6 +50,8 @@
 import org.apache.tika.parser.Parser;
 import org.apache.tika.renderer.CompositeRenderer;
 import org.apache.tika.renderer.Renderer;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.ContentHandlerFactory;
 import org.apache.tika.serialization.ComponentConfig;
 import org.apache.tika.serialization.ComponentNameResolver;
 import org.apache.tika.serialization.JsonMetadata;
@@ -143,6 +145,12 @@ private static void registerComponentConfigs() {
    // Special cached instances that aren't standard components
    private Parser autoDetectParser;
+    private Detector detectors;
+    private EncodingDetector encodingDetectors;
+    private MetadataFilter metadataFilter;
+    private ContentHandlerFactory contentHandlerFactory;
+    private Renderer renderers;
+    private Translator translator;
    private ConfigLoader configLoader;
    private GlobalSettings globalSettings;
@@ -272,6 +280,47 @@ public MetadataFilter loadMetadataFilters() throws TikaConfigException {
        return get(MetadataFilter.class);
    }
+    /**
+     * Loads and returns the content handler factory.
+     * If "content-handler-factory" section exists in config, uses that factory.
+     * If section missing, returns a default BasicContentHandlerFactory with TEXT handler.
+     * Results are cached - subsequent calls return the same instance.
+     *
+     * Example JSON:
+     *

    +     * {
    +     *   "content-handler-factory": {
    +     *     "basic-content-handler-factory": {
    +     *       "type": "HTML",
    +     *       "writeLimit": 100000
    +     *     }
    +     *   }
    +     * }
    +     * 
+     *
+     * @return the content handler factory
+     * @throws TikaConfigException if loading fails
+     */
+    public synchronized ContentHandlerFactory loadContentHandlerFactory() throws TikaConfigException {
+        if (contentHandlerFactory == null) {
+            // Check if content-handler-factory section exists in config
+            if (config.hasComponentSection("content-handler-factory")) {
+                try {
+                    contentHandlerFactory = config.deserialize("content-handler-factory",
+                            ContentHandlerFactory.class);
+                } catch (IOException e) {
+                    throw new TikaConfigException("Failed to load content-handler-factory", e);
+                }
+            }
+            // Default to BasicContentHandlerFactory with TEXT handler if not configured
+            if (contentHandlerFactory == null) {
+                contentHandlerFactory = new BasicContentHandlerFactory(
+                        BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1);
+            }
+        }
+        return contentHandlerFactory;
+    }
+
    /**
     * Loads and returns all renderers.
     * Syntactic sugar for {@code get(Renderer.class)}.
@@ -335,9 +384,9 @@ public synchronized Parser loadAutoDetectParser() throws TikaConfigException, IO
     *
     * Usage:
     *

    -     * HandlerConfig config = loader.configs().load("handler-config", HandlerConfig.class);
    +     * MyConfig config = loader.configs().load("my-config", MyConfig.class);
          * // Or use kebab-case auto-conversion:
    -     * HandlerConfig config = loader.configs().load(HandlerConfig.class);
    +     * MyConfig config = loader.configs().load(MyConfig.class);
          * 
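For reference, a minimal caller-side sketch of the new loadContentHandlerFactory() added above, assuming a tika-config.json on disk that may or may not contain a "content-handler-factory" section; the class name and main method below are illustrative only:

```java
import java.nio.file.Path;

import org.xml.sax.ContentHandler;

import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.sax.ContentHandlerFactory;

public class ContentHandlerFactoryExample {
    public static void main(String[] args) throws Exception {
        // Path to a config that may contain the "content-handler-factory" section
        // shown in the JSON example above.
        Path configPath = Path.of(args[0]);
        TikaLoader loader = TikaLoader.load(configPath);

        // Returns the configured factory if present; otherwise the cached default
        // BasicContentHandlerFactory(TEXT, -1) built by loadContentHandlerFactory().
        ContentHandlerFactory factory = loader.loadContentHandlerFactory();

        // createHandler() replaces the old getNewContentHandler(); one fresh handler per parse.
        ContentHandler handler = factory.createHandler();
        System.out.println("Using handler: " + handler.getClass().getName());
    }
}
```

Callers that previously passed a HandlerConfig now obtain the handler factory from the loader (or from ParseContext) and keep ParseMode as a separate context entry, as the server and pipes changes later in this diff do.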
    * * @return the ConfigLoader instance diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java index 6c6521a8716..39849524858 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java @@ -47,6 +47,7 @@ public class TikaObjectMapperFactory { "renderers", "translators", "digester-factories", + "content-handler-factories", "other-configs" }; diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/ComponentNameResolver.java b/tika-serialization/src/main/java/org/apache/tika/serialization/ComponentNameResolver.java index 80c54c0178f..195cfd6df05 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/ComponentNameResolver.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/ComponentNameResolver.java @@ -203,4 +203,26 @@ public static boolean hasComponentConfig(Class componentClass) { public static Set getComponentFields() { return Collections.unmodifiableSet(FIELD_TO_CONFIG.keySet()); } + + /** + * Gets the contextKey for a class from the component registry. + * The contextKey is recorded in the .idx file by the annotation processor. + * + * @param clazz the class to check + * @return the contextKey class if specified, or null if not registered or no contextKey + */ + public static Class getContextKey(Class clazz) { + for (ComponentRegistry registry : REGISTRIES.values()) { + String friendlyName = registry.getFriendlyName(clazz); + if (friendlyName != null) { + try { + ComponentInfo info = registry.getComponentInfo(friendlyName); + return info.contextKey(); + } catch (TikaConfigException e) { + // continue to next registry + } + } + } + return null; + } } diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadata.java b/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadata.java index 16607e2ade3..049d9d0327e 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadata.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadata.java @@ -27,7 +27,6 @@ import com.fasterxml.jackson.databind.module.SimpleModule; import org.apache.tika.metadata.Metadata; -import org.apache.tika.serialization.serdes.MetadataDeserializer; import org.apache.tika.serialization.serdes.MetadataSerializer; public class JsonMetadata { @@ -56,13 +55,12 @@ private static void rebuildObjectMappers() { JsonFactory factory = new JsonFactory(); factory.setStreamReadConstraints(streamReadConstraints); + // Use TikaModule which includes Metadata serializers ObjectMapper mapper = new ObjectMapper(factory); - SimpleModule baseModule = new SimpleModule(); - baseModule.addDeserializer(Metadata.class, new MetadataDeserializer()); - baseModule.addSerializer(Metadata.class, new MetadataSerializer()); - mapper.registerModule(baseModule); + mapper.registerModule(new TikaModule()); OBJECT_MAPPER = mapper; + // Pretty printer needs custom serializer with sort flag ObjectMapper prettyMapper = new ObjectMapper(factory); SimpleModule prettySerializerModule = new SimpleModule(); prettySerializerModule.addSerializer(Metadata.class, new MetadataSerializer(true)); diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java 
b/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java index 2571c4c4b95..21f413087fd 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java @@ -28,7 +28,6 @@ import com.fasterxml.jackson.databind.module.SimpleModule; import org.apache.tika.metadata.Metadata; -import org.apache.tika.serialization.serdes.MetadataDeserializer; import org.apache.tika.serialization.serdes.MetadataSerializer; public class JsonMetadataList { @@ -57,13 +56,12 @@ private static void rebuildObjectMappers() { JsonFactory factory = new JsonFactory(); factory.setStreamReadConstraints(streamReadConstraints); + // Use TikaModule which includes Metadata serializers ObjectMapper mapper = new ObjectMapper(factory); - SimpleModule baseModule = new SimpleModule(); - baseModule.addDeserializer(Metadata.class, new MetadataDeserializer()); - baseModule.addSerializer(Metadata.class, new MetadataSerializer()); - mapper.registerModule(baseModule); + mapper.registerModule(new TikaModule()); OBJECT_MAPPER = mapper; + // Pretty printer needs custom serializer with sort flag ObjectMapper prettyMapper = new ObjectMapper(factory); SimpleModule prettySerializerModule = new SimpleModule(); prettySerializerModule.addSerializer(Metadata.class, new MetadataSerializer(true)); diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java b/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java index b50709702f1..249f7f71cfb 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java @@ -53,17 +53,24 @@ import org.apache.tika.exception.TikaConfigException; import org.apache.tika.extractor.EmbeddedDocumentExtractorFactory; import org.apache.tika.language.translate.Translator; +import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.filter.MetadataFilter; import org.apache.tika.metadata.writefilter.MetadataWriteFilterFactory; import org.apache.tika.mime.MediaType; import org.apache.tika.mime.MimeTypes; import org.apache.tika.parser.DefaultParser; +import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.ParserDecorator; import org.apache.tika.renderer.Renderer; import org.apache.tika.sax.ContentHandlerDecoratorFactory; +import org.apache.tika.sax.ContentHandlerFactory; import org.apache.tika.serialization.serdes.DefaultDetectorSerializer; import org.apache.tika.serialization.serdes.DefaultParserSerializer; +import org.apache.tika.serialization.serdes.MetadataDeserializer; +import org.apache.tika.serialization.serdes.MetadataSerializer; +import org.apache.tika.serialization.serdes.ParseContextDeserializer; +import org.apache.tika.serialization.serdes.ParseContextSerializer; /** * Jackson module that provides compact serialization for Tika components. 
@@ -103,6 +110,7 @@ public class TikaModule extends SimpleModule { COMPACT_FORMAT_INTERFACES.add(EmbeddedDocumentExtractorFactory.class); COMPACT_FORMAT_INTERFACES.add(MetadataWriteFilterFactory.class); COMPACT_FORMAT_INTERFACES.add(ContentHandlerDecoratorFactory.class); + COMPACT_FORMAT_INTERFACES.add(ContentHandlerFactory.class); } /** @@ -120,6 +128,14 @@ private static boolean usesCompactFormat(Class type) { public TikaModule() { super("TikaModule"); + + // Register Metadata serializers + addSerializer(Metadata.class, new MetadataSerializer()); + addDeserializer(Metadata.class, new MetadataDeserializer()); + + // Register ParseContext serializers + addSerializer(ParseContext.class, new ParseContextSerializer()); + addDeserializer(ParseContext.class, new ParseContextDeserializer()); } /** diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java index 997fe6e4fed..2dcf2961042 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.util.Iterator; +import java.util.Optional; import com.fasterxml.jackson.core.JsonParser; import com.fasterxml.jackson.databind.DeserializationContext; @@ -30,6 +31,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.tika.config.loader.ComponentInfo; import org.apache.tika.parser.ParseContext; import org.apache.tika.serialization.ComponentNameResolver; @@ -122,19 +124,22 @@ private static void deserializeTypedObjects(JsonNode typedNode, ParseContext par JsonNode configNode = typedNode.get(componentName); Class configClass = null; + Class contextKeyClass = null; // First, try component registry lookup (for friendly names like "pdf-parser-config") - try { - configClass = ComponentNameResolver.resolveClass( - componentName, ParseContextDeserializer.class.getClassLoader()); - } catch (ClassNotFoundException e) { - // Not in registry, try as FQCN + Optional infoOpt = ComponentNameResolver.getComponentInfo(componentName); + if (infoOpt.isPresent()) { + ComponentInfo info = infoOpt.get(); + configClass = info.componentClass(); + contextKeyClass = info.contextKey(); } // If not found in registry, try as fully qualified class name if (configClass == null) { try { configClass = Class.forName(componentName); + // Check if the class has a contextKey via its annotation + contextKeyClass = ComponentNameResolver.getContextKey(configClass); } catch (ClassNotFoundException e) { LOG.warn("Could not find class for typed component '{}', storing as JSON config", componentName); @@ -144,11 +149,15 @@ private static void deserializeTypedObjects(JsonNode typedNode, ParseContext par } } + // Use contextKey if available, otherwise use the config class itself + Class parseContextKey = (contextKeyClass != null) ? 
contextKeyClass : configClass; + // Deserialize and add to context try { Object config = mapper.treeToValue(configNode, configClass); - parseContext.set((Class) configClass, config); - LOG.debug("Deserialized typed object '{}' -> {}", componentName, configClass.getName()); + parseContext.set((Class) parseContextKey, config); + LOG.debug("Deserialized typed object '{}' -> {} (contextKey={})", + componentName, configClass.getName(), parseContextKey.getName()); } catch (Exception e) { LOG.warn("Failed to deserialize typed component '{}' as {}, storing as JSON config", componentName, configClass.getName(), e); diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java index 903c48f3e7c..e2545d4033a 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java @@ -49,19 +49,25 @@ public class ParseContextSerializer extends JsonSerializer { public static final String PARSE_CONTEXT = "parseContext"; public static final String TYPED = "typed"; + // Plain mapper for serializing values without TikaModule's component wrapping + private static final ObjectMapper PLAIN_MAPPER = new ObjectMapper(); + + static { + // Allow serialization of classes with no properties + PLAIN_MAPPER.disable(com.fasterxml.jackson.databind.SerializationFeature.FAIL_ON_EMPTY_BEANS); + } + @Override public void serialize(ParseContext parseContext, JsonGenerator gen, SerializerProvider serializers) throws IOException { gen.writeStartObject(); - ObjectMapper mapper = (ObjectMapper) gen.getCodec(); - // First, serialize typed objects from the context map under "typed" key Map contextMap = parseContext.getContextMap(); boolean hasTypedObjects = false; for (Map.Entry entry : contextMap.entrySet()) { - String className = entry.getKey(); + String keyClassName = entry.getKey(); Object value = entry.getValue(); // Skip null values @@ -69,10 +75,14 @@ public void serialize(ParseContext parseContext, JsonGenerator gen, continue; } - // Try to find a friendly component name, otherwise use FQCN - String keyName = findComponentName(className); + // Use the actual value's class for serialization, not the key class (which may be an interface) + // This ensures we can deserialize back to the concrete class + String valueClassName = value.getClass().getName(); + + // Try to find a friendly component name for the value's class, otherwise use FQCN + String keyName = findComponentName(valueClassName); if (keyName == null) { - keyName = className; + keyName = valueClassName; } if (!hasTypedObjects) { @@ -81,7 +91,7 @@ public void serialize(ParseContext parseContext, JsonGenerator gen, hasTypedObjects = true; } gen.writeFieldName(keyName); - gen.writeRawValue(mapper.writeValueAsString(value)); + gen.writeRawValue(PLAIN_MAPPER.writeValueAsString(value)); } if (hasTypedObjects) { diff --git a/tika-serialization/src/test/java/org/apache/tika/config/loader/ConfigLoaderTest.java b/tika-serialization/src/test/java/org/apache/tika/config/loader/ConfigLoaderTest.java index 80063b151d5..1db87866e7f 100644 --- a/tika-serialization/src/test/java/org/apache/tika/config/loader/ConfigLoaderTest.java +++ b/tika-serialization/src/test/java/org/apache/tika/config/loader/ConfigLoaderTest.java @@ -51,9 +51,9 @@ public void setUp() throws Exception { // 
==================== Test POJOs ==================== /** - * Simple config POJO with properties. + * Simple config POJO with properties for testing config loading. */ - public static class HandlerConfig { + public static class RetryConfig { private int timeout; private int retries; private boolean enabled; @@ -185,7 +185,7 @@ public abstract static class AbstractHandler implements TestHandler { @Test public void testLoadByExplicitKey() throws Exception { - HandlerConfig config = configLoader.load("handler-config", HandlerConfig.class); + RetryConfig config = configLoader.load("retry-config", RetryConfig.class); assertNotNull(config); assertEquals(5000, config.getTimeout()); @@ -195,7 +195,7 @@ public void testLoadByExplicitKey() throws Exception { @Test public void testLoadByClassNameKebabCase() throws Exception { - HandlerConfig config = configLoader.load(HandlerConfig.class); + RetryConfig config = configLoader.load(RetryConfig.class); assertNotNull(config); assertEquals(5000, config.getTimeout()); @@ -224,20 +224,20 @@ public void testLoadByClassNameMyFeatureSettings() throws Exception { @Test public void testLoadWithDefaultValue() throws Exception { - HandlerConfig config = configLoader.load("handler-config", HandlerConfig.class); + RetryConfig config = configLoader.load("retry-config", RetryConfig.class); assertNotNull(config); // Non-existent key with default - HandlerConfig defaultConfig = new HandlerConfig(); + RetryConfig defaultConfig = new RetryConfig(); defaultConfig.setTimeout(9999); - HandlerConfig result = configLoader.load("non-existent", HandlerConfig.class, defaultConfig); + RetryConfig result = configLoader.load("non-existent", RetryConfig.class, defaultConfig); assertEquals(9999, result.getTimeout()); } @Test public void testLoadMissingKeyReturnsNull() throws Exception { - HandlerConfig config = configLoader.load("non-existent-key", HandlerConfig.class); + RetryConfig config = configLoader.load("non-existent-key", RetryConfig.class); assertNull(config); } @@ -312,7 +312,7 @@ public void testLoadProhibitedKeyMetadataFilters() throws Exception { @Test public void testHasKey() throws Exception { - assertTrue(configLoader.hasKey("handler-config")); + assertTrue(configLoader.hasKey("retry-config")); assertTrue(configLoader.hasKey("simple-handler")); assertFalse(configLoader.hasKey("non-existent")); } @@ -350,10 +350,10 @@ public void testLoadWithUnexpectedFieldFails() throws Exception { TikaLoader loader = TikaLoader.load(configPath); TikaConfigException ex = assertThrows(TikaConfigException.class, () -> - loader.configs().load("handler-config", HandlerConfig.class)); + loader.configs().load("retry-config", RetryConfig.class)); // Should contain information about the unrecognized field - assertTrue(ex.getMessage().contains("handler-config") || + assertTrue(ex.getMessage().contains("retry-config") || ex.getCause().getMessage().contains("Unrecognized") || ex.getCause().getMessage().contains("unexpectedField"), "Exception should mention the unrecognized field"); @@ -370,7 +370,7 @@ public void testKebabCaseConversion() throws Exception { @Test public void testLoadByClassWithDefault() throws Exception { - HandlerConfig config = configLoader.load(HandlerConfig.class); + RetryConfig config = configLoader.load(RetryConfig.class); assertNotNull(config); // Non-existent class @@ -394,14 +394,14 @@ public void testLoadWithDefaultsPartialConfig() throws Exception { TikaLoader loader = TikaLoader.load(configPath); // Set up defaults - HandlerConfig defaults = new HandlerConfig(); + 
RetryConfig defaults = new RetryConfig(); defaults.setTimeout(30000); defaults.setRetries(2); defaults.setEnabled(false); // JSON only has: { "enabled": true } - HandlerConfig config = loader.configs().loadWithDefaults("handler-config", - HandlerConfig.class, + RetryConfig config = loader.configs().loadWithDefaults("retry-config", + RetryConfig.class, defaults); assertNotNull(config); @@ -417,14 +417,14 @@ public void testLoadWithDefaultsFullOverride() throws Exception { getClass().getResource("/configs/test-partial-config.json").toURI()); TikaLoader loader = TikaLoader.load(configPath); - HandlerConfig defaults = new HandlerConfig(); + RetryConfig defaults = new RetryConfig(); defaults.setTimeout(30000); defaults.setRetries(2); defaults.setEnabled(false); // JSON has: { "timeout": 10000, "retries": 5, "enabled": false } - HandlerConfig config = loader.configs().loadWithDefaults("handler-config-full", - HandlerConfig.class, + RetryConfig config = loader.configs().loadWithDefaults("retry-config-full", + RetryConfig.class, defaults); assertNotNull(config); @@ -436,13 +436,13 @@ public void testLoadWithDefaultsFullOverride() throws Exception { @Test public void testLoadWithDefaultsMissingKey() throws Exception { // When key doesn't exist, should return original defaults unchanged - HandlerConfig defaults = new HandlerConfig(); + RetryConfig defaults = new RetryConfig(); defaults.setTimeout(30000); defaults.setRetries(2); defaults.setEnabled(false); - HandlerConfig config = configLoader.loadWithDefaults("non-existent-key", - HandlerConfig.class, + RetryConfig config = configLoader.loadWithDefaults("non-existent-key", + RetryConfig.class, defaults); assertNotNull(config); @@ -458,13 +458,13 @@ public void testLoadWithDefaultsByClass() throws Exception { getClass().getResource("/configs/test-partial-config.json").toURI()); TikaLoader loader = TikaLoader.load(configPath); - HandlerConfig defaults = new HandlerConfig(); + RetryConfig defaults = new RetryConfig(); defaults.setTimeout(30000); defaults.setRetries(2); defaults.setEnabled(false); - // Uses kebab-case: HandlerConfig -> "handler-config" - HandlerConfig config = loader.configs().loadWithDefaults(HandlerConfig.class, defaults); + // Uses kebab-case: RetryConfig -> "retry-config" + RetryConfig config = loader.configs().loadWithDefaults(RetryConfig.class, defaults); assertNotNull(config); assertEquals(30000, config.getTimeout()); @@ -479,20 +479,20 @@ public void testLoadVsLoadWithDefaults() throws Exception { getClass().getResource("/configs/test-partial-config.json").toURI()); TikaLoader loader = TikaLoader.load(configPath); - HandlerConfig defaults = new HandlerConfig(); + RetryConfig defaults = new RetryConfig(); defaults.setTimeout(30000); defaults.setRetries(2); defaults.setEnabled(false); // Using load() - creates new object, loses defaults - HandlerConfig config1 = loader.configs().load("handler-config", HandlerConfig.class); + RetryConfig config1 = loader.configs().load("retry-config", RetryConfig.class); assertEquals(0, config1.getTimeout()); // ❌ Lost default! assertEquals(0, config1.getRetries()); // ❌ Lost default! assertTrue(config1.isEnabled()); // ✅ From JSON // Using loadWithDefaults() - merges into defaults - HandlerConfig config2 = loader.configs().loadWithDefaults("handler-config", - HandlerConfig.class, + RetryConfig config2 = loader.configs().loadWithDefaults("retry-config", + RetryConfig.class, defaults); assertEquals(30000, config2.getTimeout()); // ✅ Kept default! 
assertEquals(2, config2.getRetries()); // ✅ Kept default! @@ -508,14 +508,14 @@ public void testLoadWithDefaultsDoesNotMutateOriginal() throws Exception { getClass().getResource("/configs/test-partial-config.json").toURI()); TikaLoader loader = TikaLoader.load(configPath); - HandlerConfig defaults = new HandlerConfig(); + RetryConfig defaults = new RetryConfig(); defaults.setTimeout(30000); defaults.setRetries(2); defaults.setEnabled(false); // Load config with partial override (JSON only has "enabled": true) - HandlerConfig result = loader.configs().loadWithDefaults("handler-config", - HandlerConfig.class, + RetryConfig result = loader.configs().loadWithDefaults("retry-config", + RetryConfig.class, defaults); // Verify result has merged values @@ -541,17 +541,17 @@ public void testLoadWithDefaultsReusableDefaults() throws Exception { getClass().getResource("/configs/test-partial-config.json").toURI()); TikaLoader loader = TikaLoader.load(configPath); - HandlerConfig defaults = new HandlerConfig(); + RetryConfig defaults = new RetryConfig(); defaults.setTimeout(30000); defaults.setRetries(2); defaults.setEnabled(false); // Load multiple times with same defaults - HandlerConfig config1 = loader.configs().loadWithDefaults("handler-config", - HandlerConfig.class, + RetryConfig config1 = loader.configs().loadWithDefaults("retry-config", + RetryConfig.class, defaults); - HandlerConfig config2 = loader.configs().loadWithDefaults("handler-config-full", - HandlerConfig.class, + RetryConfig config2 = loader.configs().loadWithDefaults("retry-config-full", + RetryConfig.class, defaults); // Verify results are different @@ -564,8 +564,8 @@ public void testLoadWithDefaultsReusableDefaults() throws Exception { assertFalse(defaults.isEnabled()); // Use defaults one more time - HandlerConfig config3 = loader.configs().loadWithDefaults("non-existent", - HandlerConfig.class, + RetryConfig config3 = loader.configs().loadWithDefaults("non-existent", + RetryConfig.class, defaults); assertEquals(defaults, config3); // Should return original when key missing } @@ -595,13 +595,13 @@ public void testLoadWithDefaultsComplexObjectImmutability() throws Exception { @Test public void testLoadWithDefaultsMissingKeyDoesNotClone() throws Exception { // When key is missing, should return the original object (no unnecessary cloning) - HandlerConfig defaults = new HandlerConfig(); + RetryConfig defaults = new RetryConfig(); defaults.setTimeout(30000); defaults.setRetries(2); defaults.setEnabled(false); - HandlerConfig result = configLoader.loadWithDefaults("non-existent-key", - HandlerConfig.class, + RetryConfig result = configLoader.loadWithDefaults("non-existent-key", + RetryConfig.class, defaults); // Should return the exact same object when key is missing @@ -619,17 +619,17 @@ public void testLoadWithDefaultsThreadSafety() throws Exception { TikaLoader loader = TikaLoader.load(configPath); // Shared defaults object - HandlerConfig sharedDefaults = new HandlerConfig(); + RetryConfig sharedDefaults = new RetryConfig(); sharedDefaults.setTimeout(30000); sharedDefaults.setRetries(2); sharedDefaults.setEnabled(false); // Simulate concurrent usage (not a real concurrency test, just demonstrates safety) - HandlerConfig result1 = loader.configs().loadWithDefaults("handler-config", - HandlerConfig.class, + RetryConfig result1 = loader.configs().loadWithDefaults("retry-config", + RetryConfig.class, sharedDefaults); - HandlerConfig result2 = loader.configs().loadWithDefaults("handler-config-full", - HandlerConfig.class, + 
RetryConfig result2 = loader.configs().loadWithDefaults("retry-config-full", + RetryConfig.class, sharedDefaults); // Both results should be valid diff --git a/tika-serialization/src/test/java/org/apache/tika/sax/UppercasingContentHandlerFactory.java b/tika-serialization/src/test/java/org/apache/tika/sax/UppercasingContentHandlerFactory.java index cf0c56043f8..2826320979e 100644 --- a/tika-serialization/src/test/java/org/apache/tika/sax/UppercasingContentHandlerFactory.java +++ b/tika-serialization/src/test/java/org/apache/tika/sax/UppercasingContentHandlerFactory.java @@ -16,9 +16,6 @@ */ package org.apache.tika.sax; -import java.io.OutputStream; -import java.nio.charset.Charset; - import org.xml.sax.ContentHandler; import org.apache.tika.config.TikaComponent; @@ -36,17 +33,7 @@ public class UppercasingContentHandlerFactory implements ContentHandlerFactory { private static final long serialVersionUID = 1L; @Override - public ContentHandler getNewContentHandler() { + public ContentHandler createHandler() { return new UppercasingContentHandler(new ToTextContentHandler()); } - - @Override - public ContentHandler getNewContentHandler(OutputStream os, Charset charset) { - try { - return new UppercasingContentHandler(new ToTextContentHandler(os, charset.name())); - } catch (java.io.UnsupportedEncodingException e) { - // Should never happen since we're using a valid Charset - throw new RuntimeException("Unexpected encoding error", e); - } - } } diff --git a/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java b/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java index 43c2b9cd74a..c8fd0e42210 100644 --- a/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java +++ b/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java @@ -39,6 +39,8 @@ import org.apache.tika.metadata.filter.MetadataFilter; import org.apache.tika.metadata.filter.MockUpperCaseFilter; import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.PasswordProvider; +import org.apache.tika.parser.SimplePasswordProvider; import org.apache.tika.serialization.serdes.ParseContextDeserializer; import org.apache.tika.serialization.serdes.ParseContextSerializer; @@ -313,4 +315,31 @@ public void testContextKeyDeserialization() throws Exception { assertFalse(selector.select(new org.apache.tika.metadata.Metadata()), "SkipEmbeddedDocumentSelector should return false for all documents"); } + + @Test + public void testSimplePasswordProviderDeserialization() throws Exception { + // Test that SimplePasswordProvider with contextKey=PasswordProvider.class + // is stored in ParseContext with the contextKey + String json = """ + { + "simple-password-provider": { + "password": "secret123" + } + } + """; + + ObjectMapper mapper = createMapper(); + ParseContext deserialized = mapper.readValue(json, ParseContext.class); + + // Resolve the config + ParseContextUtils.resolveAll(deserialized, Thread.currentThread().getContextClassLoader()); + + // Should be accessible via PasswordProvider.class (the contextKey) + PasswordProvider provider = deserialized.get(PasswordProvider.class); + assertNotNull(provider, "PasswordProvider should be found via contextKey"); + assertTrue(provider instanceof SimplePasswordProvider, + "Should be SimplePasswordProvider instance"); + assertEquals("secret123", provider.getPassword(null), + "Password should match the configured value"); + } } 
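The new testSimplePasswordProviderDeserialization above exercises the contextKey path end to end. A standalone sketch of that same flow follows; it mirrors the test, but the package of ParseContextUtils and the mapper setup are assumptions based on this diff (the test itself builds its mapper through a createMapper() helper not shown here):

```java
import com.fasterxml.jackson.databind.ObjectMapper;

import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.serialization.ParseContextUtils; // assumed package
import org.apache.tika.serialization.TikaModule;

public class ContextKeyRoundTrip {
    public static void main(String[] args) throws Exception {
        // JSON keyed by the component's friendly name, as in the test above.
        String json = """
                {
                  "simple-password-provider": {
                    "password": "secret123"
                  }
                }
                """;

        // TikaModule now registers the Metadata and ParseContext (de)serializers itself.
        ObjectMapper mapper = new ObjectMapper();
        mapper.registerModule(new TikaModule());

        ParseContext context = mapper.readValue(json, ParseContext.class);
        // Resolve stored JSON configs into typed objects on the context.
        ParseContextUtils.resolveAll(context, Thread.currentThread().getContextClassLoader());

        // The value is stored under its contextKey (PasswordProvider),
        // not its concrete SimplePasswordProvider class.
        PasswordProvider provider = context.get(PasswordProvider.class);
        System.out.println(provider.getPassword(null)); // -> secret123
    }
}
```

This is the division of labor introduced here: ParseContextSerializer keys entries by the value's concrete class so they can be deserialized back, while ParseContextDeserializer stores the resulting object under the contextKey recorded in the component registry.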
diff --git a/tika-serialization/src/test/resources/configs/test-config-loader.json b/tika-serialization/src/test/resources/configs/test-config-loader.json index 5305f2a43a9..dd657c81e05 100644 --- a/tika-serialization/src/test/resources/configs/test-config-loader.json +++ b/tika-serialization/src/test/resources/configs/test-config-loader.json @@ -4,7 +4,7 @@ ], "other-configs": { - "handler-config": { + "retry-config": { "timeout": 5000, "retries": 3, "enabled": true diff --git a/tika-serialization/src/test/resources/configs/test-partial-config.json b/tika-serialization/src/test/resources/configs/test-partial-config.json index 866f2594b7c..5c5eab6992a 100644 --- a/tika-serialization/src/test/resources/configs/test-partial-config.json +++ b/tika-serialization/src/test/resources/configs/test-partial-config.json @@ -1,10 +1,10 @@ { "other-configs": { - "handler-config": { + "retry-config": { "enabled": true }, - "handler-config-full": { + "retry-config-full": { "timeout": 10000, "retries": 5, "enabled": false diff --git a/tika-serialization/src/test/resources/configs/test-unexpected-field.json b/tika-serialization/src/test/resources/configs/test-unexpected-field.json index d250d5fa1d3..5946b399ea9 100644 --- a/tika-serialization/src/test/resources/configs/test-unexpected-field.json +++ b/tika-serialization/src/test/resources/configs/test-unexpected-field.json @@ -1,6 +1,6 @@ { "other-configs": { - "handler-config": { + "retry-config": { "timeout": 5000, "retries": 3, "enabled": true, diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java index d215552db17..698241cc3bb 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java @@ -45,8 +45,9 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.RecursiveParserWrapper; -import org.apache.tika.pipes.api.HandlerConfig; +import org.apache.tika.pipes.api.ParseMode; import org.apache.tika.sax.BasicContentHandlerFactory; +import org.apache.tika.sax.ContentHandlerFactory; import org.apache.tika.sax.RecursiveParserWrapperHandler; import org.apache.tika.server.core.MetadataList; import org.apache.tika.server.core.TikaServerParseException; @@ -59,7 +60,7 @@ public class RecursiveMetadataResource { private static final Logger LOG = LoggerFactory.getLogger(RecursiveMetadataResource.class); public static List parseMetadata(TikaInputStream tis, Metadata metadata, MultivaluedMap httpHeaders, - UriInfo info, HandlerConfig handlerConfig) + UriInfo info, ServerHandlerConfig handlerConfig) throws Exception { final ParseContext context = new ParseContext(); @@ -69,10 +70,16 @@ public static List parseMetadata(TikaInputStream tis, Metadata metadat fillMetadata(parser, metadata, httpHeaders); TikaResource.logRequest(LOG, "/rmeta", metadata); - BasicContentHandlerFactory.HANDLER_TYPE type = handlerConfig.getType(); + // Check if a ContentHandlerFactory was provided in ParseContext + ContentHandlerFactory factory = context.get(ContentHandlerFactory.class); + if (factory == null) { + // Fall back to creating one from HTTP headers + BasicContentHandlerFactory.HANDLER_TYPE type = handlerConfig.type(); + factory = new BasicContentHandlerFactory(type, 
handlerConfig.writeLimit(), handlerConfig.throwOnWriteLimitReached(), context); + } RecursiveParserWrapperHandler handler = - new RecursiveParserWrapperHandler(new BasicContentHandlerFactory(type, handlerConfig.getWriteLimit(), handlerConfig.isThrowOnWriteLimitReached(), context), - handlerConfig.getMaxEmbeddedResources()); + new RecursiveParserWrapperHandler(factory, + handlerConfig.maxEmbeddedResources()); try { TikaResource.parse(wrapper, LOG, "/rmeta", tis, handler, metadata, context); } catch (TikaServerParseException e) { @@ -90,7 +97,7 @@ public static List parseMetadata(TikaInputStream tis, Metadata metadat return metadataList; } - static HandlerConfig buildHandlerConfig(MultivaluedMap httpHeaders, String handlerTypeName, HandlerConfig.PARSE_MODE parseMode) { + static ServerHandlerConfig buildHandlerConfig(MultivaluedMap httpHeaders, String handlerTypeName, ParseMode parseMode) { int writeLimit = -1; if (httpHeaders.containsKey("writeLimit")) { writeLimit = Integer.parseInt(httpHeaders.getFirst("writeLimit")); @@ -100,7 +107,7 @@ static HandlerConfig buildHandlerConfig(MultivaluedMap httpHeade if (httpHeaders.containsKey("maxEmbeddedResources")) { maxEmbeddedResources = Integer.parseInt(httpHeaders.getFirst("maxEmbeddedResources")); } - return new HandlerConfig(BasicContentHandlerFactory.parseHandlerType(handlerTypeName, DEFAULT_HANDLER_TYPE), parseMode, writeLimit, maxEmbeddedResources, + return new ServerHandlerConfig(BasicContentHandlerFactory.parseHandlerType(handlerTypeName, DEFAULT_HANDLER_TYPE), parseMode, writeLimit, maxEmbeddedResources, TikaResource.getThrowOnWriteLimitReached(httpHeaders)); } @@ -136,7 +143,7 @@ public Response getMetadataFromMultipart(Attachment att, @Context UriInfo info, try (TikaInputStream tis = TikaInputStream.get(att.getObject(InputStream.class))) { return Response .ok(parseMetadataToMetadataList(tis, new Metadata(), att.getHeaders(), info, - buildHandlerConfig(att.getHeaders(), handlerTypeName, HandlerConfig.PARSE_MODE.RMETA))) + buildHandlerConfig(att.getHeaders(), handlerTypeName, ParseMode.RMETA))) .build(); } } @@ -163,21 +170,27 @@ public Response getMetadataWithConfig( return Response .ok(parseMetadataWithContext(tis, metadata, httpHeaders.getRequestHeaders(), info, - buildHandlerConfig(httpHeaders.getRequestHeaders(), handlerTypeName != null ? handlerTypeName.substring(1) : null, HandlerConfig.PARSE_MODE.RMETA), + buildHandlerConfig(httpHeaders.getRequestHeaders(), handlerTypeName != null ? 
handlerTypeName.substring(1) : null, ParseMode.RMETA), context)) .build(); } } private MetadataList parseMetadataWithContext(TikaInputStream tis, Metadata metadata, MultivaluedMap httpHeaders, - UriInfo info, HandlerConfig handlerConfig, ParseContext context) throws Exception { + UriInfo info, ServerHandlerConfig handlerConfig, ParseContext context) throws Exception { Parser parser = TikaResource.createParser(); RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser); - BasicContentHandlerFactory.HANDLER_TYPE type = handlerConfig.getType(); + // Check if a ContentHandlerFactory was provided in ParseContext (e.g., from config JSON) + ContentHandlerFactory factory = context.get(ContentHandlerFactory.class); + if (factory == null) { + // Fall back to creating one from HTTP headers + BasicContentHandlerFactory.HANDLER_TYPE type = handlerConfig.type(); + factory = new BasicContentHandlerFactory(type, handlerConfig.writeLimit(), handlerConfig.throwOnWriteLimitReached(), context); + } RecursiveParserWrapperHandler handler = - new RecursiveParserWrapperHandler(new BasicContentHandlerFactory(type, handlerConfig.getWriteLimit(), handlerConfig.isThrowOnWriteLimitReached(), context), - handlerConfig.getMaxEmbeddedResources()); + new RecursiveParserWrapperHandler(factory, + handlerConfig.maxEmbeddedResources()); try { TikaResource.parse(wrapper, LOG, "/rmeta/config", tis, handler, metadata, context); } catch (TikaServerParseException e) { @@ -225,12 +238,13 @@ public Response getMetadata(InputStream is, @Context HttpHeaders httpHeaders, @C try (TikaInputStream tis = TikaResource.getInputStream(is, metadata, httpHeaders, info)) { return Response .ok(parseMetadataToMetadataList(tis, metadata, httpHeaders.getRequestHeaders(), info, - buildHandlerConfig(httpHeaders.getRequestHeaders(), handlerTypeName, HandlerConfig.PARSE_MODE.RMETA))) + buildHandlerConfig(httpHeaders.getRequestHeaders(), handlerTypeName, ParseMode.RMETA))) .build(); } } - private MetadataList parseMetadataToMetadataList(TikaInputStream tis, Metadata metadata, MultivaluedMap httpHeaders, UriInfo info, HandlerConfig handlerConfig) + private MetadataList parseMetadataToMetadataList(TikaInputStream tis, Metadata metadata, + MultivaluedMap httpHeaders, UriInfo info, ServerHandlerConfig handlerConfig) throws Exception { return new MetadataList(parseMetadata(tis, metadata, httpHeaders, info, handlerConfig)); } diff --git a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/pipesiterator/PipesIteratorConfig.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/ServerHandlerConfig.java similarity index 58% rename from tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/pipesiterator/PipesIteratorConfig.java rename to tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/ServerHandlerConfig.java index 09a9ab4abb3..b46802aecd1 100644 --- a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/pipesiterator/PipesIteratorConfig.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/ServerHandlerConfig.java @@ -14,8 +14,21 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.tika.pipes.api.pipesiterator; +package org.apache.tika.server.core.resource; -public interface PipesIteratorConfig { - PipesIteratorBaseConfig getBaseConfig(); +import org.apache.tika.pipes.api.ParseMode; +import org.apache.tika.sax.BasicContentHandlerFactory; + +/** + * Server-internal configuration for request handlers. + * This holds configuration parsed from HTTP headers for a single request + * for the BasicContentHandlerFactory kinds of elements. + */ +public record ServerHandlerConfig( + BasicContentHandlerFactory.HANDLER_TYPE type, + ParseMode parseMode, + int writeLimit, + int maxEmbeddedResources, + boolean throwOnWriteLimitReached +) { } diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java index 9b312606b21..ad379252491 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java @@ -17,7 +17,6 @@ package org.apache.tika.server.core.resource; import static java.nio.charset.StandardCharsets.UTF_8; -import static org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig.DEFAULT_HANDLER_CONFIG; import static org.apache.tika.server.core.resource.RecursiveMetadataResource.DEFAULT_HANDLER_TYPE; import static org.apache.tika.server.core.resource.RecursiveMetadataResource.HANDLER_TYPE_PARAM; @@ -77,6 +76,7 @@ import org.apache.tika.parser.Parser; import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.ContentHandlerFactory; import org.apache.tika.sax.ExpandedTitleContentHandler; import org.apache.tika.sax.RichTextContentHandler; import org.apache.tika.sax.boilerpipe.BoilerpipeContentHandler; @@ -151,24 +151,22 @@ public static void mergeParseContextFromConfig(String configJson, ParseContext c JsonNode root = mapper.readTree(configJson); // Use root directly - the JSON should contain parser configs at the top level ParseContext configuredContext = ParseContextDeserializer.readParseContext(root, mapper); - - // Copy jsonConfigs first (for SelfConfiguring parsers like PDFParser) - for (Map.Entry entry : configuredContext.getJsonConfigs().entrySet()) { - context.setJsonConfig(entry.getKey(), entry.getValue()); - } - - // Then resolve all configs to typed objects ParseContextUtils.resolveAll(configuredContext, Thread.currentThread().getContextClassLoader()); - - // Copy resolved typed objects from contextMap + // Copy resolved context entries for (Map.Entry entry : configuredContext.getContextMap().entrySet()) { try { Class clazz = Class.forName(entry.getKey()); context.set((Class) clazz, entry.getValue()); + LOG.debug("Merged contextMap entry {} into context", entry.getKey()); } catch (ClassNotFoundException e) { LOG.warn("Could not load class for parseContext entry: {}", entry.getKey()); } } + // Copy jsonConfigs for lazy resolution by parsers (e.g., pdf-parser config) + for (Map.Entry entry : configuredContext.getJsonConfigs().entrySet()) { + context.setJsonConfig(entry.getKey(), entry.getValue().json()); + LOG.debug("Merged jsonConfig entry {} into context", entry.getKey()); + } } public static TikaInputStream getInputStream(InputStream is, Metadata metadata, HttpHeaders headers, UriInfo uriInfo) { @@ -352,7 +350,8 @@ public static boolean 
getThrowOnWriteLimitReached(MultivaluedMap throw new IllegalArgumentException("'throwOnWriteLimitReached' must be either 'true' or 'false'"); } } - return DEFAULT_HANDLER_CONFIG.isThrowOnWriteLimitReached(); + // Default: throw on write limit reached + return true; } public static long getTaskTimeout(ParseContext parseContext) { @@ -542,9 +541,14 @@ private void parseToMetadata(TikaInputStream tis, Metadata metadata, Multivalued writeLimit = Integer.parseInt(httpHeaders.getFirst("writeLimit")); } - BasicContentHandlerFactory.HANDLER_TYPE type = BasicContentHandlerFactory.parseHandlerType(handlerTypeName, DEFAULT_HANDLER_TYPE); - BasicContentHandlerFactory fact = new BasicContentHandlerFactory(type, writeLimit, throwOnWriteLimitReached, context); - ContentHandler contentHandler = fact.getNewContentHandler(); + // Check if a ContentHandlerFactory was provided in ParseContext (e.g., from config JSON) + ContentHandlerFactory fact = context.get(ContentHandlerFactory.class); + if (fact == null) { + // Fall back to creating one from HTTP headers + BasicContentHandlerFactory.HANDLER_TYPE type = BasicContentHandlerFactory.parseHandlerType(handlerTypeName, DEFAULT_HANDLER_TYPE); + fact = new BasicContentHandlerFactory(type, writeLimit, throwOnWriteLimitReached, context); + } + ContentHandler contentHandler = fact.createHandler(); try { parse(parser, LOG, info.getPath(), tis, contentHandler, metadata, context); diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaPipesTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaPipesTest.java index 7bff9149c01..ee899ab180e 100644 --- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaPipesTest.java +++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaPipesTest.java @@ -54,13 +54,14 @@ import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; -import org.apache.tika.pipes.api.HandlerConfig; +import org.apache.tika.pipes.api.ParseMode; import org.apache.tika.pipes.api.emitter.EmitKey; import org.apache.tika.pipes.api.fetcher.FetchKey; import org.apache.tika.pipes.core.fetcher.FetcherManager; import org.apache.tika.pipes.core.serialization.JsonFetchEmitTuple; import org.apache.tika.plugins.TikaPluginManager; import org.apache.tika.sax.BasicContentHandlerFactory; +import org.apache.tika.sax.ContentHandlerFactory; import org.apache.tika.serialization.JsonMetadataList; import org.apache.tika.server.core.resource.PipesResource; import org.apache.tika.server.core.writer.JSONObjWriter; @@ -203,8 +204,9 @@ public void testPostXML() throws Exception { userMetadata.add("my-key-multi", s); } ParseContext parseContext = new ParseContext(); - HandlerConfig handlerConfig = new HandlerConfig(BasicContentHandlerFactory.HANDLER_TYPE.XML, HandlerConfig.PARSE_MODE.RMETA, -1, -1, true); - parseContext.set(HandlerConfig.class, handlerConfig); + parseContext.set(ContentHandlerFactory.class, + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); + parseContext.set(ParseMode.class, ParseMode.RMETA); FetchEmitTuple t = new FetchEmitTuple("myId", new FetchKey(FETCHER_ID, "hello_world.xml"), new EmitKey(EMITTER_JSON_ID, ""), userMetadata, parseContext, FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT); diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerAsyncIntegrationTest.java 
b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerAsyncIntegrationTest.java index 97e10201ac6..fb7aaf554bd 100644 --- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerAsyncIntegrationTest.java +++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerAsyncIntegrationTest.java @@ -18,7 +18,6 @@ import static java.nio.charset.StandardCharsets.UTF_8; -import static org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig.DEFAULT_HANDLER_CONFIG; import static org.apache.tika.server.core.CXFTestBase.EMITTER_JSON_ID; import static org.apache.tika.server.core.CXFTestBase.FETCHER_ID; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -49,10 +48,12 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; -import org.apache.tika.pipes.api.HandlerConfig; +import org.apache.tika.pipes.api.ParseMode; import org.apache.tika.pipes.api.emitter.EmitKey; import org.apache.tika.pipes.api.fetcher.FetchKey; import org.apache.tika.pipes.core.serialization.JsonFetchEmitTupleList; +import org.apache.tika.sax.BasicContentHandlerFactory; +import org.apache.tika.sax.ContentHandlerFactory; @Disabled("useful for development...need to turn it into a real unit test") public class TikaServerAsyncIntegrationTest extends IntegrationTestBase { @@ -170,7 +171,9 @@ private JsonNode sendAsync(List fileNames) throws Exception { private FetchEmitTuple getFetchEmitTuple(String fileName) throws IOException { ParseContext parseContext = new ParseContext(); - parseContext.set(HandlerConfig.class, DEFAULT_HANDLER_CONFIG); + parseContext.set(ContentHandlerFactory.class, + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); + parseContext.set(ParseMode.class, ParseMode.RMETA); return new FetchEmitTuple(fileName, new FetchKey(FETCHER_ID, fileName), new EmitKey(EMITTER_JSON_ID, ""), new Metadata(), parseContext, ON_PARSE_EXCEPTION); } diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerPipesIntegrationTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerPipesIntegrationTest.java index e7957c9df2d..bcbe5251c72 100644 --- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerPipesIntegrationTest.java +++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerPipesIntegrationTest.java @@ -18,7 +18,6 @@ import static java.nio.charset.StandardCharsets.UTF_8; -import static org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig.DEFAULT_HANDLER_CONFIG; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -43,10 +42,12 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; -import org.apache.tika.pipes.api.HandlerConfig; +import org.apache.tika.pipes.api.ParseMode; import org.apache.tika.pipes.api.emitter.EmitKey; import org.apache.tika.pipes.api.fetcher.FetchKey; import org.apache.tika.pipes.core.serialization.JsonFetchEmitTuple; +import org.apache.tika.sax.BasicContentHandlerFactory; +import org.apache.tika.sax.ContentHandlerFactory; import org.apache.tika.utils.ProcessUtils; public class TikaServerPipesIntegrationTest extends IntegrationTestBase { @@ -221,7 +222,9 @@ 
private JsonNode testOneWithPerRequestTimeout(String fileName, long timeoutMilli private String getJsonStringWithTimeout(String fileName, long timeoutMillis) throws IOException { ParseContext parseContext = new ParseContext(); - parseContext.set(HandlerConfig.class, DEFAULT_HANDLER_CONFIG); + parseContext.set(ContentHandlerFactory.class, + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); + parseContext.set(ParseMode.class, ParseMode.RMETA); parseContext.setJsonConfig("tika-task-timeout", "{\"timeoutMillis\":" + timeoutMillis + "}"); FetchEmitTuple t = new FetchEmitTuple(fileName, @@ -259,7 +262,9 @@ private JsonNode testOne(String fileName, boolean shouldFileExist, FetchEmitTupl private String getJsonString(String fileName, FetchEmitTuple.ON_PARSE_EXCEPTION onParseException) throws IOException { ParseContext parseContext = new ParseContext(); - parseContext.set(HandlerConfig.class, DEFAULT_HANDLER_CONFIG); + parseContext.set(ContentHandlerFactory.class, + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); + parseContext.set(ParseMode.class, ParseMode.RMETA); FetchEmitTuple t = new FetchEmitTuple(fileName, new FetchKey(CXFTestBase.FETCHER_ID, fileName), new EmitKey(CXFTestBase.EMITTER_JSON_ID, ""), new Metadata(), parseContext, onParseException); return JsonFetchEmitTuple.toJson(t); diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java index 8dcd90a296b..4946191552d 100644 --- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java +++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java @@ -427,8 +427,8 @@ public void testEmbeddedResourceLimit() throws Exception { } } - // TIKA-3227 - TODO: re-enable once HandlerConfig is configurable via JSON - // Use maxEmbeddedResources=0 in handler-config to skip embedded documents + // TIKA-3227 - TODO: re-enable once maxEmbeddedResources is configurable via JSON + // Use maxEmbeddedResources=0 in config to skip embedded documents @Test public void testWriteLimit() throws Exception { diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java index 3761fb0bfe2..078a83038e8 100644 --- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java +++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java @@ -16,7 +16,6 @@ */ package org.apache.tika.server.standard; -import static org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig.DEFAULT_HANDLER_CONFIG; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -56,13 +55,15 @@ import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; -import org.apache.tika.pipes.api.HandlerConfig; +import org.apache.tika.pipes.api.ParseMode; import org.apache.tika.pipes.api.emitter.EmitKey; import org.apache.tika.pipes.api.fetcher.FetchKey; import org.apache.tika.pipes.core.extractor.EmbeddedDocumentBytesConfig; import 
org.apache.tika.pipes.core.fetcher.FetcherManager; import org.apache.tika.pipes.core.serialization.JsonFetchEmitTuple; import org.apache.tika.plugins.TikaPluginManager; +import org.apache.tika.sax.BasicContentHandlerFactory; +import org.apache.tika.sax.ContentHandlerFactory; import org.apache.tika.serialization.JsonMetadataList; import org.apache.tika.server.core.CXFTestBase; import org.apache.tika.server.core.FetcherStreamFactory; @@ -181,9 +182,8 @@ public void testBasic() throws Exception { @Test public void testConcatenated() throws Exception { ParseContext parseContext = new ParseContext(); - // Use addConfig with JSON for handler-config - parseContext.setJsonConfig("handler-config", - "{\"type\": \"TEXT\", \"parseMode\": \"CONCATENATE\", \"writeLimit\": -1, \"maxEmbeddedResources\": -1, \"throwOnWriteLimitReached\": true}"); + // Set ParseMode directly - it's now separate from ContentHandlerFactory + parseContext.set(ParseMode.class, ParseMode.CONCATENATE); FetchEmitTuple t = new FetchEmitTuple("myId", new FetchKey(FETCHER_ID, "test_recursive_embedded.docx"), new EmitKey(EMITTER_JSON_ID, ""), new Metadata(), parseContext, @@ -247,7 +247,10 @@ public void testBytes() throws Exception { config.setZeroPadName(10); config.setSuffixStrategy(EmbeddedDocumentBytesConfig.SUFFIX_STRATEGY.EXISTING); ParseContext parseContext = new ParseContext(); - parseContext.set(HandlerConfig.class, DEFAULT_HANDLER_CONFIG); + // Set default content handler and parse mode + parseContext.set(ContentHandlerFactory.class, + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); + parseContext.set(ParseMode.class, ParseMode.RMETA); parseContext.set(EmbeddedDocumentBytesConfig.class, config); FetchEmitTuple t = new FetchEmitTuple("myId", new FetchKey(FETCHER_ID, "test_recursive_embedded.docx"),