diff --git a/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java b/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java
index 273dfeda1cf..9e818627c62 100644
--- a/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java
+++ b/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java
@@ -77,6 +77,8 @@ public class TikaComponentProcessor extends AbstractProcessor {
        SERVICE_INTERFACES.put("org.apache.tika.renderer.Renderer", "renderers");
        SERVICE_INTERFACES.put("org.apache.tika.metadata.filter.MetadataFilter", "metadata-filters");
        SERVICE_INTERFACES.put("org.apache.tika.digest.DigesterFactory", "digester-factories");
+        SERVICE_INTERFACES.put("org.apache.tika.sax.ContentHandlerFactory",
+                "content-handler-factories");
    }

    private Messager messager;
diff --git a/tika-annotation-processor/src/main/java/org/apache/tika/config/TikaComponent.java b/tika-annotation-processor/src/main/java/org/apache/tika/config/TikaComponent.java
index e7f35814cb9..69e42570b30 100644
--- a/tika-annotation-processor/src/main/java/org/apache/tika/config/TikaComponent.java
+++ b/tika-annotation-processor/src/main/java/org/apache/tika/config/TikaComponent.java
@@ -34,8 +34,8 @@
 *
- * This annotation is only used at compile time by the annotation processor.
- * It is retained in .class files for tooling but not loaded by the runtime JVM.
+ * This annotation is processed at compile time by the annotation processor.
+ * The contextKey is recorded in the .idx file for runtime resolution.
 *
 * Example usage:
diff --git a/tika-app/src/test/resources/configs/config-template.json b/tika-app/src/test/resources/configs/config-template.json
index dc73dadfe1f..e25bc96833a 100644
--- a/tika-app/src/test/resources/configs/config-template.json
+++ b/tika-app/src/test/resources/configs/config-template.json
@@ -1,4 +1,12 @@
{
+ "content-handler-factory": {
+ "basic-content-handler-factory": {
+ "type": "TEXT",
+ "writeLimit": -1,
+ "maxEmbeddedResources": -1,
+ "throwOnWriteLimitReached": true
+ }
+ },
"fetchers": {
"fsf": {
"file-system-fetcher": {
@@ -21,23 +29,13 @@
"file-system-pipes-iterator": {
"basePath": "FETCHER_BASE_PATH",
"countTotal": true,
- "baseConfig": {
- "fetcherId": "fsf",
- "emitterId": "fse",
- "handlerConfig": {
- "type": "TEXT",
- "parseMode": "RMETA",
- "writeLimit": -1,
- "maxEmbeddedResources": -1,
- "throwOnWriteLimitReached": true
- },
- "onParseException": "EMIT",
- "maxWaitMs": 600000,
- "queueSize": 10000
- }
+ "fetcherId": "fsf",
+ "emitterId": "fse"
}
},
"pipes": {
+ "parseMode": "RMETA",
+ "onParseException": "EMIT",
"emitWithinMillis": 10000,
"emitMaxEstimatedBytes": 100000,
"queueSize": 10000,
diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index 0c0599ec765..07159dba01b 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -141,7 +141,7 @@ public void parse(TikaInputStream tis, ContentHandler recursiveParserWrapperHand
new EmbeddedParserDecorator(getWrappedParser(), "/", "/", parserState);
context.set(Parser.class, decorator);
ContentHandler localHandler =
- parserState.recursiveParserWrapperHandler.getNewContentHandler();
+ parserState.recursiveParserWrapperHandler.createHandler();
long started = System.currentTimeMillis();
parserState.recursiveParserWrapperHandler.startDocument();
int writeLimit = -1;
@@ -241,7 +241,7 @@ public void parse(TikaInputStream tis, ContentHandler ignore, Metadata metadata,
metadata.set(TikaCoreProperties.EMBEDDED_ID, parserState.embeddedCount);
//get a fresh handler
ContentHandler localHandler =
- parserState.recursiveParserWrapperHandler.getNewContentHandler();
+ parserState.recursiveParserWrapperHandler.createHandler();
parserState.recursiveParserWrapperHandler.startEmbeddedDocument(localHandler, metadata);
Parser preContextParser = context.get(Parser.class);
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
index 568d61c2570..cc78a55be3e 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
@@ -260,7 +260,7 @@ private void parse(TikaInputStream tis, ContentHandler handler,
// If not, the user will get text from every parser
// mushed together onto the one solitary handler...
if (handlerFactory != null) {
- handler = handlerFactory.getNewContentHandler();
+ handler = handlerFactory.createHandler();
}
// Record that we used this parser
diff --git a/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java b/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java
index 850ceb4147c..ea4efedf6b2 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java
@@ -16,9 +16,7 @@
*/
package org.apache.tika.sax;
-import java.io.OutputStream;
import java.io.Serializable;
-import java.nio.charset.Charset;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -55,12 +53,8 @@ public AbstractRecursiveParserWrapperHandler(ContentHandlerFactory contentHandle
this.maxEmbeddedResources = maxEmbeddedResources;
}
- public ContentHandler getNewContentHandler() {
- return contentHandlerFactory.getNewContentHandler();
- }
-
- public ContentHandler getNewContentHandler(OutputStream os, Charset charset) {
- return contentHandlerFactory.getNewContentHandler(os, charset);
+ public ContentHandler createHandler() {
+ return contentHandlerFactory.createHandler();
}
/**
diff --git a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
index 361b7817c72..2612ec8650b 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
@@ -26,19 +26,30 @@
import org.xml.sax.ContentHandler;
import org.xml.sax.helpers.DefaultHandler;
+import org.apache.tika.config.TikaComponent;
import org.apache.tika.parser.ParseContext;
/**
- * Basic factory for creating common types of ContentHandlers
+ * Basic factory for creating common types of ContentHandlers.
+ *
+ * Implements {@link StreamingContentHandlerFactory} to support both in-memory
+ * content extraction and streaming output to an OutputStream.
*/
-public class BasicContentHandlerFactory implements ContentHandlerFactory, WriteLimiter {
+@TikaComponent(contextKey = ContentHandlerFactory.class)
+public class BasicContentHandlerFactory implements StreamingContentHandlerFactory, WriteLimiter {
- private final HANDLER_TYPE type;
- private final int writeLimit;
+ private HANDLER_TYPE type = HANDLER_TYPE.TEXT;
+ private int writeLimit = -1;
+ private boolean throwOnWriteLimitReached = true;
+ private int maxEmbeddedResources = -1;
+ private transient ParseContext parseContext;
- private final boolean throwOnWriteLimitReached;
-
- private final ParseContext parseContext;
+ /**
+ * No-arg constructor for bean-style configuration (e.g., Jackson deserialization).
+ * Creates a factory with TEXT handler type, unlimited write, and throwOnWriteLimitReached=true.
+ */
+ public BasicContentHandlerFactory() {
+ }
/**
* Create a BasicContentHandlerFactory with {@link #throwOnWriteLimitReached} set to true
@@ -70,7 +81,29 @@ public BasicContentHandlerFactory(HANDLER_TYPE type, int writeLimit,
throw new IllegalArgumentException("parse context must not be null if " +
"throwOnWriteLimitReached is false");
}
+ }
+ /**
+ * Full constructor with all parameters including maxEmbeddedResources.
+ *
+ * @param type basic type of handler
+ * @param writeLimit maximum number of characters to store; -1 for unlimited
+ * @param throwOnWriteLimitReached whether to throw when write limit is reached
+ * @param maxEmbeddedResources maximum number of embedded resources to process; -1 for unlimited
+ * @param parseContext to store warnings if throwOnWriteLimitReached is false
+ */
+ public BasicContentHandlerFactory(HANDLER_TYPE type, int writeLimit,
+ boolean throwOnWriteLimitReached, int maxEmbeddedResources,
+ ParseContext parseContext) {
+ this.type = type;
+ this.writeLimit = writeLimit;
+ this.throwOnWriteLimitReached = throwOnWriteLimitReached;
+ this.maxEmbeddedResources = maxEmbeddedResources;
+ this.parseContext = parseContext;
+ if (!throwOnWriteLimitReached && parseContext == null) {
+ throw new IllegalArgumentException("parse context must not be null if " +
+ "throwOnWriteLimitReached is false");
+ }
}
/**
@@ -108,7 +141,7 @@ public static HANDLER_TYPE parseHandlerType(String handlerTypeName, HANDLER_TYPE
}
@Override
- public ContentHandler getNewContentHandler() {
+ public ContentHandler createHandler() {
if (type == HANDLER_TYPE.BODY) {
return new BodyContentHandler(
@@ -139,7 +172,7 @@ private ContentHandler getFormatHandler() {
}
@Override
- public ContentHandler getNewContentHandler(OutputStream os, Charset charset) {
+ public ContentHandler createHandler(OutputStream os, Charset charset) {
if (type == HANDLER_TYPE.IGNORE) {
return new DefaultHandler();
@@ -191,6 +224,14 @@ public HANDLER_TYPE getType() {
return type;
}
+ /**
+ * Sets the handler type.
+ * @param type the handler type
+ */
+ public void setType(HANDLER_TYPE type) {
+ this.type = type;
+ }
+
/**
* Common handler types for content.
*/
@@ -203,8 +244,72 @@ public int getWriteLimit() {
return writeLimit;
}
+ /**
+ * Sets the write limit.
+ * @param writeLimit max characters to extract; -1 for unlimited
+ */
+ public void setWriteLimit(int writeLimit) {
+ this.writeLimit = writeLimit;
+ }
+
@Override
public boolean isThrowOnWriteLimitReached() {
return throwOnWriteLimitReached;
}
+
+ /**
+ * Sets whether to throw an exception when write limit is reached.
+ * @param throwOnWriteLimitReached true to throw, false to silently stop
+ */
+ public void setThrowOnWriteLimitReached(boolean throwOnWriteLimitReached) {
+ this.throwOnWriteLimitReached = throwOnWriteLimitReached;
+ }
+
+ /**
+ * Gets the maximum number of embedded resources to process.
+ * @return max embedded resources; -1 for unlimited
+ */
+ public int getMaxEmbeddedResources() {
+ return maxEmbeddedResources;
+ }
+
+ /**
+ * Sets the maximum number of embedded resources to process.
+ * @param maxEmbeddedResources max embedded resources; -1 for unlimited
+ */
+ public void setMaxEmbeddedResources(int maxEmbeddedResources) {
+ this.maxEmbeddedResources = maxEmbeddedResources;
+ }
+
+ /**
+ * Sets the parse context for storing warnings when throwOnWriteLimitReached is false.
+ * @param parseContext the parse context
+ */
+ public void setParseContext(ParseContext parseContext) {
+ this.parseContext = parseContext;
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+ if (o == null || getClass() != o.getClass()) {
+ return false;
+ }
+ BasicContentHandlerFactory that = (BasicContentHandlerFactory) o;
+ return writeLimit == that.writeLimit &&
+ throwOnWriteLimitReached == that.throwOnWriteLimitReached &&
+ maxEmbeddedResources == that.maxEmbeddedResources &&
+ type == that.type;
+ }
+
+ @Override
+ public int hashCode() {
+ int result = type != null ? type.hashCode() : 0;
+ result = 31 * result + writeLimit;
+ result = 31 * result + (throwOnWriteLimitReached ? 1 : 0);
+ result = 31 * result + maxEmbeddedResources;
+ return result;
+ }
}
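Editorial note: the no-arg constructor and the new setters exist so the factory can be materialized bean-style from the "basic-content-handler-factory" JSON blocks added to the config templates in this PR. A minimal sketch of the equivalent programmatic setup (the wrapper class is illustrative; the real path goes through the JSON config loader rather than direct setter calls):

    import org.apache.tika.sax.BasicContentHandlerFactory;
    import org.apache.tika.sax.BasicContentHandlerFactory.HANDLER_TYPE;

    class FactoryDefaultsSketch {
        static BasicContentHandlerFactory defaults() {
            BasicContentHandlerFactory factory = new BasicContentHandlerFactory();
            factory.setType(HANDLER_TYPE.TEXT);           // same values as the JSON templates
            factory.setWriteLimit(-1);                    // -1 = unlimited
            factory.setMaxEmbeddedResources(-1);
            factory.setThrowOnWriteLimitReached(true);
            return factory;
        }
    }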
diff --git a/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java
index dc2f3384fcf..4c7efd7231f 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java
@@ -16,19 +16,27 @@
*/
package org.apache.tika.sax;
-
-import java.io.OutputStream;
import java.io.Serializable;
-import java.nio.charset.Charset;
import org.xml.sax.ContentHandler;
/**
- * Interface to allow easier injection of code for getting a new ContentHandler
+ * Factory interface for creating ContentHandler instances.
+ *
+ * This is the base interface used by tika-pipes, RecursiveParserWrapper, and other
+ * components that need to create content handlers for in-memory content extraction.
+ *
+ * For streaming output to an OutputStream, see {@link StreamingContentHandlerFactory}.
+ *
+ * @see StreamingContentHandlerFactory
+ * @see BasicContentHandlerFactory
*/
public interface ContentHandlerFactory extends Serializable {
- ContentHandler getNewContentHandler();
-
- ContentHandler getNewContentHandler(OutputStream os, Charset charset);
+ /**
+ * Creates a new ContentHandler for extracting content.
+ *
+ * @return a new ContentHandler instance
+ */
+ ContentHandler createHandler();
}
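Editorial note: with the streaming method moved out of this interface, a custom factory only has to supply the in-memory variant. A minimal sketch against the slimmed-down contract (the class name and package are illustrative; ToTextContentHandler is an existing org.apache.tika.sax handler):

    import org.xml.sax.ContentHandler;

    import org.apache.tika.sax.ContentHandlerFactory;
    import org.apache.tika.sax.ToTextContentHandler;

    // Illustrative only: hands out a fresh plain-text handler per (embedded) document.
    public class PlainTextHandlerFactory implements ContentHandlerFactory {
        @Override
        public ContentHandler createHandler() {
            return new ToTextContentHandler();
        }
    }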
diff --git a/tika-core/src/main/java/org/apache/tika/sax/StreamingContentHandlerFactory.java b/tika-core/src/main/java/org/apache/tika/sax/StreamingContentHandlerFactory.java
new file mode 100644
index 00000000000..02279c16972
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/sax/StreamingContentHandlerFactory.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import java.io.OutputStream;
+import java.nio.charset.Charset;
+
+import org.xml.sax.ContentHandler;
+
+/**
+ * Extended factory interface for creating ContentHandler instances that write
+ * directly to an OutputStream.
+ *
+ * This interface extends {@link ContentHandlerFactory} to add streaming output
+ * capability, primarily used by tika-server's /tika endpoint for streaming
+ * responses back to clients.
+ *
+ * @see ContentHandlerFactory
+ * @see BasicContentHandlerFactory
+ */
+public interface StreamingContentHandlerFactory extends ContentHandlerFactory {
+
+ /**
+ * Creates a new ContentHandler that writes output directly to the given OutputStream.
+ *
+ * @param os the output stream to write to
+ * @param charset the character encoding to use
+ * @return a new ContentHandler instance that writes to the stream
+ */
+ ContentHandler createHandler(OutputStream os, Charset charset);
+}
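Editorial note: callers that may or may not have an OutputStream available can branch on this capability. A rough sketch, assuming the caller already holds a factory and a stream (the helper class and method names are made up for illustration):

    import java.io.OutputStream;
    import java.nio.charset.StandardCharsets;

    import org.xml.sax.ContentHandler;

    import org.apache.tika.sax.ContentHandlerFactory;
    import org.apache.tika.sax.StreamingContentHandlerFactory;

    class HandlerCreationSketch {
        // Prefer streaming output when the factory supports it; otherwise
        // fall back to an in-memory handler.
        static ContentHandler create(ContentHandlerFactory factory, OutputStream out) {
            if (factory instanceof StreamingContentHandlerFactory) {
                return ((StreamingContentHandlerFactory) factory)
                        .createHandler(out, StandardCharsets.UTF_8);
            }
            return factory.createHandler();
        }
    }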
diff --git a/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java b/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java
index 8a177c12ed4..bc6260d0a4a 100644
--- a/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java
+++ b/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java
@@ -73,7 +73,7 @@ public void testIgnore() throws Exception {
Parser p = new MockParser(OVER_DEFAULT);
ContentHandler handler =
new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1)
- .getNewContentHandler();
+ .createHandler();
assertTrue(handler instanceof DefaultHandler);
p.parse(null, handler, null, null);
//unfortunately, the DefaultHandler does not return "",
@@ -82,7 +82,7 @@ public void testIgnore() throws Exception {
//tests that no write limit exception is thrown
p = new MockParser(100);
handler = new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, 5)
- .getNewContentHandler();
+ .createHandler();
assertTrue(handler instanceof DefaultHandler);
p.parse(null, handler, null, null);
assertContains("org.xml.sax.helpers.DefaultHandler", handler.toString());
@@ -92,7 +92,7 @@ public void testIgnore() throws Exception {
public void testText() throws Exception {
Parser p = new MockParser(OVER_DEFAULT);
BasicContentHandlerFactory.HANDLER_TYPE type = BasicContentHandlerFactory.HANDLER_TYPE.TEXT;
- ContentHandler handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler();
+ ContentHandler handler = new BasicContentHandlerFactory(type, -1).createHandler();
assertTrue(handler instanceof ToTextContentHandler);
p.parse(null, handler, null, null);
@@ -104,7 +104,7 @@ public void testText() throws Exception {
assertTrue(extracted.length() > 110000);
//now test write limit
p = new MockParser(10);
- handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler();
+ handler = new BasicContentHandlerFactory(type, 5).createHandler();
assertTrue(handler instanceof WriteOutContentHandler);
assertWriteLimitReached(p, (WriteOutContentHandler) handler);
extracted = handler.toString();
@@ -114,7 +114,7 @@ public void testText() throws Exception {
//now test outputstream call
p = new MockParser(OVER_DEFAULT);
ByteArrayOutputStream os = new ByteArrayOutputStream();
- handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, UTF_8);
+ handler = new BasicContentHandlerFactory(type, -1).createHandler(os, UTF_8);
assertTrue(handler instanceof ToTextContentHandler);
p.parse(null, handler, null, null);
assertContains("This is the title", os.toByteArray());
@@ -125,7 +125,7 @@ public void testText() throws Exception {
p = new MockParser(10);
os = new ByteArrayOutputStream();
- handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(os, UTF_8);
+ handler = new BasicContentHandlerFactory(type, 5).createHandler(os, UTF_8);
assertTrue(handler instanceof WriteOutContentHandler);
assertWriteLimitReached(p, (WriteOutContentHandler) handler);
//When writing to an OutputStream and a write limit is reached,
@@ -137,7 +137,7 @@ public void testText() throws Exception {
public void testHTML() throws Exception {
Parser p = new MockParser(OVER_DEFAULT);
BasicContentHandlerFactory.HANDLER_TYPE type = BasicContentHandlerFactory.HANDLER_TYPE.HTML;
- ContentHandler handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler();
+ ContentHandler handler = new BasicContentHandlerFactory(type, -1).createHandler();
assertTrue(handler instanceof ToHTMLContentHandler);
p.parse(null, handler, null, null);
@@ -148,7 +148,7 @@ public void testHTML() throws Exception {
//now test write limit
p = new MockParser(10);
- handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler();
+ handler = new BasicContentHandlerFactory(type, 5).createHandler();
assertTrue(handler instanceof WriteOutContentHandler);
assertWriteLimitReached(p, (WriteOutContentHandler) handler);
extracted = handler.toString();
@@ -158,7 +158,7 @@ public void testHTML() throws Exception {
//now test outputstream call
p = new MockParser(OVER_DEFAULT);
ByteArrayOutputStream os = new ByteArrayOutputStream();
- handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, UTF_8);
+ handler = new BasicContentHandlerFactory(type, -1).createHandler(os, UTF_8);
assertTrue(handler instanceof ToHTMLContentHandler);
p.parse(null, handler, null, null);
assertContains("This is the title", os.toByteArray());
@@ -170,7 +170,7 @@ public void testHTML() throws Exception {
p = new MockParser(10);
os = new ByteArrayOutputStream();
- handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(os, UTF_8);
+ handler = new BasicContentHandlerFactory(type, 5).createHandler(os, UTF_8);
assertTrue(handler instanceof WriteOutContentHandler);
assertWriteLimitReached(p, (WriteOutContentHandler) handler);
assertEquals(0, os.toByteArray().length);
@@ -180,7 +180,7 @@ public void testHTML() throws Exception {
public void testXML() throws Exception {
Parser p = new MockParser(OVER_DEFAULT);
BasicContentHandlerFactory.HANDLER_TYPE type = BasicContentHandlerFactory.HANDLER_TYPE.HTML;
- ContentHandler handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler();
+ ContentHandler handler = new BasicContentHandlerFactory(type, -1).createHandler();
assertTrue(handler instanceof ToXMLContentHandler);
p.parse(null, handler, new Metadata(), null);
@@ -191,7 +191,7 @@ public void testXML() throws Exception {
//now test write limit
p = new MockParser(10);
- handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler();
+ handler = new BasicContentHandlerFactory(type, 5).createHandler();
assertTrue(handler instanceof WriteOutContentHandler);
assertWriteLimitReached(p, (WriteOutContentHandler) handler);
extracted = handler.toString();
@@ -201,7 +201,7 @@ public void testXML() throws Exception {
//now test outputstream call
p = new MockParser(OVER_DEFAULT);
ByteArrayOutputStream os = new ByteArrayOutputStream();
- handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, UTF_8);
+ handler = new BasicContentHandlerFactory(type, -1).createHandler(os, UTF_8);
assertTrue(handler instanceof ToXMLContentHandler);
p.parse(null, handler, null, null);
@@ -214,7 +214,7 @@ public void testXML() throws Exception {
p = new MockParser(10);
os = new ByteArrayOutputStream();
- handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(os, UTF_8);
+ handler = new BasicContentHandlerFactory(type, 5).createHandler(os, UTF_8);
assertTrue(handler instanceof WriteOutContentHandler);
assertWriteLimitReached(p, (WriteOutContentHandler) handler);
assertEquals(0, os.toByteArray().length);
@@ -224,7 +224,7 @@ public void testXML() throws Exception {
public void testBody() throws Exception {
Parser p = new MockParser(OVER_DEFAULT);
BasicContentHandlerFactory.HANDLER_TYPE type = BasicContentHandlerFactory.HANDLER_TYPE.BODY;
- ContentHandler handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler();
+ ContentHandler handler = new BasicContentHandlerFactory(type, -1).createHandler();
assertTrue(handler instanceof BodyContentHandler);
@@ -236,7 +236,7 @@ public void testBody() throws Exception {
//now test write limit
p = new MockParser(10);
- handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler();
+ handler = new BasicContentHandlerFactory(type, 5).createHandler();
assertTrue(handler instanceof BodyContentHandler);
assertWriteLimitReached(p, (BodyContentHandler) handler);
extracted = handler.toString();
@@ -246,7 +246,7 @@ public void testBody() throws Exception {
//now test outputstream call
p = new MockParser(OVER_DEFAULT);
ByteArrayOutputStream os = new ByteArrayOutputStream();
- handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, UTF_8);
+ handler = new BasicContentHandlerFactory(type, -1).createHandler(os, UTF_8);
assertTrue(handler instanceof BodyContentHandler);
p.parse(null, handler, null, null);
assertNotContains("title", os.toByteArray());
@@ -257,7 +257,7 @@ public void testBody() throws Exception {
p = new MockParser(10);
os = new ByteArrayOutputStream();
- handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(os, UTF_8);
+ handler = new BasicContentHandlerFactory(type, 5).createHandler(os, UTF_8);
assertTrue(handler instanceof WriteOutContentHandler);
assertWriteLimitReached(p, (WriteOutContentHandler) handler);
assertEquals(0, os.toByteArray().length);
diff --git a/tika-eval/tika-eval-app/src/main/resources/pipes-iterator-template.json b/tika-eval/tika-eval-app/src/main/resources/pipes-iterator-template.json
index a5a7ddfad37..4ae623d6065 100644
--- a/tika-eval/tika-eval-app/src/main/resources/pipes-iterator-template.json
+++ b/tika-eval/tika-eval-app/src/main/resources/pipes-iterator-template.json
@@ -1,18 +1,6 @@
{
"basePath": "FETCHER_BASE_PATH",
"countTotal": true,
- "baseConfig": {
- "fetcherId": "fsf",
- "emitterId": "",
- "handlerConfig": {
- "type": "TEXT",
- "parseMode": "RMETA",
- "writeLimit": -1,
- "maxEmbeddedResources": -1,
- "throwOnWriteLimitReached": true
- },
- "onParseException": "EMIT",
- "maxWaitMs": 600000,
- "queueSize": 10000
- }
-}
\ No newline at end of file
+ "fetcherId": "fsf",
+ "emitterId": ""
+}
diff --git a/tika-example/src/main/java/org/apache/tika/example/PickBestTextEncodingParser.java b/tika-example/src/main/java/org/apache/tika/example/PickBestTextEncodingParser.java
index 4796401ebf8..c42a562c898 100644
--- a/tika-example/src/main/java/org/apache/tika/example/PickBestTextEncodingParser.java
+++ b/tika-example/src/main/java/org/apache/tika/example/PickBestTextEncodingParser.java
@@ -17,7 +17,6 @@
package org.apache.tika.example;
import java.io.IOException;
-import java.io.OutputStream;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
@@ -147,7 +146,7 @@ public void parse(TikaInputStream tis, ContentHandler handler, Metadata original
public void parse(TikaInputStream tis, ContentHandlerFactory handlers, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
// We only work with one ContentHandler as far as the user is
// concerned, any others are purely internal!
- parse(tis, handlers.getNewContentHandler(), metadata, context);
+ parse(tis, handlers.createHandler(), metadata, context);
}
protected class CharsetContentHandlerFactory implements ContentHandlerFactory {
@@ -157,18 +156,13 @@ protected class CharsetContentHandlerFactory implements ContentHandlerFactory {
private ContentHandler handler;
@Override
- public ContentHandler getNewContentHandler() {
+ public ContentHandler createHandler() {
index++;
if (index < charsetsToTry.length) {
return new BodyContentHandler();
}
return handler;
}
-
- @Override
- public ContentHandler getNewContentHandler(OutputStream os, Charset charset) {
- return getNewContentHandler();
- }
}
protected class CharsetTester {
diff --git a/tika-example/src/main/java/org/apache/tika/example/PipesForkParserExample.java b/tika-example/src/main/java/org/apache/tika/example/PipesForkParserExample.java
index e4439b801f6..4b69d10afa0 100644
--- a/tika-example/src/main/java/org/apache/tika/example/PipesForkParserExample.java
+++ b/tika-example/src/main/java/org/apache/tika/example/PipesForkParserExample.java
@@ -26,7 +26,7 @@
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.pipes.api.HandlerConfig;
+import org.apache.tika.pipes.api.ParseMode;
import org.apache.tika.pipes.core.PipesException;
import org.apache.tika.pipes.fork.PipesForkParser;
import org.apache.tika.pipes.fork.PipesForkParserConfig;
@@ -277,7 +277,7 @@ public void parseWithMetadata(Path filePath)
public void parseEmbeddedDocumentsRmeta(Path filePath)
throws IOException, InterruptedException, TikaException, PipesException {
PipesForkParserConfig config = new PipesForkParserConfig()
- .setParseMode(HandlerConfig.PARSE_MODE.RMETA);
+ .setParseMode(ParseMode.RMETA);
try (PipesForkParser parser = new PipesForkParser(config);
TikaInputStream tis = TikaInputStream.get(filePath)) {
@@ -334,7 +334,7 @@ public void parseEmbeddedDocumentsRmeta(Path filePath)
public void parseEmbeddedDocumentsConcatenate(Path filePath)
throws IOException, InterruptedException, TikaException, PipesException {
PipesForkParserConfig config = new PipesForkParserConfig()
- .setParseMode(HandlerConfig.PARSE_MODE.CONCATENATE);
+ .setParseMode(ParseMode.CONCATENATE);
try (PipesForkParser parser = new PipesForkParser(config);
TikaInputStream tis = TikaInputStream.get(filePath)) {
diff --git a/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/java/org/apache/tika/pipes/kafka/tests/TikaPipesKafkaTest.java b/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/java/org/apache/tika/pipes/kafka/tests/TikaPipesKafkaTest.java
index e1b32ceb259..cdfb7391b99 100644
--- a/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/java/org/apache/tika/pipes/kafka/tests/TikaPipesKafkaTest.java
+++ b/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/java/org/apache/tika/pipes/kafka/tests/TikaPipesKafkaTest.java
@@ -64,7 +64,7 @@
import org.apache.tika.cli.TikaCLI;
import org.apache.tika.config.JsonConfigHelper;
-import org.apache.tika.pipes.api.HandlerConfig;
+import org.apache.tika.pipes.api.ParseMode;
import org.apache.tika.utils.SystemUtils;
/**
@@ -220,7 +220,7 @@ private Path getTikaConfig(Path pipesDirectory, Path testFileFolderPath) throws
replacements.put("EMITTER_TOPIC", EMITTER_TOPIC);
replacements.put("BOOTSTRAP_SERVERS", kafka.getBootstrapServers());
replacements.put("FETCHER_BASE_PATH", testFileFolderPath);
- replacements.put("PARSE_MODE", HandlerConfig.PARSE_MODE.RMETA.name());
+ replacements.put("PARSE_MODE", ParseMode.RMETA.name());
replacements.put("LOG4J_JVM_ARG", "-Dlog4j.configurationFile=" + log4jPropFile.toAbsolutePath());
JsonConfigHelper.writeConfigFromResource("/kafka/plugins-template.json",
diff --git a/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/kafka/plugins-template.json b/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/kafka/plugins-template.json
index 7dc28288517..128a1a8b441 100644
--- a/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/kafka/plugins-template.json
+++ b/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/kafka/plugins-template.json
@@ -1,4 +1,12 @@
{
+ "content-handler-factory": {
+ "basic-content-handler-factory": {
+ "type": "TEXT",
+ "writeLimit": -1,
+ "maxEmbeddedResources": -1,
+ "throwOnWriteLimitReached": true
+ }
+ },
"parsers": [
{
"default-parser": {}
@@ -77,23 +85,13 @@
"groupId": "grpid",
"autoOffsetReset": "earliest",
"pollDelayMs": 1000,
- "baseConfig": {
- "fetcherId": "fsf",
- "emitterId": "ke",
- "handlerConfig": {
- "type": "TEXT",
- "parseMode": "PARSE_MODE",
- "writeLimit": -1,
- "maxEmbeddedResources": -1,
- "throwOnWriteLimitReached": true
- },
- "onParseException": "EMIT",
- "maxWaitMs": 600000,
- "queueSize": 10000
- }
+ "fetcherId": "fsf",
+ "emitterId": "ke"
}
},
"pipes": {
+ "parseMode": "PARSE_MODE",
+ "onParseException": "EMIT",
"emitMaxEstimatedBytes": 100000,
"emitWithinMillis": 10,
"numEmitters": 1,
diff --git a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpenSearchTest.java b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpenSearchTest.java
index f105be64acf..ee5145f2840 100644
--- a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpenSearchTest.java
+++ b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpenSearchTest.java
@@ -53,7 +53,7 @@
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.pipes.api.HandlerConfig;
+import org.apache.tika.pipes.api.ParseMode;
import org.apache.tika.pipes.api.emitter.Emitter;
import org.apache.tika.pipes.core.emitter.EmitterManager;
import org.apache.tika.pipes.emitter.opensearch.HttpClientConfig;
@@ -98,7 +98,7 @@ public void testPluginsConfig(@TempDir Path pipesDirectory) throws Exception {
Path pluginsConfg = getPluginsConfig(
pipesDirectory, OpenSearchEmitterConfig.AttachmentStrategy.PARENT_CHILD,
OpenSearchEmitterConfig.UpdateStrategy.OVERWRITE,
- HandlerConfig.PARSE_MODE.RMETA, "https://opensearch", Paths.get("testDocs"));
+ ParseMode.RMETA, "https://opensearch", Paths.get("testDocs"));
// PipesReporter reporter = ReporterManager.load(pluginsConfg);
// System.out.println(reporter);
// PipesIterator pipesIterator = PipesIteratorManager.load(pluginsConfg);
@@ -115,7 +115,7 @@ public void testBasicFSToOpenSearch(@TempDir Path pipesDirectory, @TempDir Path
sendMappings(client, endpoint, TEST_INDEX, "opensearch-mappings.json");
runPipes(client, OpenSearchEmitterConfig.AttachmentStrategy.SEPARATE_DOCUMENTS,
- OpenSearchEmitterConfig.UpdateStrategy.UPSERT, HandlerConfig.PARSE_MODE.CONCATENATE, endpoint,
+ OpenSearchEmitterConfig.UpdateStrategy.UPSERT, ParseMode.CONCATENATE, endpoint,
pipesDirectory, testDocDirectory);
String query = "{ \"track_total_hits\": true, \"query\": { \"match\": { \"content\": { " +
@@ -184,7 +184,7 @@ public void testParentChildFSToOpenSearch(@TempDir Path pipesDirectory, @TempDir
runPipes(client, OpenSearchEmitterConfig.AttachmentStrategy.PARENT_CHILD,
OpenSearchEmitterConfig.UpdateStrategy.OVERWRITE,
- HandlerConfig.PARSE_MODE.RMETA, endpoint, pipesDirectory, testDocDirectory);
+ ParseMode.RMETA, endpoint, pipesDirectory, testDocDirectory);
String query = "{ \"track_total_hits\": true, \"from\":0, \"size\": 10000, \"query\": { \"match\": { \"content\": { " +
"\"query\": \"happiness\" } } } }";
@@ -252,7 +252,7 @@ public void testSeparateDocsFSToOpenSearch(@TempDir Path pipesDirectory, @TempDi
runPipes(client, OpenSearchEmitterConfig.AttachmentStrategy.SEPARATE_DOCUMENTS,
OpenSearchEmitterConfig.UpdateStrategy.OVERWRITE,
- HandlerConfig.PARSE_MODE.RMETA, endpoint,
+ ParseMode.RMETA, endpoint,
pipesDirectory, testDocDirectory);
String query = "{ \"track_total_hits\": true, \"query\": { \"match\": { \"content\": { " +
@@ -318,7 +318,7 @@ public void testUpsertSeparateDocsFSToOpenSearch(@TempDir Path pipesDirectory, @
runPipes(client, OpenSearchEmitterConfig.AttachmentStrategy.SEPARATE_DOCUMENTS,
OpenSearchEmitterConfig.UpdateStrategy.UPSERT,
- HandlerConfig.PARSE_MODE.RMETA, endpoint, pipesDirectory, testDocDirectory);
+ ParseMode.RMETA, endpoint, pipesDirectory, testDocDirectory);
String query = "{ \"track_total_hits\": true, \"query\": { \"match\": { \"content\": { " +
"\"query\": \"happiness\" } } } }";
@@ -378,7 +378,7 @@ public void testUpsert(@TempDir Path pipesDirectory, @TempDir Path testDocDirect
String endpoint = CONTAINER.getHttpHostAddress() + "/" + TEST_INDEX;
sendMappings(client, endpoint, TEST_INDEX, "opensearch-mappings.json");
Path pluginsConfigFile = getPluginsConfig(pipesDirectory, OpenSearchEmitterConfig.AttachmentStrategy.SEPARATE_DOCUMENTS,
- OpenSearchEmitterConfig.UpdateStrategy.UPSERT, HandlerConfig.PARSE_MODE.RMETA,
+ OpenSearchEmitterConfig.UpdateStrategy.UPSERT, ParseMode.RMETA,
endpoint, testDocDirectory);
TikaJsonConfig tikaJsonConfig = TikaJsonConfig.load(pluginsConfigFile);
@@ -450,7 +450,7 @@ protected void sendMappings(OpensearchTestClient client, String endpoint, String
private void runPipes(OpensearchTestClient client, OpenSearchEmitterConfig.AttachmentStrategy attachmentStrategy,
OpenSearchEmitterConfig.UpdateStrategy updateStrategy,
- HandlerConfig.PARSE_MODE parseMode, String endpoint, Path pipesDirectory, Path testDocDirectory) throws Exception {
+ ParseMode parseMode, String endpoint, Path pipesDirectory, Path testDocDirectory) throws Exception {
Path pluginsConfig = getPluginsConfig(pipesDirectory, attachmentStrategy, updateStrategy, parseMode,
endpoint, testDocDirectory);
@@ -466,7 +466,7 @@ private void runPipes(OpensearchTestClient client, OpenSearchEmitterConfig.Attac
@NotNull
private Path getPluginsConfig(Path pipesDirectory, OpenSearchEmitterConfig.AttachmentStrategy attachmentStrategy,
OpenSearchEmitterConfig.UpdateStrategy updateStrategy,
- HandlerConfig.PARSE_MODE parseMode, String endpoint, Path testDocDirectory) throws IOException {
+ ParseMode parseMode, String endpoint, Path testDocDirectory) throws IOException {
Path tikaConfig = pipesDirectory.resolve("plugins-config.json");
Path log4jPropFile = pipesDirectory.resolve("log4j2.xml");
diff --git a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/plugins-template.json b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/plugins-template.json
index 16e2a4fc968..2b4f98f92e3 100644
--- a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/plugins-template.json
+++ b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/plugins-template.json
@@ -1,4 +1,12 @@
{
+ "content-handler-factory": {
+ "basic-content-handler-factory": {
+ "type": "TEXT",
+ "writeLimit": -1,
+ "maxEmbeddedResources": -1,
+ "throwOnWriteLimitReached": true
+ }
+ },
"fetchers": {
"fsf": {
"file-system-fetcher": {
@@ -29,20 +37,8 @@
"file-system-pipes-iterator": {
"basePath": "FETCHER_BASE_PATH",
"countTotal": true,
- "baseConfig": {
- "fetcherId": "fsf",
- "emitterId": "ose",
- "handlerConfig": {
- "type": "TEXT",
- "parseMode": "PARSE_MODE",
- "writeLimit": -1,
- "maxEmbeddedResources": -1,
- "throwOnWriteLimitReached": true
- },
- "onParseException": "EMIT",
- "maxWaitMs": 600000,
- "queueSize": 10000
- }
+ "fetcherId": "fsf",
+ "emitterId": "ose"
}
},
"pipes-reporters": {
@@ -60,6 +56,8 @@
}
},
"pipes": {
+ "parseMode": "PARSE_MODE",
+ "onParseException": "EMIT",
"emitStrategy": {
"type": "DYNAMIC",
"thresholdBytes": 10000
@@ -93,6 +91,5 @@
}
}
],
-
"plugin-roots": "target/plugins"
-}
\ No newline at end of file
+}
diff --git a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/tika-config-opensearch.json b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/tika-config-opensearch.json
index a6a0c512679..172a0c1c0ec 100644
--- a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/tika-config-opensearch.json
+++ b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/tika-config-opensearch.json
@@ -1,4 +1,12 @@
{
+ "content-handler-factory": {
+ "basic-content-handler-factory": {
+ "type": "TEXT",
+ "writeLimit": -1,
+ "maxEmbeddedResources": -1,
+ "throwOnWriteLimitReached": true
+ }
+ },
"parsers": [
{
"default-parser": {}
@@ -70,20 +78,8 @@
"file-system-pipes-iterator": {
"basePath": "FETCHER_BASE_PATH",
"countTotal": true,
- "baseConfig": {
- "fetcherId": "fsf",
- "emitterId": "ose",
- "handlerConfig": {
- "type": "TEXT",
- "parseMode": "PARSE_MODE",
- "writeLimit": -1,
- "maxEmbeddedResources": -1,
- "throwOnWriteLimitReached": true
- },
- "onParseException": "EMIT",
- "maxWaitMs": 600000,
- "queueSize": 10000
- }
+ "fetcherId": "fsf",
+ "emitterId": "ose"
}
},
"pipes-reporters": {
@@ -101,6 +97,8 @@
}
},
"pipes": {
+ "parseMode": "PARSE_MODE",
+ "onParseException": "EMIT",
"emitStrategy": {
"type": "DYNAMIC",
"thresholdBytes": 10000
diff --git a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/S3PipeIntegrationTest.java b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/S3PipeIntegrationTest.java
index 92b3c6b2479..888396343fa 100644
--- a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/S3PipeIntegrationTest.java
+++ b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/S3PipeIntegrationTest.java
@@ -55,6 +55,7 @@
import org.apache.tika.cli.TikaCLI;
import org.apache.tika.config.JsonConfigHelper;
+import org.apache.tika.pipes.api.ParseMode;
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
@Testcontainers(disabledWithoutDocker = true)
@@ -140,7 +141,7 @@ void s3PipelineIteratorS3FetcherAndS3Emitter() throws Exception {
// Create plugins config JSON
Map replacements = new HashMap<>();
replacements.put("LOG4J_JVM_ARG", "-Dlog4j.configurationFile=" + log4jPropFile.toAbsolutePath());
- replacements.put("PARSE_MODE", org.apache.tika.pipes.api.HandlerConfig.PARSE_MODE.RMETA.name());
+ replacements.put("PARSE_MODE", ParseMode.RMETA.name());
replacements.put("PIPE_ITERATOR_BUCKET", FETCH_BUCKET);
replacements.put("EMIT_BUCKET", EMIT_BUCKET);
replacements.put("FETCH_BUCKET", FETCH_BUCKET);
diff --git a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/s3/plugins-template.json b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/s3/plugins-template.json
index 1efc929ce35..816d5c49e58 100644
--- a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/s3/plugins-template.json
+++ b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/s3/plugins-template.json
@@ -1,4 +1,12 @@
{
+ "content-handler-factory": {
+ "basic-content-handler-factory": {
+ "type": "TEXT",
+ "writeLimit": -1,
+ "maxEmbeddedResources": -1,
+ "throwOnWriteLimitReached": true
+ }
+ },
"fetchers": {
"s3f": {
"s3-fetcher": {
@@ -44,23 +52,13 @@
"secretKey": "SECRET_KEY",
"endpointConfigurationService": "ENDPOINT_CONFIGURATION_SERVICE",
"pathStyleAccessEnabled": true,
- "baseConfig": {
- "fetcherId": "s3f",
- "emitterId": "s3e",
- "handlerConfig": {
- "type": "TEXT",
- "parseMode": "PARSE_MODE",
- "writeLimit": -1,
- "maxEmbeddedResources": -1,
- "throwOnWriteLimitReached": true
- },
- "onParseException": "EMIT",
- "maxWaitMs": 600000,
- "queueSize": 10000
- }
+ "fetcherId": "s3f",
+ "emitterId": "s3e"
}
},
"pipes": {
+ "parseMode": "PARSE_MODE",
+ "onParseException": "EMIT",
"emitMaxEstimatedBytes": 100000,
"emitWithinMillis": 10,
"numEmitters": 1,
diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java
index fb195df8562..0fea4b0cd07 100644
--- a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java
+++ b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java
@@ -48,7 +48,7 @@
import org.apache.tika.cli.TikaCLI;
import org.apache.tika.config.JsonConfigHelper;
-import org.apache.tika.pipes.api.HandlerConfig;
+import org.apache.tika.pipes.api.ParseMode;
import org.apache.tika.pipes.emitter.solr.SolrEmitterConfig;
import org.apache.tika.utils.SystemUtils;
@@ -210,7 +210,7 @@ protected void runTikaAsyncSolrPipeIteratorFileFetcherSolrEmitter(Path pipesDire
Path tikaConfigFile = getTikaConfig(pipesDirectory,
SolrEmitterConfig.UpdateStrategy.ADD, SolrEmitterConfig.AttachmentStrategy.PARENT_CHILD,
- HandlerConfig.PARSE_MODE.RMETA);
+ ParseMode.RMETA);
TikaCLI.main(new String[]{"-a", "-c", tikaConfigFile.toAbsolutePath().toString()});
@@ -244,7 +244,7 @@ protected void runTikaAsyncSolrPipeIteratorFileFetcherSolrEmitter(Path pipesDire
tikaConfigFile = getTikaConfig(pipesDirectory,
SolrEmitterConfig.UpdateStrategy.UPDATE_MUST_EXIST,
SolrEmitterConfig.AttachmentStrategy.PARENT_CHILD,
- HandlerConfig.PARSE_MODE.RMETA);
+ ParseMode.RMETA);
TikaCLI.main(new String[]{"-a", "-c", tikaConfigFile.toAbsolutePath().toString()});
@@ -263,7 +263,7 @@ protected void runTikaAsyncSolrPipeIteratorFileFetcherSolrEmitter(Path pipesDire
private Path getTikaConfig(Path pipesDirectory,
SolrEmitterConfig.UpdateStrategy updateStrategy,
SolrEmitterConfig.AttachmentStrategy attachmentStrategy,
- HandlerConfig.PARSE_MODE parseMode) throws IOException {
+ ParseMode parseMode) throws IOException {
Path tikaConfig = pipesDirectory.resolve("plugins-config.json");
Path log4jPropFile = pipesDirectory.resolve("log4j2.xml");
diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/solr/plugins-template.json b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/solr/plugins-template.json
index 366be952746..63cf5d73b50 100644
--- a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/solr/plugins-template.json
+++ b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/solr/plugins-template.json
@@ -1,4 +1,12 @@
{
+ "content-handler-factory": {
+ "basic-content-handler-factory": {
+ "type": "TEXT",
+ "writeLimit": -1,
+ "maxEmbeddedResources": -1,
+ "throwOnWriteLimitReached": true
+ }
+ },
"parsers": [
{
"default-parser": {}
@@ -74,23 +82,13 @@
"rows": 100,
"connectionTimeout": 10000,
"socketTimeout": 60000,
- "baseConfig": {
- "fetcherId": "fsf",
- "emitterId": "se",
- "handlerConfig": {
- "type": "TEXT",
- "parseMode": "PARSE_MODE",
- "writeLimit": -1,
- "maxEmbeddedResources": -1,
- "throwOnWriteLimitReached": true
- },
- "onParseException": "EMIT",
- "maxWaitMs": 600000,
- "queueSize": 10000
- }
+ "fetcherId": "fsf",
+ "emitterId": "se"
}
},
"pipes": {
+ "parseMode": "PARSE_MODE",
+ "onParseException": "EMIT",
"emitStrategy": {
"type": "DYNAMIC",
"thresholdBytes": 10000
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index ffaa2cac246..195da525caa 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1518,7 +1518,7 @@ private Metadata testWriteLimit(String fileName, int limit) throws Exception {
BasicContentHandlerFactory factory = new BasicContentHandlerFactory(
BasicContentHandlerFactory.HANDLER_TYPE.TEXT, limit
);
- ContentHandler contentHandler = factory.getNewContentHandler();
+ ContentHandler contentHandler = factory.createHandler();
Metadata metadata = new Metadata();
ParseContext parseContext = new ParseContext();
try (TikaInputStream tis = getResourceAsStream("/test-documents/" + fileName)) {
diff --git a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
index 6576c904ea2..15586c526cf 100644
--- a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
+++ b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
@@ -37,7 +37,6 @@
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.api.FetchEmitTuple;
-import org.apache.tika.pipes.api.HandlerConfig;
import org.apache.tika.pipes.api.emitter.EmitKey;
import org.apache.tika.pipes.api.fetcher.FetchKey;
import org.apache.tika.pipes.api.pipesiterator.PipesIterator;
@@ -47,6 +46,7 @@
import org.apache.tika.plugins.ExtensionConfig;
import org.apache.tika.plugins.TikaPluginManager;
import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.ContentHandlerFactory;
import org.apache.tika.utils.StringUtils;
public class TikaAsyncCLI {
@@ -290,9 +290,8 @@ private static void configureHandler(FetchEmitTuple t, SimpleAsyncConfig asyncCo
if (asyncConfig.getHandlerType() == BasicContentHandlerFactory.HANDLER_TYPE.TEXT) {
return;
}
- HandlerConfig handlerConfig = new HandlerConfig(asyncConfig.getHandlerType(), HandlerConfig.PARSE_MODE.RMETA,
- -1, -1, false);
- t.getParseContext().set(HandlerConfig.class, handlerConfig);
+ ContentHandlerFactory factory = new BasicContentHandlerFactory(asyncConfig.getHandlerType(), -1);
+ t.getParseContext().set(ContentHandlerFactory.class, factory);
}
private static void configureExtractBytes(FetchEmitTuple t, SimpleAsyncConfig asyncConfig) {
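Editorial note: configureHandler now drops a factory into the tuple's ParseContext instead of a HandlerConfig. The same pattern should work for any per-file handler override; a sketch (the wrapper class and the HTML choice are illustrative):

    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.sax.BasicContentHandlerFactory;
    import org.apache.tika.sax.ContentHandlerFactory;

    class PerFileHandlerOverrideSketch {
        // Ask for HTML output instead of the TEXT default for one particular file.
        static void overrideHandler(ParseContext parseContext) {
            ContentHandlerFactory factory = new BasicContentHandlerFactory(
                    BasicContentHandlerFactory.HANDLER_TYPE.HTML, -1);
            parseContext.set(ContentHandlerFactory.class, factory);
        }
    }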
diff --git a/tika-pipes/tika-async-cli/src/main/resources/config-template.json b/tika-pipes/tika-async-cli/src/main/resources/config-template.json
index e295290dd4b..d4c70d5d731 100644
--- a/tika-pipes/tika-async-cli/src/main/resources/config-template.json
+++ b/tika-pipes/tika-async-cli/src/main/resources/config-template.json
@@ -1,4 +1,12 @@
{
+ "content-handler-factory": {
+ "basic-content-handler-factory": {
+ "type": "TEXT",
+ "writeLimit": -1,
+ "maxEmbeddedResources": -1,
+ "throwOnWriteLimitReached": true
+ }
+ },
"parsers": [
{
"default-parser": {}
@@ -45,21 +53,15 @@
"file-system-pipes-iterator": {
"basePath": "FETCHER_BASE_PATH",
"countTotal": true,
- "baseConfig": {
- "fetcherId": "fsf",
- "emitterId": "fse",
- "handlerConfig": {
- "type": "TEXT",
- "parseMode": "RMETA",
- "writeLimit": -1,
- "maxEmbeddedResources": -1,
- "throwOnWriteLimitReached": true
- },
- "onParseException": "EMIT",
- "maxWaitMs": 600000,
- "queueSize": 10000
- }
+ "fetcherId": "fsf",
+ "emitterId": "fse",
+ "onParseException": "EMIT",
+ "maxWaitMs": 600000,
+ "queueSize": 10000
}
},
+ "pipes": {
+ "parseMode": "RMETA"
+ },
"plugin-roots": "PLUGIN_ROOTS"
}
diff --git a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java
index 4bd181699e6..6d26b6dd0fa 100644
--- a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java
+++ b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java
@@ -17,7 +17,6 @@
package org.apache.tika.async.cli;
-import static org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig.DEFAULT_HANDLER_CONFIG;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -44,7 +43,6 @@
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.api.FetchEmitTuple;
-import org.apache.tika.pipes.api.HandlerConfig;
import org.apache.tika.pipes.api.emitter.EmitKey;
import org.apache.tika.pipes.api.fetcher.FetchKey;
import org.apache.tika.pipes.api.pipesiterator.PipesIterator;
@@ -112,8 +110,6 @@ public void setUp() throws Exception {
@Test
public void testRecursiveUnpacking() throws Exception {
-// TikaAsyncCLI cli = new TikaAsyncCLI();
- // cli.main(new String[]{ configDir.resolve("tika-config.xml").toAbsolutePath().toString()});
AsyncProcessor processor = AsyncProcessor.load(configDir.resolve("tika-config.json"));
EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig = new EmbeddedDocumentBytesConfig(true);
@@ -122,7 +118,6 @@ public void testRecursiveUnpacking() throws Exception {
embeddedDocumentBytesConfig.setSuffixStrategy(EmbeddedDocumentBytesConfig.SUFFIX_STRATEGY.NONE);
embeddedDocumentBytesConfig.setEmbeddedIdPrefix("-");
ParseContext parseContext = new ParseContext();
- parseContext.set(HandlerConfig.class, DEFAULT_HANDLER_CONFIG);
parseContext.set(EmbeddedDocumentBytesConfig.class, embeddedDocumentBytesConfig);
FetchEmitTuple t =
new FetchEmitTuple("myId-1", new FetchKey("fsf", "mock.xml"),
@@ -133,7 +128,6 @@ public void testRecursiveUnpacking() throws Exception {
for (int i = 0; i < 10; i++) {
processor.offer(PipesIterator.COMPLETED_SEMAPHORE, 1000);
}
- //TODO clean this up
while (processor.checkActive()) {
Thread.sleep(100);
}
@@ -161,14 +155,9 @@ public void testRecursiveUnpacking() throws Exception {
@Test
public void testStopsOnApplicationError() throws Exception {
- // Test that AsyncProcessor stops processing when an application error occurs
- // (TIKA-4570)
AsyncProcessor processor = AsyncProcessor.load(configDir.resolve("tika-config.json"));
- // Create a tuple with a non-existent fetcher - this will cause FETCHER_NOT_FOUND
- // which is a TASK_EXCEPTION but will stop processing in CLI mode (default)
ParseContext parseContext = new ParseContext();
- parseContext.set(HandlerConfig.class, DEFAULT_HANDLER_CONFIG);
FetchEmitTuple badTuple = new FetchEmitTuple(
"bad-tuple-1",
new FetchKey("non-existent-fetcher", "some-file.txt"),
@@ -177,10 +166,8 @@ public void testStopsOnApplicationError() throws Exception {
parseContext,
FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT);
- // Offer the bad tuple
processor.offer(badTuple, 1000);
- // Wait for the error to be detected
int maxWaitMs = 30000;
int waited = 0;
while (!processor.hasApplicationError() && waited < maxWaitMs) {
@@ -188,11 +175,9 @@ public void testStopsOnApplicationError() throws Exception {
waited += 100;
}
- // Verify that the application error was detected
assertTrue(processor.hasApplicationError(),
"AsyncProcessor should detect application error from bad fetcher");
- // Verify that subsequent offers throw PipesException
FetchEmitTuple anotherTuple = new FetchEmitTuple(
"another-tuple",
new FetchKey("fsf", "mock.xml"),
diff --git a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/HandlerConfig.java b/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/HandlerConfig.java
deleted file mode 100644
index b336f1a4fcc..00000000000
--- a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/HandlerConfig.java
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.pipes.api;
-
-import java.io.Serializable;
-import java.util.Locale;
-import java.util.Objects;
-
-import org.apache.tika.sax.BasicContentHandlerFactory;
-
-/**
- * Configuration for content handler behavior during parsing.
- */
-public class HandlerConfig implements Serializable {
-
- /**
- * {@link PARSE_MODE#RMETA} "recursive metadata" is the same as the -J option
- * in tika-app and the /rmeta endpoint in tika-server. Each embedded file is represented as
- * its own metadata object.
- *
- * {@link PARSE_MODE#CONCATENATE} is similar
- * to the legacy tika-app behavior and the /tika endpoint (accept: application/json) in
- * tika-server. This concatenates the
- * contents of embedded files and returns a single metadata object for the file no
- * matter how many embedded objects there are; this option throws away metadata from
- * embedded objects and silently skips exceptions in embedded objects.
- */
- public enum PARSE_MODE {
- RMETA,
- CONCATENATE;
-
- public static PARSE_MODE parseMode(String modeString) {
- for (PARSE_MODE m : PARSE_MODE.values()) {
- if (m.name().equalsIgnoreCase(modeString)) {
- return m;
- }
- }
- StringBuilder sb = new StringBuilder();
- int i = 0;
- for (PARSE_MODE m : PARSE_MODE.values()) {
- if (i++ > 0) {
- sb.append(", ");
- }
- sb.append(m.name().toLowerCase(Locale.US));
- }
- throw new IllegalArgumentException("mode must be one of: (" + sb +
- "). I regret I do not understand: " + modeString);
- }
- }
- BasicContentHandlerFactory.HANDLER_TYPE type = BasicContentHandlerFactory.HANDLER_TYPE.TEXT;
- PARSE_MODE parseMode = PARSE_MODE.RMETA;
- int writeLimit = -1;
- int maxEmbeddedResources = -1;
- boolean throwOnWriteLimitReached = true;
-
- public HandlerConfig() {
-
- }
-
- public HandlerConfig(BasicContentHandlerFactory.HANDLER_TYPE type, PARSE_MODE parseMode, int writeLimit, int maxEmbeddedResources, boolean throwOnWriteLimitReached) {
- this.type = type;
- this.parseMode = parseMode;
- this.writeLimit = writeLimit;
- this.maxEmbeddedResources = maxEmbeddedResources;
- this.throwOnWriteLimitReached = throwOnWriteLimitReached;
- }
-
- public BasicContentHandlerFactory.HANDLER_TYPE getType() {
- return type;
- }
-
- public void setType(BasicContentHandlerFactory.HANDLER_TYPE type) {
- this.type = type;
- }
-
- public void setType(String typeString) {
- this.type = BasicContentHandlerFactory.HANDLER_TYPE.valueOf(typeString);
- }
-
- public PARSE_MODE getParseMode() {
- return parseMode;
- }
-
- public void setParseMode(PARSE_MODE parseMode) {
- this.parseMode = parseMode;
- }
-
- public void setParseMode(String parseMode) {
- this.parseMode = PARSE_MODE.valueOf(parseMode);
- }
-
- public int getWriteLimit() {
- return writeLimit;
- }
-
- public void setWriteLimit(int writeLimit) {
- this.writeLimit = writeLimit;
- }
-
- public int getMaxEmbeddedResources() {
- return maxEmbeddedResources;
- }
-
- public void setMaxEmbeddedResources(int maxEmbeddedResources) {
- this.maxEmbeddedResources = maxEmbeddedResources;
- }
-
- public boolean isThrowOnWriteLimitReached() {
- return throwOnWriteLimitReached;
- }
-
- public void setThrowOnWriteLimitReached(boolean throwOnWriteLimitReached) {
- this.throwOnWriteLimitReached = throwOnWriteLimitReached;
- }
-
- @Override
- public final boolean equals(Object o) {
- if (!(o instanceof HandlerConfig that)) {
- return false;
- }
-
- return writeLimit == that.writeLimit && maxEmbeddedResources == that.maxEmbeddedResources && throwOnWriteLimitReached == that.throwOnWriteLimitReached &&
- type == that.type && parseMode == that.parseMode;
- }
-
- @Override
- public int hashCode() {
- int result = Objects.hashCode(type);
- result = 31 * result + Objects.hashCode(parseMode);
- result = 31 * result + writeLimit;
- result = 31 * result + maxEmbeddedResources;
- result = 31 * result + Boolean.hashCode(throwOnWriteLimitReached);
- return result;
- }
-}
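For reference, the removed HandlerConfig collapses into two independent ParseContext entries plus the ContentHandlerFactory configured at the top level of tika-config.json. A minimal migration sketch, using only classes that appear elsewhere in this diff (values are illustrative):

    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.pipes.api.ParseMode;
    import org.apache.tika.sax.BasicContentHandlerFactory;
    import org.apache.tika.sax.ContentHandlerFactory;

    // Before: parseContext.set(HandlerConfig.class,
    //     new HandlerConfig(HANDLER_TYPE.XML, PARSE_MODE.CONCATENATE, 10000, 10, true));
    // After: the handler settings and the parse mode travel separately.
    ParseContext parseContext = new ParseContext();
    parseContext.set(ContentHandlerFactory.class,
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, 10000));
    parseContext.set(ParseMode.class, ParseMode.CONCATENATE);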
diff --git a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/ParseMode.java b/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/ParseMode.java
new file mode 100644
index 00000000000..edd82729dad
--- /dev/null
+++ b/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/ParseMode.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.api;
+
+import java.util.Locale;
+
+/**
+ * Controls how embedded documents are handled during parsing.
+ *
+ * This can be set as a default in PipesConfig (loaded from tika-config.json)
+ * or overridden per-file via ParseContext.
+ */
+public enum ParseMode {
+
+ /**
+ * Each embedded file gets its own metadata object in a list.
+ *
+ * This is equivalent to the -J option in tika-app and the /rmeta endpoint
+ * in tika-server. The result is a list of metadata objects, one for each
+ * document (container + all embedded documents).
+ */
+ RMETA,
+
+ /**
+ * Concatenates content from all embedded files into a single document.
+ *
+ * This is equivalent to the legacy tika-app behavior and the /tika endpoint
+ * in tika-server. The result is a single metadata object with concatenated
+ * content from all documents.
+ */
+ CONCATENATE;
+
+ /**
+ * Parses a string to a ParseMode enum value.
+ *
+ * @param modeString the string to parse (case-insensitive)
+ * @return the corresponding ParseMode
+ * @throws IllegalArgumentException if the string doesn't match any mode
+ */
+ public static ParseMode parse(String modeString) {
+ if (modeString == null) {
+ throw new IllegalArgumentException("Parse mode cannot be null");
+ }
+ String normalized = modeString.toUpperCase(Locale.ROOT).trim();
+ try {
+ return ParseMode.valueOf(normalized);
+ } catch (IllegalArgumentException e) {
+ throw new IllegalArgumentException(
+ "Invalid parse mode: '" + modeString + "'. " +
+ "Must be one of: RMETA, CONCATENATE");
+ }
+ }
+}
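A quick usage sketch for the lenient parse() helper above; the behavior follows directly from the implementation (case-insensitive, whitespace-trimmed, exception otherwise):

    ParseMode a = ParseMode.parse("rmeta");          // -> RMETA (case-insensitive)
    ParseMode b = ParseMode.parse(" CONCATENATE ");  // -> CONCATENATE (whitespace trimmed)
    try {
        ParseMode.parse("bogus");
    } catch (IllegalArgumentException expected) {
        // "Invalid parse mode: 'bogus'. Must be one of: RMETA, CONCATENATE"
    }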
diff --git a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/pipesiterator/PipesIteratorBaseConfig.java b/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/pipesiterator/PipesIteratorBaseConfig.java
deleted file mode 100644
index 021d62e400a..00000000000
--- a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/pipesiterator/PipesIteratorBaseConfig.java
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.pipes.api.pipesiterator;
-
-import org.apache.tika.pipes.api.FetchEmitTuple;
-import org.apache.tika.pipes.api.HandlerConfig;
-import org.apache.tika.sax.BasicContentHandlerFactory;
-
-
-public record PipesIteratorBaseConfig(String fetcherId, String emitterId, HandlerConfig handlerConfig,
- FetchEmitTuple.ON_PARSE_EXCEPTION onParseException, long maxWaitMs, int queueSize) {
-
- public static final HandlerConfig DEFAULT_HANDLER_CONFIG = new HandlerConfig(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, HandlerConfig.PARSE_MODE.RMETA,
- -1, -1, true);
- private static final FetchEmitTuple.ON_PARSE_EXCEPTION DEFAULT_ON_PARSE_EXCEPTION = FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT;
- private static final long DEFAULT_MAX_WAIT_MS = 600_000;
- private static final int DEFAULT_QUEUE_SIZE = 10000;
-
- public PipesIteratorBaseConfig(String fetcherId, String emitterId) {
- this(fetcherId, emitterId, DEFAULT_HANDLER_CONFIG, DEFAULT_ON_PARSE_EXCEPTION, DEFAULT_MAX_WAIT_MS, DEFAULT_QUEUE_SIZE);
- }
-
-}
diff --git a/tika-pipes/tika-pipes-api/src/main/resources/META-INF/tika/other-configs.idx b/tika-pipes/tika-pipes-api/src/main/resources/META-INF/tika/other-configs.idx
deleted file mode 100644
index 4b66482790f..00000000000
--- a/tika-pipes/tika-pipes-api/src/main/resources/META-INF/tika/other-configs.idx
+++ /dev/null
@@ -1,20 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Component registry for tika-pipes-api
-# Format: friendly-name=fully.qualified.ClassName
-# this has to be manually generated for now because of the dependency graph
-
-handler-config=org.apache.tika.pipes.api.HandlerConfig
diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PipesConfig.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PipesConfig.java
index 8daae166ff0..74cd509a0a7 100644
--- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PipesConfig.java
+++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PipesConfig.java
@@ -21,6 +21,8 @@
import org.apache.tika.config.loader.TikaJsonConfig;
import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.pipes.api.FetchEmitTuple;
+import org.apache.tika.pipes.api.ParseMode;
public class PipesConfig {
@@ -85,6 +87,17 @@ public class PipesConfig {
*/
private boolean stopOnlyOnFatal = false;
+ /**
+ * Default parse mode for how embedded documents are handled.
+ * Can be overridden per-file via ParseContext.
+ */
+ private ParseMode parseMode = ParseMode.RMETA;
+
+ /**
+ * Default behavior when a parse exception occurs.
+ */
+ private FetchEmitTuple.ON_PARSE_EXCEPTION onParseException = FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT;
+
private ArrayList<String> forkedJvmArgs = new ArrayList<>();
private String javaPath = "java";
@@ -361,6 +374,52 @@ public void setStopOnlyOnFatal(boolean stopOnlyOnFatal) {
this.stopOnlyOnFatal = stopOnlyOnFatal;
}
+ /**
+ * Gets the default parse mode for how embedded documents are handled.
+ *
+ * @return the default parse mode
+ */
+ public ParseMode getParseMode() {
+ return parseMode;
+ }
+
+ /**
+ * Sets the default parse mode for how embedded documents are handled.
+ * This can be overridden per-file via ParseContext.
+ *
+ * @param parseMode the parse mode (RMETA or CONCATENATE)
+ */
+ public void setParseMode(ParseMode parseMode) {
+ this.parseMode = parseMode;
+ }
+
+ /**
+ * Sets the default parse mode from a string.
+ *
+ * @param parseMode the parse mode name (rmeta or concatenate)
+ */
+ public void setParseMode(String parseMode) {
+ this.parseMode = ParseMode.parse(parseMode);
+ }
+
+ /**
+ * Gets the default behavior when a parse exception occurs.
+ *
+ * @return the parse exception behavior
+ */
+ public FetchEmitTuple.ON_PARSE_EXCEPTION getOnParseException() {
+ return onParseException;
+ }
+
+ /**
+ * Sets the default behavior when a parse exception occurs.
+ *
+ * @param onParseException the parse exception behavior
+ */
+ public void setOnParseException(FetchEmitTuple.ON_PARSE_EXCEPTION onParseException) {
+ this.onParseException = onParseException;
+ }
+
public String getConfigStoreType() {
return configStoreType;
}
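These two defaults correspond to the parseMode and onParseException keys that the reworked JSON configs later in this diff place directly under "pipes". An equivalent programmatic sketch (illustrative only; the String overload delegates to ParseMode.parse()):

    PipesConfig pipesConfig = new PipesConfig();
    pipesConfig.setParseMode("rmeta");
    pipesConfig.setOnParseException(FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT);
    // Per-file overrides placed in the ParseContext still take precedence (see ParseHandler below).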
diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTuple.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTuple.java
index cdf89be42d8..278410e7dbf 100644
--- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTuple.java
+++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTuple.java
@@ -24,25 +24,22 @@
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.module.SimpleModule;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ParseContext;
+import org.apache.tika.config.loader.TikaObjectMapperFactory;
import org.apache.tika.pipes.api.FetchEmitTuple;
-import org.apache.tika.serialization.serdes.MetadataSerializer;
-import org.apache.tika.serialization.serdes.ParseContextDeserializer;
-import org.apache.tika.serialization.serdes.ParseContextSerializer;
public class JsonFetchEmitTuple {
- private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+ private static final ObjectMapper OBJECT_MAPPER;
static {
- SimpleModule module = new SimpleModule();
- module.addDeserializer(FetchEmitTuple.class, new FetchEmitTupleDeserializer());
- module.addSerializer(FetchEmitTuple.class, new FetchEmitTupleSerializer());
- module.addSerializer(Metadata.class, new MetadataSerializer());
- module.addSerializer(ParseContext.class, new ParseContextSerializer());
- module.addDeserializer(ParseContext.class, new ParseContextDeserializer());
- OBJECT_MAPPER.registerModule(module);
+ // Use TikaObjectMapperFactory which provides TikaModule with Metadata/ParseContext serializers
+ OBJECT_MAPPER = TikaObjectMapperFactory.createMapper();
+
+ // Add FetchEmitTuple-specific serializers
+ SimpleModule fetchEmitModule = new SimpleModule();
+ fetchEmitModule.addDeserializer(FetchEmitTuple.class, new FetchEmitTupleDeserializer());
+ fetchEmitModule.addSerializer(FetchEmitTuple.class, new FetchEmitTupleSerializer());
+ OBJECT_MAPPER.registerModule(fetchEmitModule);
}
public static FetchEmitTuple fromJson(Reader reader) throws IOException {
diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java
index a395677c96a..af3e75f50af 100644
--- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java
+++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java
@@ -44,7 +44,7 @@
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.pipes.api.FetchEmitTuple;
-import org.apache.tika.pipes.api.HandlerConfig;
+import org.apache.tika.pipes.api.ParseMode;
import org.apache.tika.pipes.core.extractor.EmbeddedDocumentBytesConfig;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.ContentHandlerFactory;
@@ -61,17 +61,22 @@ class ParseHandler {
private final CountDownLatch countDownLatch;
private final AutoDetectParser autoDetectParser;
private final RecursiveParserWrapper recursiveParserWrapper;
+ private final ContentHandlerFactory defaultContentHandlerFactory;
+ private final ParseMode defaultParseMode;
ParseHandler(Detector detector, Digester digester, ArrayBlockingQueue intermediateResult,
CountDownLatch countDownLatch, AutoDetectParser autoDetectParser,
- RecursiveParserWrapper recursiveParserWrapper) {
+ RecursiveParserWrapper recursiveParserWrapper, ContentHandlerFactory defaultContentHandlerFactory,
+ ParseMode defaultParseMode) {
this.detector = detector;
this.digester = digester;
this.intermediateResult = intermediateResult;
this.countDownLatch = countDownLatch;
this.autoDetectParser = autoDetectParser;
this.recursiveParserWrapper = recursiveParserWrapper;
+ this.defaultContentHandlerFactory = defaultContentHandlerFactory;
+ this.defaultParseMode = defaultParseMode;
}
PipesWorker.ParseDataOrPipesResult parseWithStream(FetchEmitTuple fetchEmitTuple, TikaInputStream stream, Metadata metadata, ParseContext parseContext)
@@ -79,12 +84,13 @@ PipesWorker.ParseDataOrPipesResult parseWithStream(FetchEmitTuple fetchEmitTuple
List<Metadata> metadataList;
//this adds the EmbeddedDocumentByteStore to the parsecontext
- HandlerConfig handlerConfig = parseContext.get(HandlerConfig.class);
- if (handlerConfig.getParseMode() == HandlerConfig.PARSE_MODE.RMETA) {
+ ParseMode parseMode = getParseMode(parseContext);
+ ContentHandlerFactory contentHandlerFactory = getContentHandlerFactory(parseContext);
+ if (parseMode == ParseMode.RMETA) {
metadataList =
- parseRecursive(fetchEmitTuple, handlerConfig, stream, metadata, parseContext);
+ parseRecursive(fetchEmitTuple, contentHandlerFactory, stream, metadata, parseContext);
} else {
- metadataList = parseConcatenated(fetchEmitTuple, handlerConfig, stream, metadata,
+ metadataList = parseConcatenated(fetchEmitTuple, contentHandlerFactory, stream, metadata,
parseContext);
}
@@ -92,6 +98,24 @@ PipesWorker.ParseDataOrPipesResult parseWithStream(FetchEmitTuple fetchEmitTuple
parseContext.get(EmbeddedDocumentBytesHandler.class)), null);
}
+ private ParseMode getParseMode(ParseContext parseContext) {
+ ParseMode mode = parseContext.get(ParseMode.class);
+ if (mode != null) {
+ return mode;
+ }
+ // Fall back to default loaded from TikaLoader
+ return defaultParseMode;
+ }
+
+ private ContentHandlerFactory getContentHandlerFactory(ParseContext parseContext) {
+ ContentHandlerFactory factory = parseContext.get(ContentHandlerFactory.class);
+ if (factory != null) {
+ return factory;
+ }
+ // Fall back to default loaded from TikaLoader
+ return defaultContentHandlerFactory;
+ }
+
private void _preParse(FetchEmitTuple t, TikaInputStream tis, Metadata metadata,
@@ -133,14 +157,16 @@ private Metadata preParse(FetchEmitTuple t, TikaInputStream tis, Metadata metada
}
public List<Metadata> parseRecursive(FetchEmitTuple fetchEmitTuple,
- HandlerConfig handlerConfig, TikaInputStream stream,
+ ContentHandlerFactory contentHandlerFactory, TikaInputStream stream,
Metadata metadata, ParseContext parseContext) throws InterruptedException {
//Intentionally do not add the metadata filter here!
//We need to let stacktraces percolate
+ int maxEmbeddedResources = -1;
+ if (contentHandlerFactory instanceof BasicContentHandlerFactory) {
+ maxEmbeddedResources = ((BasicContentHandlerFactory) contentHandlerFactory).getMaxEmbeddedResources();
+ }
RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
- new BasicContentHandlerFactory(handlerConfig.getType(),
- handlerConfig.getWriteLimit(), handlerConfig.isThrowOnWriteLimitReached(),
- parseContext), handlerConfig.getMaxEmbeddedResources());
+ contentHandlerFactory, maxEmbeddedResources);
long start = System.currentTimeMillis();
@@ -168,25 +194,24 @@ public List parseRecursive(FetchEmitTuple fetchEmitTuple,
}
public List<Metadata> parseConcatenated(FetchEmitTuple fetchEmitTuple,
- HandlerConfig handlerConfig, TikaInputStream stream,
+ ContentHandlerFactory contentHandlerFactory, TikaInputStream stream,
Metadata metadata, ParseContext parseContext) throws InterruptedException {
- ContentHandlerFactory contentHandlerFactory =
- new BasicContentHandlerFactory(handlerConfig.getType(),
- handlerConfig.getWriteLimit(), handlerConfig.isThrowOnWriteLimitReached(),
- parseContext);
-
- ContentHandler handler = contentHandlerFactory.getNewContentHandler();
+ ContentHandler handler = contentHandlerFactory.createHandler();
+ int maxEmbedded = -1;
+ if (contentHandlerFactory instanceof BasicContentHandlerFactory) {
+ maxEmbedded = ((BasicContentHandlerFactory) contentHandlerFactory).getMaxEmbeddedResources();
+ }
+ final int finalMaxEmbedded = maxEmbedded;
parseContext.set(DocumentSelector.class, new DocumentSelector() {
- final int maxEmbedded = handlerConfig.getMaxEmbeddedResources();
int embedded = 0;
@Override
public boolean select(Metadata metadata) {
- if (maxEmbedded < 0) {
+ if (finalMaxEmbedded < 0) {
return true;
}
- return embedded++ < maxEmbedded;
+ return embedded++ < finalMaxEmbedded;
}
});
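The two fallback helpers above give per-file overrides precedence over the server-wide defaults loaded via TikaLoader/PipesConfig. A caller-side sketch, using only API visible in this diff (ids and paths are placeholders):

    ParseContext perFileContext = new ParseContext();
    perFileContext.set(ContentHandlerFactory.class,
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.HTML, 100_000));
    perFileContext.set(ParseMode.class, ParseMode.CONCATENATE);

    FetchEmitTuple tuple = new FetchEmitTuple("doc-1",
            new FetchKey("fsf", "docs/report.pdf"),
            new EmitKey("fse", "docs/report.pdf"),
            new Metadata(), perFileContext,
            FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT);
    // With neither entry set, ParseHandler falls back to defaultContentHandlerFactory/defaultParseMode.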
diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java
index 8a90b4d89c3..8c04a110c05 100644
--- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java
+++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java
@@ -77,6 +77,7 @@
import org.apache.tika.pipes.core.fetcher.FetcherManager;
import org.apache.tika.plugins.ExtensionConfig;
import org.apache.tika.plugins.TikaPluginManager;
+import org.apache.tika.sax.ContentHandlerFactory;
import org.apache.tika.serialization.ParseContextUtils;
import org.apache.tika.utils.ExceptionUtils;
@@ -150,6 +151,7 @@ public byte getByte() {
private final PipesConfig pipesConfig;
private final Socket socket;
private final MetadataFilter defaultMetadataFilter;
+ private final ContentHandlerFactory defaultContentHandlerFactory;
private AutoDetectParser autoDetectParser;
private RecursiveParserWrapper rMetaParser;
private FetcherManager fetcherManager;
@@ -176,7 +178,8 @@ public static PipesServer load(int port, Path tikaConfigPath) throws Exception {
socket.setSoTimeout((int) pipesConfig.getSocketTimeoutMs());
MetadataFilter metadataFilter = tikaLoader.loadMetadataFilters();
- PipesServer pipesServer = new PipesServer(pipesClientId, tikaLoader, pipesConfig, socket, dis, dos, metadataFilter);
+ ContentHandlerFactory contentHandlerFactory = tikaLoader.loadContentHandlerFactory();
+ PipesServer pipesServer = new PipesServer(pipesClientId, tikaLoader, pipesConfig, socket, dis, dos, metadataFilter, contentHandlerFactory);
pipesServer.initializeResources();
LOG.debug("pipesClientId={}: PipesServer loaded and ready", pipesClientId);
return pipesServer;
@@ -209,7 +212,7 @@ public static PipesServer load(int port, Path tikaConfigPath) throws Exception {
}
public PipesServer(String pipesClientId, TikaLoader tikaLoader, PipesConfig pipesConfig, Socket socket, DataInputStream in,
- DataOutputStream out, MetadataFilter metadataFilter) throws TikaConfigException,
+ DataOutputStream out, MetadataFilter metadataFilter, ContentHandlerFactory contentHandlerFactory) throws TikaConfigException,
IOException {
this.pipesClientId = pipesClientId;
@@ -217,6 +220,7 @@ public PipesServer(String pipesClientId, TikaLoader tikaLoader, PipesConfig pipe
this.pipesConfig = pipesConfig;
this.socket = socket;
this.defaultMetadataFilter = metadataFilter;
+ this.defaultContentHandlerFactory = contentHandlerFactory;
this.input = new DataInputStream(in);
this.output = new DataOutputStream(out);
this.heartbeatIntervalMs = pipesConfig.getHeartbeatIntervalMs();
@@ -357,7 +361,8 @@ public void mainLoop() {
private PipesWorker getPipesWorker(ArrayBlockingQueue intermediateResult, FetchEmitTuple fetchEmitTuple, CountDownLatch countDownLatch) {
FetchHandler fetchHandler = new FetchHandler(fetcherManager);
- ParseHandler parseHandler = new ParseHandler(detector, digester, intermediateResult, countDownLatch, autoDetectParser, rMetaParser);
+ ParseHandler parseHandler = new ParseHandler(detector, digester, intermediateResult, countDownLatch, autoDetectParser,
+ rMetaParser, defaultContentHandlerFactory, pipesConfig.getParseMode());
Long thresholdBytes = pipesConfig.getEmitStrategy().getThresholdBytes();
long threshold = (thresholdBytes != null) ? thresholdBytes : EmitStrategyConfig.DEFAULT_DIRECT_EMIT_THRESHOLD_BYTES;
EmitHandler emitHandler = new EmitHandler(defaultMetadataFilter, emitStrategy, emitterManager, threshold);
diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java
index 8d15c92a0a0..b7793881274 100644
--- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java
+++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java
@@ -16,8 +16,6 @@
*/
package org.apache.tika.pipes.core.server;
-import static org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig.DEFAULT_HANDLER_CONFIG;
-
import java.io.Closeable;
import java.io.IOException;
import java.time.Duration;
@@ -41,7 +39,6 @@
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.api.FetchEmitTuple;
-import org.apache.tika.pipes.api.HandlerConfig;
import org.apache.tika.pipes.api.PipesResult;
import org.apache.tika.pipes.core.PipesResults;
import org.apache.tika.pipes.core.emitter.EmitterManager;
@@ -149,9 +146,8 @@ protected ParseDataOrPipesResult parseFromTuple() throws TikaException, Interrup
private ParseContext setupParseContext(FetchEmitTuple fetchEmitTuple) throws TikaException, IOException {
ParseContext parseContext = fetchEmitTuple.getParseContext();
- if (parseContext.get(HandlerConfig.class) == null) {
- parseContext.set(HandlerConfig.class, DEFAULT_HANDLER_CONFIG);
- }
+ // ContentHandlerFactory and ParseMode are retrieved from ParseContext in ParseHandler.
+ // They are set in ParseContext from PipesConfig loaded via TikaLoader at startup.
EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig = parseContext.get(EmbeddedDocumentBytesConfig.class);
if (embeddedDocumentBytesConfig == null) {
//make sure there's one here -- or do we make this default in fetchemit tuple?
diff --git a/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTupleTest.java b/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTupleTest.java
index 4168d37a6f2..1650e7d00ad 100644
--- a/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTupleTest.java
+++ b/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTupleTest.java
@@ -27,10 +27,11 @@
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.api.FetchEmitTuple;
-import org.apache.tika.pipes.api.HandlerConfig;
+import org.apache.tika.pipes.api.ParseMode;
import org.apache.tika.pipes.api.emitter.EmitKey;
import org.apache.tika.pipes.api.fetcher.FetchKey;
import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.ContentHandlerFactory;
public class JsonFetchEmitTupleTest {
@@ -45,8 +46,11 @@ public void testBasic() throws Exception {
ParseContext parseContext = new ParseContext();
- HandlerConfig h = new HandlerConfig(BasicContentHandlerFactory.HANDLER_TYPE.XML, HandlerConfig.PARSE_MODE.CONCATENATE, 10000, 10, true);
- parseContext.set(HandlerConfig.class, h);
+ // Set ContentHandlerFactory and ParseMode in ParseContext
+ ContentHandlerFactory factory = new BasicContentHandlerFactory(
+ BasicContentHandlerFactory.HANDLER_TYPE.XML, 10000);
+ parseContext.set(ContentHandlerFactory.class, factory);
+ parseContext.set(ParseMode.class, ParseMode.CONCATENATE);
FetchEmitTuple t = new FetchEmitTuple("my_id", new FetchKey("my_fetcher", "fetchKey1"), new EmitKey("my_emitter", "emitKey1"), m, parseContext,
FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP);
@@ -66,12 +70,10 @@ public void testFetchRange() throws Exception {
m.add("m2", "v3");
m.add("m3", "v4");
- /**
- * TODO -- add this to the ParseContext
- * new HandlerConfig(BasicContentHandlerFactory.HANDLER_TYPE.XML,
- * HandlerConfig.PARSE_MODE.CONCATENATE,
- * 10000,10, true),
- */
+ // TODO -- add this to the ParseContext:
+ // parseContext.set(ContentHandlerFactory.class, new BasicContentHandlerFactory(
+ // BasicContentHandlerFactory.HANDLER_TYPE.XML, 10000));
+ // parseContext.set(ParseMode.class, ParseMode.CONCATENATE);
FetchEmitTuple t = new FetchEmitTuple("my_id", new FetchKey("my_fetcher", "fetchKey1", 10, 1000), new EmitKey("my_emitter", "emitKey1"), m, new ParseContext(),
FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP);
StringWriter writer = new StringWriter();
@@ -83,14 +85,12 @@ public void testFetchRange() throws Exception {
@Test
public void testBytes() throws Exception {
- /**
- * TODO -- add these to the ParseContext
- EmbeddedDocumentBytesConfig bytesConfig = new EmbeddedDocumentBytesConfig(true);
- bytesConfig.setEmitter("emitter");
- * new HandlerConfig(BasicContentHandlerFactory.HANDLER_TYPE.XML,
- * HandlerConfig.PARSE_MODE.CONCATENATE,
- * 10000,10, true)
- */
+ // TODO -- add these to the ParseContext:
+ // EmbeddedDocumentBytesConfig bytesConfig = new EmbeddedDocumentBytesConfig(true);
+ // bytesConfig.setEmitter("emitter");
+ // parseContext.set(ContentHandlerFactory.class, new BasicContentHandlerFactory(
+ // BasicContentHandlerFactory.HANDLER_TYPE.XML, 10000));
+ // parseContext.set(ParseMode.class, ParseMode.CONCATENATE);
FetchEmitTuple t = new FetchEmitTuple("my_id", new FetchKey("my_fetcher", "fetchKey1", 10, 1000), new EmitKey("my_emitter", "emitKey1"), new Metadata(), new ParseContext(),
FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP);
StringWriter writer = new StringWriter();
diff --git a/tika-pipes/tika-pipes-fork-parser/src/main/java/org/apache/tika/pipes/fork/PipesForkParser.java b/tika-pipes/tika-pipes-fork-parser/src/main/java/org/apache/tika/pipes/fork/PipesForkParser.java
index 0420596d58c..cfb9251e30c 100644
--- a/tika-pipes/tika-pipes-fork-parser/src/main/java/org/apache/tika/pipes/fork/PipesForkParser.java
+++ b/tika-pipes/tika-pipes-fork-parser/src/main/java/org/apache/tika/pipes/fork/PipesForkParser.java
@@ -33,13 +33,14 @@
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.api.FetchEmitTuple;
-import org.apache.tika.pipes.api.HandlerConfig;
+import org.apache.tika.pipes.api.ParseMode;
import org.apache.tika.pipes.api.PipesResult;
import org.apache.tika.pipes.api.emitter.EmitKey;
import org.apache.tika.pipes.api.fetcher.FetchKey;
import org.apache.tika.pipes.core.PipesConfig;
import org.apache.tika.pipes.core.PipesException;
import org.apache.tika.pipes.core.PipesParser;
+import org.apache.tika.sax.ContentHandlerFactory;
/**
* A ForkParser implementation backed by {@link PipesParser}.
@@ -86,7 +87,8 @@
* Example usage:
*
* PipesForkParserConfig config = new PipesForkParserConfig();
- * config.setHandlerConfig(new HandlerConfig(HANDLER_TYPE.TEXT, PARSE_MODE.RMETA, -1, -1, true));
+ * config.setHandlerType(HANDLER_TYPE.TEXT);
+ * config.setParseMode(ParseMode.RMETA);
*
* try (PipesForkParser parser = new PipesForkParser(config)) {
* // Parse from a file
@@ -204,8 +206,9 @@ public PipesForkResult parse(TikaInputStream tis, Metadata metadata, ParseContex
FetchKey fetchKey = new FetchKey(config.getFetcherName(), absolutePath);
EmitKey emitKey = new EmitKey("", id); // Empty emitter name since we're using PASSBACK_ALL
- // Add handler config to parse context so server knows how to handle content
- parseContext.set(HandlerConfig.class, config.getHandlerConfig());
+ // Add content handler factory and parse mode to parse context
+ parseContext.set(ContentHandlerFactory.class, config.getContentHandlerFactory());
+ parseContext.set(ParseMode.class, config.getParseMode());
FetchEmitTuple tuple = new FetchEmitTuple(id, fetchKey, emitKey, metadata, parseContext);
diff --git a/tika-pipes/tika-pipes-fork-parser/src/main/java/org/apache/tika/pipes/fork/PipesForkParserConfig.java b/tika-pipes/tika-pipes-fork-parser/src/main/java/org/apache/tika/pipes/fork/PipesForkParserConfig.java
index 8ffa0b555f1..467a2189730 100644
--- a/tika-pipes/tika-pipes-fork-parser/src/main/java/org/apache/tika/pipes/fork/PipesForkParserConfig.java
+++ b/tika-pipes/tika-pipes-fork-parser/src/main/java/org/apache/tika/pipes/fork/PipesForkParserConfig.java
@@ -20,9 +20,10 @@
import java.util.ArrayList;
import java.util.List;
-import org.apache.tika.pipes.api.HandlerConfig;
+import org.apache.tika.pipes.api.ParseMode;
import org.apache.tika.pipes.core.PipesConfig;
import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.ContentHandlerFactory;
/**
* Configuration for {@link PipesForkParser}.
@@ -33,13 +34,15 @@
public class PipesForkParserConfig {
private final PipesConfig pipesConfig;
- private HandlerConfig handlerConfig;
+ private ContentHandlerFactory contentHandlerFactory;
+ private ParseMode parseMode = ParseMode.RMETA;
private String fetcherName = PipesForkParser.DEFAULT_FETCHER_NAME;
private Path pluginsDir;
public PipesForkParserConfig() {
this.pipesConfig = new PipesConfig();
- this.handlerConfig = new HandlerConfig();
+ this.contentHandlerFactory = new BasicContentHandlerFactory(
+ BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1);
// Default to single client for simple fork parser use case
this.pipesConfig.setNumClients(1);
}
@@ -54,25 +57,34 @@ public PipesConfig getPipesConfig() {
}
/**
- * Get the handler configuration that specifies how content should be handled.
+ * Get the content handler factory that specifies how content should be handled.
*
- * @return the handler configuration
+ * @return the content handler factory
*/
- public HandlerConfig getHandlerConfig() {
- return handlerConfig;
+ public ContentHandlerFactory getContentHandlerFactory() {
+ return contentHandlerFactory;
}
/**
- * Set the handler configuration.
+ * Set the content handler factory.
*
- * @param handlerConfig the handler configuration
+ * @param contentHandlerFactory the content handler factory
* @return this config for chaining
*/
- public PipesForkParserConfig setHandlerConfig(HandlerConfig handlerConfig) {
- this.handlerConfig = handlerConfig;
+ public PipesForkParserConfig setContentHandlerFactory(ContentHandlerFactory contentHandlerFactory) {
+ this.contentHandlerFactory = contentHandlerFactory;
return this;
}
+ /**
+ * Get the parse mode.
+ *
+ * @return the parse mode
+ */
+ public ParseMode getParseMode() {
+ return parseMode;
+ }
+
/**
* Set the handler type (TEXT, HTML, XML, etc.).
*
@@ -80,7 +92,7 @@ public PipesForkParserConfig setHandlerConfig(HandlerConfig handlerConfig) {
* @return this config for chaining
*/
public PipesForkParserConfig setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE type) {
- this.handlerConfig.setType(type);
+ this.contentHandlerFactory = new BasicContentHandlerFactory(type, -1);
return this;
}
@@ -90,8 +102,8 @@ public PipesForkParserConfig setHandlerType(BasicContentHandlerFactory.HANDLER_T
* @param parseMode the parse mode
* @return this config for chaining
*/
- public PipesForkParserConfig setParseMode(HandlerConfig.PARSE_MODE parseMode) {
- this.handlerConfig.setParseMode(parseMode);
+ public PipesForkParserConfig setParseMode(ParseMode parseMode) {
+ this.parseMode = parseMode;
return this;
}
@@ -102,7 +114,9 @@ public PipesForkParserConfig setParseMode(HandlerConfig.PARSE_MODE parseMode) {
* @return this config for chaining
*/
public PipesForkParserConfig setWriteLimit(int writeLimit) {
- this.handlerConfig.setWriteLimit(writeLimit);
+ if (contentHandlerFactory instanceof BasicContentHandlerFactory bcf) {
+ this.contentHandlerFactory = new BasicContentHandlerFactory(bcf.getType(), writeLimit);
+ }
return this;
}
@@ -113,7 +127,9 @@ public PipesForkParserConfig setWriteLimit(int writeLimit) {
* @return this config for chaining
*/
public PipesForkParserConfig setMaxEmbeddedResources(int maxEmbeddedResources) {
- this.handlerConfig.setMaxEmbeddedResources(maxEmbeddedResources);
+ if (contentHandlerFactory instanceof BasicContentHandlerFactory bcf) {
+ bcf.setMaxEmbeddedResources(maxEmbeddedResources);
+ }
return this;
}
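Putting the reworked setters together (a sketch; the limits are placeholders):

    PipesForkParserConfig config = new PipesForkParserConfig()
            .setContentHandlerFactory(new BasicContentHandlerFactory(
                    BasicContentHandlerFactory.HANDLER_TYPE.XML, 100_000))
            .setParseMode(ParseMode.RMETA)
            .setMaxEmbeddedResources(10);
    // Note: setHandlerType() and setWriteLimit() each install a fresh BasicContentHandlerFactory,
    // so call them before setMaxEmbeddedResources() when combining the convenience setters.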
diff --git a/tika-pipes/tika-pipes-fork-parser/src/test/java/org/apache/tika/pipes/fork/PipesForkParserTest.java b/tika-pipes/tika-pipes-fork-parser/src/test/java/org/apache/tika/pipes/fork/PipesForkParserTest.java
index 30fc322dcef..34e56552b33 100644
--- a/tika-pipes/tika-pipes-fork-parser/src/test/java/org/apache/tika/pipes/fork/PipesForkParserTest.java
+++ b/tika-pipes/tika-pipes-fork-parser/src/test/java/org/apache/tika/pipes/fork/PipesForkParserTest.java
@@ -38,7 +38,7 @@
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.pipes.api.HandlerConfig;
+import org.apache.tika.pipes.api.ParseMode;
import org.apache.tika.pipes.api.PipesResult;
import org.apache.tika.sax.BasicContentHandlerFactory;
@@ -80,7 +80,7 @@ public void testParseTextFile() throws Exception {
PipesForkParserConfig config = new PipesForkParserConfig()
.setPluginsDir(PLUGINS_DIR)
.setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
- .setParseMode(HandlerConfig.PARSE_MODE.RMETA)
+ .setParseMode(ParseMode.RMETA)
.setTimeoutMillis(60000)
.addJvmArg("-Xmx256m");
@@ -114,7 +114,7 @@ public void testParseWithMetadata() throws Exception {
PipesForkParserConfig config = new PipesForkParserConfig()
.setPluginsDir(PLUGINS_DIR)
.setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
- .setParseMode(HandlerConfig.PARSE_MODE.RMETA)
+ .setParseMode(ParseMode.RMETA)
.setTimeoutMillis(60000);
try (PipesForkParser parser = new PipesForkParser(config);
@@ -144,7 +144,7 @@ public void testParseMultipleFiles() throws Exception {
PipesForkParserConfig config = new PipesForkParserConfig()
.setPluginsDir(PLUGINS_DIR)
.setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
- .setParseMode(HandlerConfig.PARSE_MODE.RMETA)
+ .setParseMode(ParseMode.RMETA)
.setTimeoutMillis(60000);
try (PipesForkParser parser = new PipesForkParser(config)) {
@@ -171,7 +171,7 @@ public void testConcatenateMode() throws Exception {
PipesForkParserConfig config = new PipesForkParserConfig()
.setPluginsDir(PLUGINS_DIR)
.setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
- .setParseMode(HandlerConfig.PARSE_MODE.CONCATENATE)
+ .setParseMode(ParseMode.CONCATENATE)
.setTimeoutMillis(60000);
try (PipesForkParser parser = new PipesForkParser(config);
@@ -204,7 +204,7 @@ public void testRmetaModeWithEmbedded() throws Exception {
PipesForkParserConfig config = new PipesForkParserConfig()
.setPluginsDir(PLUGINS_DIR)
.setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
- .setParseMode(HandlerConfig.PARSE_MODE.RMETA)
+ .setParseMode(ParseMode.RMETA)
.setTimeoutMillis(60000);
try (PipesForkParser parser = new PipesForkParser(config);
@@ -232,7 +232,7 @@ public void testDefaultConfigMatchesExplicitRmeta() throws Exception {
PipesForkParserConfig explicitConfig = new PipesForkParserConfig()
.setPluginsDir(PLUGINS_DIR)
.setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
- .setParseMode(HandlerConfig.PARSE_MODE.RMETA)
+ .setParseMode(ParseMode.RMETA)
.setTimeoutMillis(60000);
int explicitMetadataCount;
@@ -268,7 +268,7 @@ public void testTextVsXhtmlHandlerType() throws Exception {
PipesForkParserConfig textConfig = new PipesForkParserConfig()
.setPluginsDir(PLUGINS_DIR)
.setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
- .setParseMode(HandlerConfig.PARSE_MODE.RMETA)
+ .setParseMode(ParseMode.RMETA)
.setTimeoutMillis(60000);
String textContent;
@@ -288,7 +288,7 @@ public void testTextVsXhtmlHandlerType() throws Exception {
PipesForkParserConfig xmlConfig = new PipesForkParserConfig()
.setPluginsDir(PLUGINS_DIR)
.setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.XML)
- .setParseMode(HandlerConfig.PARSE_MODE.RMETA)
+ .setParseMode(ParseMode.RMETA)
.setTimeoutMillis(60000);
String xmlContent;
@@ -322,7 +322,7 @@ public void testWriteLimit() throws Exception {
PipesForkParserConfig config = new PipesForkParserConfig()
.setPluginsDir(PLUGINS_DIR)
.setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
- .setParseMode(HandlerConfig.PARSE_MODE.RMETA)
+ .setParseMode(ParseMode.RMETA)
.setWriteLimit(100) // Limit to 100 characters
.setTimeoutMillis(60000);
diff --git a/tika-pipes/tika-pipes-integration-tests/pom.xml b/tika-pipes/tika-pipes-integration-tests/pom.xml
index d1833aa5002..56bb2d1225b 100644
--- a/tika-pipes/tika-pipes-integration-tests/pom.xml
+++ b/tika-pipes/tika-pipes-integration-tests/pom.xml
@@ -141,6 +141,14 @@
+
+ org.apache.maven.plugins
+ maven-surefire-plugin
+
+
+ false
+
+
diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java
index b85bcdf516a..1cba2622ac2 100644
--- a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java
+++ b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java
@@ -617,6 +617,54 @@ public void testEmitterNotFound(@TempDir Path tmp) throws Exception {
}
}
+ @Test
+ public void testCustomContentHandlerFactory(@TempDir Path tmp) throws Exception {
+ // Test that a custom ContentHandlerFactory configured in tika-config.json
+ // is properly used during parsing. The UppercasingContentHandlerFactory
+ // converts all extracted text to uppercase.
+ Path inputDir = tmp.resolve("input");
+ Files.createDirectories(inputDir);
+
+ // Create a simple mock XML file with known content
+ String mockContent = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" + "<doc>" +
+ "<author>Test Author</author>" +
+ "<p>Hello World from Tika</p>" +
+ "</doc>";
+ String testFile = "test-uppercase.xml";
+ Files.write(inputDir.resolve(testFile), mockContent.getBytes(StandardCharsets.UTF_8));
+
+ // Use the uppercasing config
+ Path tikaConfigPath = PluginsTestHelper.getFileSystemFetcherConfig(
+ "tika-config-uppercasing.json", tmp, inputDir, tmp.resolve("output"), false);
+ TikaJsonConfig tikaJsonConfig = TikaJsonConfig.load(tikaConfigPath);
+ PipesConfig pipesConfig = PipesConfig.load(tikaJsonConfig);
+
+ try (PipesClient pipesClient = new PipesClient(pipesConfig, tikaConfigPath)) {
+ FetchEmitTuple tuple = new FetchEmitTuple(testFile,
+ new FetchKey(fetcherName, testFile),
+ new EmitKey(), new Metadata(), new ParseContext(),
+ FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP);
+
+ PipesResult pipesResult = pipesClient.process(tuple);
+
+ // Should succeed
+ assertTrue(pipesResult.isSuccess(),
+ "Processing should succeed. Got status: " + pipesResult.status() +
+ ", message: " + pipesResult.message());
+
+ Assertions.assertNotNull(pipesResult.emitData().getMetadataList());
+ assertEquals(1, pipesResult.emitData().getMetadataList().size());
+
+ Metadata metadata = pipesResult.emitData().getMetadataList().get(0);
+
+ // The content should be uppercased due to UppercasingContentHandlerFactory
+ String content = metadata.get(TikaCoreProperties.TIKA_CONTENT);
+ Assertions.assertNotNull(content, "Content should not be null");
+ assertTrue(content.contains("HELLO WORLD FROM TIKA"),
+ "Content should be uppercased. Actual content: " + content);
+ }
+ }
+
@Test
public void testHeartbeatProtocol(@TempDir Path tmp) throws Exception {
// Test that heartbeat protocol works correctly and doesn't cause protocol errors
diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesServerTest.java b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesServerTest.java
index 7c137084c79..621822fd236 100644
--- a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesServerTest.java
+++ b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesServerTest.java
@@ -16,9 +16,6 @@
*/
package org.apache.tika.pipes.core;
-
-
-
import org.apache.tika.TikaTest;
public class PipesServerTest extends TikaTest {
diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/async/MockDigesterFactory.java b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/async/MockDigesterFactory.java
deleted file mode 100644
index 9df3e9866f3..00000000000
--- a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/async/MockDigesterFactory.java
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.pipes.core.async;
-
-import org.apache.tika.config.TikaComponent;
-import org.apache.tika.digest.Digester;
-import org.apache.tika.digest.DigesterFactory;
-import org.apache.tika.digest.Encoder;
-import org.apache.tika.digest.InputStreamDigester;
-
-@TikaComponent
-public class MockDigesterFactory implements DigesterFactory {
-
- @Override
- public Digester build() {
- return new InputStreamDigester(1000000, "SHA-256", "X-TIKA:digest:SHA-256", new MockEncoder());
- }
-
- private static class MockEncoder implements Encoder {
-
- @Override
- public String encode(byte[] bytes) {
- StringBuilder hexString = new StringBuilder(2 * bytes.length);
- for (int i = 0; i < bytes.length; i++) {
- String hex = Integer.toHexString(0xff & bytes[i]);
- if (hex.length() == 1) {
- hexString.append('0');
- }
- hexString.append(hex);
- }
- return hexString.toString();
- }
- }
-
-}
diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json
index f0283182078..5873c39a87b 100644
--- a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json
+++ b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json
@@ -1,4 +1,12 @@
{
+ "content-handler-factory": {
+ "basic-content-handler-factory": {
+ "type": "TEXT",
+ "writeLimit": -1,
+ "maxEmbeddedResources": -1,
+ "throwOnWriteLimitReached": true
+ }
+ },
"fetchers": {
"fsf": {
"file-system-fetcher": {
@@ -18,27 +26,15 @@
},
"pipes-iterator": {
"file-system-pipes-iterator": {
- "fspi": {
- "basePath": "FETCHER_BASE_PATH",
- "countTotal": true,
- "baseConfig": {
- "fetcherId": "fsf",
- "emitterId": "fse",
- "handlerConfig": {
- "type": "TEXT",
- "parseMode": "RMETA",
- "writeLimit": -1,
- "maxEmbeddedResources": -1,
- "throwOnWriteLimitReached": true
- },
- "onParseException": "EMIT",
- "maxWaitMs": 600000,
- "queueSize": 10000
- }
- }
+ "basePath": "FETCHER_BASE_PATH",
+ "countTotal": true,
+ "fetcherId": "fsf",
+ "emitterId": "fse"
}
},
"pipes": {
+ "parseMode": "RMETA",
+ "onParseException": "EMIT",
"numClients": 4,
"timeoutMillis": 5000,
"emitIntermediateResults": "EMIT_INTERMEDIATE_RESULTS",
diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-passback.json b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-passback.json
index 153a68796dc..529e878cb60 100644
--- a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-passback.json
+++ b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-passback.json
@@ -1,4 +1,12 @@
{
+ "content-handler-factory": {
+ "basic-content-handler-factory": {
+ "type": "TEXT",
+ "writeLimit": -1,
+ "maxEmbeddedResources": -1,
+ "throwOnWriteLimitReached": true
+ }
+ },
"fetchers": {
"fsf": {
"file-system-fetcher": {
@@ -18,27 +26,15 @@
},
"pipes-iterator": {
"file-system-pipes-iterator": {
- "fspi": {
- "basePath": "FETCHER_BASE_PATH",
- "countTotal": true,
- "baseConfig": {
- "fetcherId": "fsf",
- "emitterId": "fse",
- "handlerConfig": {
- "type": "TEXT",
- "parseMode": "RMETA",
- "writeLimit": -1,
- "maxEmbeddedResources": -1,
- "throwOnWriteLimitReached": true
- },
- "onParseException": "EMIT",
- "maxWaitMs": 600000,
- "queueSize": 10000
- }
- }
+ "basePath": "FETCHER_BASE_PATH",
+ "countTotal": true,
+ "fetcherId": "fsf",
+ "emitterId": "fse"
}
},
"pipes": {
+ "parseMode": "RMETA",
+ "onParseException": "EMIT",
"numClients": 4,
"timeoutMillis": 5000,
"emitIntermediateResults": "EMIT_INTERMEDIATE_RESULTS",
diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json
index 873ce685a52..b58bfe269c6 100644
--- a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json
+++ b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json
@@ -1,4 +1,12 @@
{
+ "content-handler-factory": {
+ "basic-content-handler-factory": {
+ "type": "TEXT",
+ "writeLimit": -1,
+ "maxEmbeddedResources": -1,
+ "throwOnWriteLimitReached": true
+ }
+ },
"fetchers": {
"fsf": {
"file-system-fetcher": {
@@ -18,30 +26,18 @@
},
"pipes-iterator": {
"file-system-pipes-iterator": {
- "fspi": {
- "basePath": "FETCHER_BASE_PATH",
- "countTotal": true,
- "baseConfig": {
- "fetcherId": "fsf",
- "emitterId": "fse",
- "handlerConfig": {
- "type": "TEXT",
- "parseMode": "RMETA",
- "writeLimit": -1,
- "maxEmbeddedResources": -1,
- "throwOnWriteLimitReached": true
- },
- "onParseException": "EMIT",
- "maxWaitMs": 600000,
- "queueSize": 10000
- }
- }
+ "basePath": "FETCHER_BASE_PATH",
+ "countTotal": true,
+ "fetcherId": "fsf",
+ "emitterId": "fse"
}
},
"pipes": {
+ "parseMode": "RMETA",
+ "onParseException": "EMIT",
"numClients": 4,
"timeoutMillis": 5000,
- "emitIntermediateResults": EMIT_INTERMEDIATE_RESULTS,
+ "emitIntermediateResults": "EMIT_INTERMEDIATE_RESULTS",
"forkedJvmArgs": ["-Xmx512m"],
"emitStrategy": {
"type": "DYNAMIC",
diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-uppercasing.json b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-uppercasing.json
new file mode 100644
index 00000000000..e7d8a21c028
--- /dev/null
+++ b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-uppercasing.json
@@ -0,0 +1,52 @@
+{
+ "content-handler-factory": {
+ "uppercasing-content-handler-factory": {}
+ },
+ "fetchers": {
+ "fsf": {
+ "file-system-fetcher": {
+ "basePath": "FETCHER_BASE_PATH",
+ "extractFileSystemMetadata": false
+ }
+ }
+ },
+ "emitters": {
+ "fse": {
+ "file-system-emitter": {
+ "basePath": "EMITTER_BASE_PATH",
+ "fileExtension": "json",
+ "onExists": "EXCEPTION"
+ }
+ }
+ },
+ "pipes-iterator": {
+ "file-system-pipes-iterator": {
+ "basePath": "FETCHER_BASE_PATH",
+ "countTotal": true,
+ "fetcherId": "fsf",
+ "emitterId": "fse"
+ }
+ },
+ "pipes": {
+ "parseMode": "RMETA",
+ "onParseException": "EMIT",
+ "numClients": 4,
+ "timeoutMillis": 5000,
+ "emitIntermediateResults": "EMIT_INTERMEDIATE_RESULTS",
+ "forkedJvmArgs": ["-Xmx512m"],
+ "emitStrategy": {
+ "type": "DYNAMIC",
+ "thresholdBytes": 1000000
+ }
+ },
+ "auto-detect-parser": {
+ "spoolToDisk": 1000000,
+ "outputThreshold": 1000000,
+ "skipContainerDocumentDigest": false,
+ "digesterFactory": {
+ "mock-digester-factory": {}
+ },
+ "throwOnZeroBytes": false
+ },
+ "plugin-roots": "PLUGINS_PATHS"
+}
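tika-config-uppercasing.json wires in an uppercasing-content-handler-factory, a test fixture whose source is not part of this diff. A hypothetical sketch of such a factory, assuming ContentHandlerFactory exposes only the createHandler() method used elsewhere in this diff and that the handler's toString() is what ends up as TIKA_CONTENT:

    import java.util.Locale;

    import org.xml.sax.ContentHandler;
    import org.xml.sax.SAXException;

    import org.apache.tika.config.TikaComponent;
    import org.apache.tika.sax.ContentHandlerFactory;
    import org.apache.tika.sax.ToTextContentHandler;

    @TikaComponent
    public class UppercasingContentHandlerFactory implements ContentHandlerFactory {

        @Override
        public ContentHandler createHandler() {
            // Uppercase every character event before it reaches the text buffer.
            return new ToTextContentHandler() {
                @Override
                public void characters(char[] ch, int start, int length) throws SAXException {
                    char[] upper = new String(ch, start, length)
                            .toUpperCase(Locale.ROOT).toCharArray();
                    super.characters(upper, 0, upper.length);
                }
            };
        }
    }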
diff --git a/tika-pipes/tika-pipes-iterator-commons/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIteratorBase.java b/tika-pipes/tika-pipes-iterator-commons/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIteratorBase.java
index 4fd11352da1..8a4622dcb8e 100644
--- a/tika-pipes/tika-pipes-iterator-commons/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIteratorBase.java
+++ b/tika-pipes/tika-pipes-iterator-commons/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIteratorBase.java
@@ -54,6 +54,16 @@ public abstract class PipesIteratorBase extends AbstractTikaExtension implements
private int added = 0;
private FutureTask futureTask;
+ /**
+ * The fetcher ID to use for fetching documents.
+ */
+ private String fetcherId;
+
+ /**
+ * The emitter ID to use for emitting results.
+ */
+ private String emitterId;
+
public PipesIteratorBase(ExtensionConfig pluginConfig) {
super(pluginConfig);
}
diff --git a/tika-pipes/tika-pipes-iterator-commons/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIteratorConfig.java b/tika-pipes/tika-pipes-iterator-commons/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIteratorConfig.java
new file mode 100644
index 00000000000..e8356a64a86
--- /dev/null
+++ b/tika-pipes/tika-pipes-iterator-commons/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIteratorConfig.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.pipesiterator;
+
+import java.util.Objects;
+
+/**
+ * Abstract base class for pipes iterator configurations.
+ * Provides the common fetcherId and emitterId fields that all iterators need.
+ *
+ * ContentHandlerFactory, ParseMode, and other parsing settings should be loaded
+ * from tika-config.json via TikaLoader and set in PipesConfig.
+ */
+public abstract class PipesIteratorConfig {
+
+ private String fetcherId;
+ private String emitterId;
+
+ public String getFetcherId() {
+ return fetcherId;
+ }
+
+ public void setFetcherId(String fetcherId) {
+ this.fetcherId = fetcherId;
+ }
+
+ public String getEmitterId() {
+ return emitterId;
+ }
+
+ public void setEmitterId(String emitterId) {
+ this.emitterId = emitterId;
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) return true;
+ if (!(o instanceof PipesIteratorConfig that)) return false;
+ return Objects.equals(fetcherId, that.fetcherId) &&
+ Objects.equals(emitterId, that.emitterId);
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hash(fetcherId, emitterId);
+ }
+}
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/main/java/org/apache/tika/pipes/iterator/azblob/AZBlobPipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/main/java/org/apache/tika/pipes/iterator/azblob/AZBlobPipesIterator.java
index 8d56f2e87d7..855059914c6 100644
--- a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/main/java/org/apache/tika/pipes/iterator/azblob/AZBlobPipesIterator.java
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/main/java/org/apache/tika/pipes/iterator/azblob/AZBlobPipesIterator.java
@@ -36,10 +36,8 @@
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.api.FetchEmitTuple;
-import org.apache.tika.pipes.api.HandlerConfig;
import org.apache.tika.pipes.api.emitter.EmitKey;
import org.apache.tika.pipes.api.fetcher.FetchKey;
-import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig;
import org.apache.tika.pipes.pipesiterator.PipesIteratorBase;
import org.apache.tika.plugins.ExtensionConfig;
import org.apache.tika.utils.StringUtils;
@@ -81,10 +79,8 @@ private void checkConfig(AZBlobPipesIteratorConfig config) throws TikaConfigExce
@Override
protected void enqueue() throws InterruptedException, IOException, TimeoutException {
- PipesIteratorBaseConfig baseConfig = config.getBaseConfig();
- String fetcherId = baseConfig.fetcherId();
- String emitterId = baseConfig.emitterId();
- HandlerConfig handlerConfig = baseConfig.handlerConfig();
+ String fetcherId = config.getFetcherId();
+ String emitterId = config.getEmitterId();
long start = System.currentTimeMillis();
int count = 0;
@@ -125,10 +121,9 @@ protected void enqueue() throws InterruptedException, IOException, TimeoutExcept
}
//TODO -- extract metadata about content length etc from properties
ParseContext parseContext = new ParseContext();
- parseContext.set(HandlerConfig.class, handlerConfig);
tryToAdd(new FetchEmitTuple(blob.getName(), new FetchKey(fetcherId, blob.getName()),
new EmitKey(emitterId, blob.getName()), new Metadata(), parseContext,
- baseConfig.onParseException()));
+ FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT));
count++;
}
long elapsed = System.currentTimeMillis() - start;
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/main/java/org/apache/tika/pipes/iterator/azblob/AZBlobPipesIteratorConfig.java b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/main/java/org/apache/tika/pipes/iterator/azblob/AZBlobPipesIteratorConfig.java
index 068ff346044..ef3d78a49a8 100644
--- a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/main/java/org/apache/tika/pipes/iterator/azblob/AZBlobPipesIteratorConfig.java
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/main/java/org/apache/tika/pipes/iterator/azblob/AZBlobPipesIteratorConfig.java
@@ -22,10 +22,9 @@
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.tika.exception.TikaConfigException;
-import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig;
-import org.apache.tika.pipes.api.pipesiterator.PipesIteratorConfig;
+import org.apache.tika.pipes.pipesiterator.PipesIteratorConfig;
-public class AZBlobPipesIteratorConfig implements PipesIteratorConfig {
+public class AZBlobPipesIteratorConfig extends PipesIteratorConfig {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
@@ -45,7 +44,6 @@ public static AZBlobPipesIteratorConfig load(final String json)
private String container;
private String prefix = "";
private long timeoutMillis = 360000;
- private PipesIteratorBaseConfig baseConfig = null;
public String getSasToken() {
return sasToken;
@@ -68,32 +66,28 @@ public long getTimeoutMillis() {
}
@Override
- public PipesIteratorBaseConfig getBaseConfig() {
- return baseConfig;
- }
-
- @Override
- public final boolean equals(Object o) {
+ public boolean equals(Object o) {
if (!(o instanceof AZBlobPipesIteratorConfig that)) {
return false;
}
-
+ if (!super.equals(o)) {
+ return false;
+ }
return timeoutMillis == that.timeoutMillis &&
Objects.equals(sasToken, that.sasToken) &&
Objects.equals(endpoint, that.endpoint) &&
Objects.equals(container, that.container) &&
- Objects.equals(prefix, that.prefix) &&
- Objects.equals(baseConfig, that.baseConfig);
+ Objects.equals(prefix, that.prefix);
}
@Override
public int hashCode() {
- int result = Objects.hashCode(sasToken);
+ int result = super.hashCode();
+ result = 31 * result + Objects.hashCode(sasToken);
result = 31 * result + Objects.hashCode(endpoint);
result = 31 * result + Objects.hashCode(container);
result = 31 * result + Objects.hashCode(prefix);
result = 31 * result + Long.hashCode(timeoutMillis);
- result = 31 * result + Objects.hashCode(baseConfig);
return result;
}
}
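
Reviewer note: the shared superclass org.apache.tika.pipes.pipesiterator.PipesIteratorConfig that these configs now extend is not included in this diff. The following is only a minimal sketch of what the subclasses above appear to rely on — root-level fetcherId/emitterId plus equals/hashCode that the subclasses chain to via super. The field names are inferred from the subclass usage (getFetcherId(), getEmitterId(), super.equals(o), super.hashCode()); everything else, including how Jackson binds the fields, is an assumption.

// Sketch only -- not the actual class in this PR. Assumes Jackson binds
// root-level "fetcherId" and "emitterId" JSON properties onto these fields.
import java.util.Objects;

public class PipesIteratorConfig {
    private String fetcherId;
    private String emitterId;

    public String getFetcherId() {
        return fetcherId;
    }

    public String getEmitterId() {
        return emitterId;
    }

    @Override
    public boolean equals(Object o) {
        if (!(o instanceof PipesIteratorConfig that)) {
            return false;
        }
        return Objects.equals(fetcherId, that.fetcherId) &&
                Objects.equals(emitterId, that.emitterId);
    }

    @Override
    public int hashCode() {
        int result = Objects.hashCode(fetcherId);
        result = 31 * result + Objects.hashCode(emitterId);
        return result;
    }
}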
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/java/org/apache/tika/pipes/iterator/azblob/TestAZBlobPipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/java/org/apache/tika/pipes/iterator/azblob/TestAZBlobPipesIterator.java
index 298b16ebc84..4c81e4ae8df 100644
--- a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/java/org/apache/tika/pipes/iterator/azblob/TestAZBlobPipesIterator.java
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/java/org/apache/tika/pipes/iterator/azblob/TestAZBlobPipesIterator.java
@@ -48,10 +48,9 @@ public void testSimple() throws Exception {
configNode.put("endpoint", ""); // use one
configNode.put("sasToken", ""); // find one
- ObjectNode baseConfigNode = MAPPER.createObjectNode();
- baseConfigNode.put("fetcherId", "az-blob");
- baseConfigNode.put("emitterId", "test-emitter");
- configNode.set("baseConfig", baseConfigNode);
+ // Add fetcherId and emitterId at root level (not nested in baseConfig)
+ configNode.put("fetcherId", "az-blob");
+ configNode.put("emitterId", "test-emitter");
ExtensionConfig extensionConfig = new ExtensionConfig("test-az-blob", "az-blob-pipes-iterator",
MAPPER.writeValueAsString(configNode));
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/main/java/org/apache/tika/pipes/iterator/csv/CSVPipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/main/java/org/apache/tika/pipes/iterator/csv/CSVPipesIterator.java
index 7ca24c03e15..317db26e132 100644
--- a/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/main/java/org/apache/tika/pipes/iterator/csv/CSVPipesIterator.java
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/main/java/org/apache/tika/pipes/iterator/csv/CSVPipesIterator.java
@@ -34,7 +34,6 @@
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.api.FetchEmitTuple;
-import org.apache.tika.pipes.api.HandlerConfig;
import org.apache.tika.pipes.api.emitter.EmitKey;
import org.apache.tika.pipes.api.fetcher.FetchKey;
import org.apache.tika.pipes.pipesiterator.PipesIteratorBase;
@@ -91,8 +90,8 @@ public static CSVPipesIterator build(ExtensionConfig extensionConfig) throws IOE
@Override
protected void enqueue() throws InterruptedException, IOException, TimeoutException {
- String fetcherPluginId = config.getBaseConfig().fetcherId();
- String emitterName = config.getBaseConfig().emitterId();
+ String fetcherId = config.getFetcherId();
+ String emitterId = config.getEmitterId();
try (Reader reader = Files.newBufferedReader(config.getCsvPath(), charset)) {
Iterable<CSVRecord> records = CSVFormat.EXCEL.parse(reader);
List<String> headers = new ArrayList<>();
@@ -103,17 +102,16 @@ protected void enqueue() throws InterruptedException, IOException, TimeoutExcept
}
try {
- checkFetchEmitValidity(fetcherPluginId, emitterName, fetchEmitKeyIndices, headers);
+ checkFetchEmitValidity(fetcherId, emitterId, fetchEmitKeyIndices, headers);
} catch (TikaConfigException e) {
throw new IOException(e);
}
- HandlerConfig handlerConfig = config.getBaseConfig().handlerConfig();
for (CSVRecord record : records) {
String id = record.get(fetchEmitKeyIndices.idIndex);
String fetchKey = record.get(fetchEmitKeyIndices.fetchKeyIndex);
String emitKey = record.get(fetchEmitKeyIndices.emitKeyIndex);
- if (StringUtils.isBlank(fetchKey) && !StringUtils.isBlank(fetcherPluginId)) {
- LOGGER.debug("Fetcher specified ({}), but no fetchkey was found in ({})", fetcherPluginId, record);
+ if (StringUtils.isBlank(fetchKey) && !StringUtils.isBlank(fetcherId)) {
+ LOGGER.debug("Fetcher specified ({}), but no fetchkey was found in ({})", fetcherId, record);
}
if (StringUtils.isBlank(emitKey)) {
throw new IOException("emitKey must not be blank in :" + record);
@@ -121,27 +119,26 @@ protected void enqueue() throws InterruptedException, IOException, TimeoutExcept
Metadata metadata = loadMetadata(fetchEmitKeyIndices, headers, record);
ParseContext parseContext = new ParseContext();
- parseContext.set(HandlerConfig.class, handlerConfig);
- tryToAdd(new FetchEmitTuple(id, new FetchKey(fetcherPluginId, fetchKey), new EmitKey(emitterName, emitKey), metadata, parseContext,
- config.getBaseConfig().onParseException()));
+ tryToAdd(new FetchEmitTuple(id, new FetchKey(fetcherId, fetchKey), new EmitKey(emitterId, emitKey), metadata, parseContext,
+ FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT));
}
}
}
- private void checkFetchEmitValidity(String fetcherPluginId, String emitterName, FetchEmitKeyIndices fetchEmitKeyIndices, List<String> headers) throws TikaConfigException {
+ private void checkFetchEmitValidity(String fetcherId, String emitterId, FetchEmitKeyIndices fetchEmitKeyIndices, List<String> headers) throws TikaConfigException {
String fetchKeyColumn = config.getFetchKeyColumn();
String emitKeyColumn = config.getEmitKeyColumn();
String idColumn = config.getIdColumn();
- if (StringUtils.isBlank(emitterName)) {
- throw new TikaConfigException("must specify at least an emitterName");
+ if (StringUtils.isBlank(emitterId)) {
+ throw new TikaConfigException("must specify at least an emitterId");
}
- if (StringUtils.isBlank(fetcherPluginId) && !StringUtils.isBlank(fetchKeyColumn)) {
- throw new TikaConfigException("If specifying a 'fetchKeyColumn', " + "you must also specify a 'fetcherPluginId'");
+ if (StringUtils.isBlank(fetcherId) && !StringUtils.isBlank(fetchKeyColumn)) {
+ throw new TikaConfigException("If specifying a 'fetchKeyColumn', " + "you must also specify a 'fetcherId'");
}
- if (StringUtils.isBlank(fetcherPluginId)) {
+ if (StringUtils.isBlank(fetcherId)) {
LOGGER.info("No fetcher specified. This will be metadata only");
}
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/main/java/org/apache/tika/pipes/iterator/csv/CSVPipesIteratorConfig.java b/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/main/java/org/apache/tika/pipes/iterator/csv/CSVPipesIteratorConfig.java
index 46bee035e9a..3a5231821d8 100644
--- a/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/main/java/org/apache/tika/pipes/iterator/csv/CSVPipesIteratorConfig.java
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/main/java/org/apache/tika/pipes/iterator/csv/CSVPipesIteratorConfig.java
@@ -23,10 +23,9 @@
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.tika.exception.TikaConfigException;
-import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig;
-import org.apache.tika.pipes.api.pipesiterator.PipesIteratorConfig;
+import org.apache.tika.pipes.pipesiterator.PipesIteratorConfig;
-public class CSVPipesIteratorConfig implements PipesIteratorConfig {
+public class CSVPipesIteratorConfig extends PipesIteratorConfig {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
@@ -45,7 +44,6 @@ public static CSVPipesIteratorConfig load(final String json)
private String fetchKeyColumn;
private String emitKeyColumn;
private String idColumn;
- private PipesIteratorBaseConfig baseConfig = null;
public Path getCsvPath() {
return csvPath;
@@ -64,30 +62,26 @@ public String getIdColumn() {
}
@Override
- public PipesIteratorBaseConfig getBaseConfig() {
- return baseConfig;
- }
-
- @Override
- public final boolean equals(Object o) {
+ public boolean equals(Object o) {
if (!(o instanceof CSVPipesIteratorConfig that)) {
return false;
}
-
+ if (!super.equals(o)) {
+ return false;
+ }
return Objects.equals(csvPath, that.csvPath) &&
Objects.equals(fetchKeyColumn, that.fetchKeyColumn) &&
Objects.equals(emitKeyColumn, that.emitKeyColumn) &&
- Objects.equals(idColumn, that.idColumn) &&
- Objects.equals(baseConfig, that.baseConfig);
+ Objects.equals(idColumn, that.idColumn);
}
@Override
public int hashCode() {
- int result = Objects.hashCode(csvPath);
+ int result = super.hashCode();
+ result = 31 * result + Objects.hashCode(csvPath);
result = 31 * result + Objects.hashCode(fetchKeyColumn);
result = 31 * result + Objects.hashCode(emitKeyColumn);
result = 31 * result + Objects.hashCode(idColumn);
- result = 31 * result + Objects.hashCode(baseConfig);
return result;
}
}
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/test/java/org/apache/tika/pipes/iterator/csv/TestCSVPipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/test/java/org/apache/tika/pipes/iterator/csv/TestCSVPipesIterator.java
index d423119e9d9..b2f090231b3 100644
--- a/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/test/java/org/apache/tika/pipes/iterator/csv/TestCSVPipesIterator.java
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/test/java/org/apache/tika/pipes/iterator/csv/TestCSVPipesIterator.java
@@ -114,11 +114,9 @@ private CSVPipesIterator createIterator(Path csvPath, String fetcherName, String
jsonConfig.put("idColumn", idColumn);
}
- // Add baseConfig
- ObjectNode baseConfig = OBJECT_MAPPER.createObjectNode();
- baseConfig.put("fetcherId", fetcherName);
- baseConfig.put("emitterId", emitterName);
- jsonConfig.set("baseConfig", baseConfig);
+ // Add fetcherId and emitterId at root level (not nested in baseConfig)
+ jsonConfig.put("fetcherId", fetcherName);
+ jsonConfig.put("emitterId", emitterName);
ExtensionConfig extensionConfig = new ExtensionConfig("test-csv-iterator", "csv-pipes-iterator",
OBJECT_MAPPER.writeValueAsString(jsonConfig));
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/iterator/fs/FileSystemPipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/iterator/fs/FileSystemPipesIterator.java
index 4dedfaf478f..bb4b1fc0dcd 100644
--- a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/iterator/fs/FileSystemPipesIterator.java
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/iterator/fs/FileSystemPipesIterator.java
@@ -34,10 +34,8 @@
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.api.FetchEmitTuple;
-import org.apache.tika.pipes.api.HandlerConfig;
import org.apache.tika.pipes.api.emitter.EmitKey;
import org.apache.tika.pipes.api.fetcher.FetchKey;
-import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig;
import org.apache.tika.pipes.api.pipesiterator.TotalCountResult;
import org.apache.tika.pipes.api.pipesiterator.TotalCounter;
import org.apache.tika.pipes.pipesiterator.PipesIteratorBase;
@@ -79,9 +77,8 @@ protected void enqueue() throws InterruptedException, IOException, TimeoutExcept
"\"basePath\" directory does not exist: " + config
.getBasePath().toAbsolutePath());
}
- PipesIteratorBaseConfig config = this.config.getBaseConfig();
try {
- Files.walkFileTree(this.config.getBasePath(), new FSFileVisitor(config.fetcherId(), config.emitterId()));
+ Files.walkFileTree(config.getBasePath(), new FSFileVisitor(config.getFetcherId(), config.getEmitterId()));
} catch (IOException e) {
Throwable cause = e.getCause();
if (cause != null && cause instanceof TimeoutException) {
@@ -139,15 +136,14 @@ public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs)
@Override
public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
- String relPath = config
+ String relPath = FileSystemPipesIterator.this.config
.getBasePath().relativize(file).toString();
- PipesIteratorBaseConfig config = FileSystemPipesIterator.this.config.getBaseConfig();
try {
ParseContext parseContext = new ParseContext();
- parseContext.set(HandlerConfig.class, config.handlerConfig());
+ // ContentHandlerFactory, ParseMode, and onParseException come from PipesConfig loaded via TikaLoader
tryToAdd(new FetchEmitTuple(relPath, new FetchKey(fetcherId, relPath),
new EmitKey(emitterId, relPath), new Metadata(), parseContext,
- config.onParseException()));
+ FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT));
} catch (TimeoutException e) {
throw new IOException(e);
} catch (InterruptedException e) {
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/iterator/fs/FileSystemPipesIteratorConfig.java b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/iterator/fs/FileSystemPipesIteratorConfig.java
index 0648620fc4c..61eeeb66a65 100644
--- a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/iterator/fs/FileSystemPipesIteratorConfig.java
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/iterator/fs/FileSystemPipesIteratorConfig.java
@@ -23,10 +23,9 @@
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.tika.exception.TikaConfigException;
-import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig;
-import org.apache.tika.pipes.api.pipesiterator.PipesIteratorConfig;
+import org.apache.tika.pipes.pipesiterator.PipesIteratorConfig;
-public class FileSystemPipesIteratorConfig implements PipesIteratorConfig {
+public class FileSystemPipesIteratorConfig extends PipesIteratorConfig {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
@@ -44,7 +43,6 @@ public static FileSystemPipesIteratorConfig load(final String json)
private Path basePath = null;
private boolean countTotal = true;
- private PipesIteratorBaseConfig baseConfig = null;
public Path getBasePath() {
return basePath;
@@ -55,24 +53,21 @@ public boolean isCountTotal() {
}
@Override
- public PipesIteratorBaseConfig getBaseConfig() {
- return baseConfig;
- }
-
- @Override
- public final boolean equals(Object o) {
+ public boolean equals(Object o) {
if (!(o instanceof FileSystemPipesIteratorConfig that)) {
return false;
}
-
- return countTotal == that.countTotal && Objects.equals(basePath, that.basePath) && Objects.equals(baseConfig, that.baseConfig);
+ if (!super.equals(o)) {
+ return false;
+ }
+ return countTotal == that.countTotal && Objects.equals(basePath, that.basePath);
}
@Override
public int hashCode() {
- int result = Objects.hashCode(basePath);
+ int result = super.hashCode();
+ result = 31 * result + Objects.hashCode(basePath);
result = 31 * result + Boolean.hashCode(countTotal);
- result = 31 * result + Objects.hashCode(baseConfig);
return result;
}
}
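
Reviewer note: with baseConfig gone, a pipes iterator config is a flat JSON object. A usage sketch (hypothetical values) based on the load(String) factory and the basePath/countTotal fields visible in the hunks above; the exact throws clause of load is not shown here, so the example simply declares throws Exception, and fetcherId/emitterId are assumed to bind via the superclass:

// Usage sketch only -- illustrates the flattened config shape, not test code from this PR.
public class LoadFsIteratorConfigExample {
    public static void main(String[] args) throws Exception {
        String json = "{"
                + "\"basePath\": \"/data/input\","
                + "\"countTotal\": true,"
                + "\"fetcherId\": \"fsf\","
                + "\"emitterId\": \"fse\""
                + "}";
        FileSystemPipesIteratorConfig config = FileSystemPipesIteratorConfig.load(json);
        // fetcherId and emitterId now live at the root level of the config
        System.out.println(config.getBasePath() + " -> " + config.getFetcherId());
    }
}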
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/main/java/org/apache/tika/pipes/iterator/gcs/GCSPipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/main/java/org/apache/tika/pipes/iterator/gcs/GCSPipesIterator.java
index f25fd696af1..0b64c18812c 100644
--- a/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/main/java/org/apache/tika/pipes/iterator/gcs/GCSPipesIterator.java
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/main/java/org/apache/tika/pipes/iterator/gcs/GCSPipesIterator.java
@@ -30,10 +30,8 @@
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.api.FetchEmitTuple;
-import org.apache.tika.pipes.api.HandlerConfig;
import org.apache.tika.pipes.api.emitter.EmitKey;
import org.apache.tika.pipes.api.fetcher.FetchKey;
-import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig;
import org.apache.tika.pipes.pipesiterator.PipesIteratorBase;
import org.apache.tika.plugins.ExtensionConfig;
import org.apache.tika.utils.StringUtils;
@@ -71,12 +69,10 @@ public static GCSPipesIterator build(ExtensionConfig extensionConfig) throws IOE
@Override
protected void enqueue() throws InterruptedException, IOException, TimeoutException {
- PipesIteratorBaseConfig baseConfig = config.getBaseConfig();
- String fetcherPluginId = baseConfig.fetcherId();
- String emitterName = baseConfig.emitterId();
+ String fetcherId = config.getFetcherId();
+ String emitterId = config.getEmitterId();
long start = System.currentTimeMillis();
int count = 0;
- HandlerConfig handlerConfig = baseConfig.handlerConfig();
Page<Blob> blobs = null;
String prefix = config.getPrefix();
@@ -96,9 +92,8 @@ protected void enqueue() throws InterruptedException, IOException, TimeoutExcept
LOGGER.debug("adding ({}) {} in {} ms", count, blob.getName(), elapsed);
//TODO -- allow user specified metadata as the "id"?
ParseContext parseContext = new ParseContext();
- parseContext.set(HandlerConfig.class, handlerConfig);
- tryToAdd(new FetchEmitTuple(blob.getName(), new FetchKey(fetcherPluginId, blob.getName()), new EmitKey(emitterName, blob.getName()), new Metadata(), parseContext,
- baseConfig.onParseException()));
+ tryToAdd(new FetchEmitTuple(blob.getName(), new FetchKey(fetcherId, blob.getName()), new EmitKey(emitterId, blob.getName()), new Metadata(), parseContext,
+ FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT));
count++;
}
long elapsed = System.currentTimeMillis() - start;
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/main/java/org/apache/tika/pipes/iterator/gcs/GCSPipesIteratorConfig.java b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/main/java/org/apache/tika/pipes/iterator/gcs/GCSPipesIteratorConfig.java
index f4c4f1690e3..d87fea102a1 100644
--- a/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/main/java/org/apache/tika/pipes/iterator/gcs/GCSPipesIteratorConfig.java
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/main/java/org/apache/tika/pipes/iterator/gcs/GCSPipesIteratorConfig.java
@@ -22,10 +22,9 @@
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.tika.exception.TikaConfigException;
-import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig;
-import org.apache.tika.pipes.api.pipesiterator.PipesIteratorConfig;
+import org.apache.tika.pipes.pipesiterator.PipesIteratorConfig;
-public class GCSPipesIteratorConfig implements PipesIteratorConfig {
+public class GCSPipesIteratorConfig extends PipesIteratorConfig {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
@@ -42,7 +41,6 @@ public static GCSPipesIteratorConfig load(final String json)
private String bucket;
private String prefix = "";
private String projectId = "";
- private PipesIteratorBaseConfig baseConfig = null;
public String getBucket() {
return bucket;
@@ -57,28 +55,24 @@ public String getProjectId() {
}
@Override
- public PipesIteratorBaseConfig getBaseConfig() {
- return baseConfig;
- }
-
- @Override
- public final boolean equals(Object o) {
+ public boolean equals(Object o) {
if (!(o instanceof GCSPipesIteratorConfig that)) {
return false;
}
-
+ if (!super.equals(o)) {
+ return false;
+ }
return Objects.equals(bucket, that.bucket) &&
Objects.equals(prefix, that.prefix) &&
- Objects.equals(projectId, that.projectId) &&
- Objects.equals(baseConfig, that.baseConfig);
+ Objects.equals(projectId, that.projectId);
}
@Override
public int hashCode() {
- int result = Objects.hashCode(bucket);
+ int result = super.hashCode();
+ result = 31 * result + Objects.hashCode(bucket);
result = 31 * result + Objects.hashCode(prefix);
result = 31 * result + Objects.hashCode(projectId);
- result = 31 * result + Objects.hashCode(baseConfig);
return result;
}
}
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/java/org/apache/tika/pipes/iterator/gcs/TestGCSPipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/java/org/apache/tika/pipes/iterator/gcs/TestGCSPipesIterator.java
index a8f2310b71e..3af3c4342c2 100644
--- a/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/java/org/apache/tika/pipes/iterator/gcs/TestGCSPipesIterator.java
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/java/org/apache/tika/pipes/iterator/gcs/TestGCSPipesIterator.java
@@ -86,11 +86,9 @@ private GCSPipesIterator createIterator(String bucket, String projectId, String
jsonConfig.put("prefix", prefix);
}
- // Add baseConfig
- ObjectNode baseConfig = OBJECT_MAPPER.createObjectNode();
- baseConfig.put("fetcherId", fetcherName);
- baseConfig.put("emitterId", emitterName);
- jsonConfig.set("baseConfig", baseConfig);
+ // Add fetcherId and emitterId at root level (not nested in baseConfig)
+ jsonConfig.put("fetcherId", fetcherName);
+ jsonConfig.put("emitterId", emitterName);
ExtensionConfig extensionConfig = new ExtensionConfig("test-gcs-iterator", "gcs-pipes-iterator",
OBJECT_MAPPER.writeValueAsString(jsonConfig));
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/main/java/org/apache/tika/pipes/iterator/jdbc/JDBCPipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/main/java/org/apache/tika/pipes/iterator/jdbc/JDBCPipesIterator.java
index fbf86f4fe63..be0fccfdfa1 100644
--- a/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/main/java/org/apache/tika/pipes/iterator/jdbc/JDBCPipesIterator.java
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/main/java/org/apache/tika/pipes/iterator/jdbc/JDBCPipesIterator.java
@@ -34,10 +34,8 @@
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.api.FetchEmitTuple;
-import org.apache.tika.pipes.api.HandlerConfig;
import org.apache.tika.pipes.api.emitter.EmitKey;
import org.apache.tika.pipes.api.fetcher.FetchKey;
-import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig;
import org.apache.tika.pipes.pipesiterator.PipesIteratorBase;
import org.apache.tika.plugins.ExtensionConfig;
import org.apache.tika.utils.StringUtils;
@@ -78,16 +76,15 @@ private JDBCPipesIterator(JDBCPipesIteratorConfig config, ExtensionConfig extens
throw new TikaConfigException("select must not be empty");
}
- PipesIteratorBaseConfig baseConfig = config.getBaseConfig();
- String fetcherName = baseConfig.fetcherId();
- String emitterName = baseConfig.emitterId();
+ String fetcherName = config.getFetcherId();
+ String emitterName = config.getEmitterId();
if (StringUtils.isBlank(fetcherName) && !StringUtils.isBlank(config.getFetchKeyColumn())) {
- throw new TikaConfigException("If you specify a 'fetchKeyColumn', you must specify a 'fetcherPluginId'");
+ throw new TikaConfigException("If you specify a 'fetchKeyColumn', you must specify a 'fetcherId'");
}
if (StringUtils.isBlank(emitterName) && !StringUtils.isBlank(config.getEmitKeyColumn())) {
- throw new TikaConfigException("If you specify an 'emitKeyColumn', you must specify an 'emitterPluginId'");
+ throw new TikaConfigException("If you specify an 'emitKeyColumn', you must specify an 'emitterId'");
}
if (StringUtils.isBlank(emitterName) && StringUtils.isBlank(fetcherName)) {
@@ -120,13 +117,11 @@ public static JDBCPipesIterator build(ExtensionConfig extensionConfig) throws IO
@Override
protected void enqueue() throws InterruptedException, IOException, TimeoutException {
- PipesIteratorBaseConfig baseConfig = config.getBaseConfig();
- String fetcherPluginId = baseConfig.fetcherId();
- String emitterName = baseConfig.emitterId();
+ String fetcherId = config.getFetcherId();
+ String emitterId = config.getEmitterId();
FetchEmitKeyIndices fetchEmitKeyIndices = null;
List<String> headers = new ArrayList<>();
int rowCount = 0;
- HandlerConfig handlerConfig = baseConfig.handlerConfig();
LOGGER.debug("select: {}", config.getSelect());
try (Statement st = db.createStatement()) {
if (config.getFetchSize() > 0) {
@@ -139,10 +134,10 @@ protected void enqueue() throws InterruptedException, IOException, TimeoutExcept
while (rs.next()) {
if (headers.size() == 0) {
fetchEmitKeyIndices = loadHeaders(rs.getMetaData(), headers);
- checkFetchEmitValidity(fetcherPluginId, emitterName, fetchEmitKeyIndices, headers);
+ checkFetchEmitValidity(fetcherId, emitterId, fetchEmitKeyIndices, headers);
}
try {
- processRow(fetcherPluginId, emitterName, headers, fetchEmitKeyIndices, rs, handlerConfig, baseConfig);
+ processRow(fetcherId, emitterId, headers, fetchEmitKeyIndices, rs);
} catch (SQLException e) {
LOGGER.warn("Failed to insert: " + rs, e);
}
@@ -164,7 +159,7 @@ protected void enqueue() throws InterruptedException, IOException, TimeoutExcept
}
}
- private void checkFetchEmitValidity(String fetcherPluginId, String emitterName, FetchEmitKeyIndices fetchEmitKeyIndices, List<String> headers) throws IOException {
+ private void checkFetchEmitValidity(String fetcherId, String emitterId, FetchEmitKeyIndices fetchEmitKeyIndices, List<String> headers) throws IOException {
if (!StringUtils.isBlank(config.getFetchKeyColumn()) && fetchEmitKeyIndices.fetchKeyIndex < 0) {
throw new IOException(new TikaConfigException("Couldn't find fetchkey column: " + config.getFetchKeyColumn()));
}
@@ -180,9 +175,8 @@ private void checkFetchEmitValidity(String fetcherPluginId, String emitterName,
}
}
- private void processRow(String fetcherPluginId, String emitterName, List<String> headers,
- FetchEmitKeyIndices fetchEmitKeyIndices, ResultSet rs,
- HandlerConfig handlerConfig, PipesIteratorBaseConfig baseConfig)
+ private void processRow(String fetcherId, String emitterId, List<String> headers,
+ FetchEmitKeyIndices fetchEmitKeyIndices, ResultSet rs)
throws SQLException, TimeoutException, InterruptedException {
Metadata metadata = new Metadata();
String fetchKey = "";
@@ -233,9 +227,8 @@ private void processRow(String fetcherPluginId, String emitterName, List
}
}
ParseContext parseContext = new ParseContext();
- parseContext.set(HandlerConfig.class, handlerConfig);
- tryToAdd(new FetchEmitTuple(id, new FetchKey(fetcherPluginId, fetchKey, fetchStartRange, fetchEndRange), new EmitKey(emitterName, emitKey), metadata, parseContext,
- baseConfig.onParseException()));
+ tryToAdd(new FetchEmitTuple(id, new FetchKey(fetcherId, fetchKey, fetchStartRange, fetchEndRange), new EmitKey(emitterId, emitKey), metadata, parseContext,
+ FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT));
}
private String toString(ResultSet rs) throws SQLException {
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/main/java/org/apache/tika/pipes/iterator/jdbc/JDBCPipesIteratorConfig.java b/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/main/java/org/apache/tika/pipes/iterator/jdbc/JDBCPipesIteratorConfig.java
index 5cdfa0a7076..ff6b68d229e 100644
--- a/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/main/java/org/apache/tika/pipes/iterator/jdbc/JDBCPipesIteratorConfig.java
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/main/java/org/apache/tika/pipes/iterator/jdbc/JDBCPipesIteratorConfig.java
@@ -22,10 +22,9 @@
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.tika.exception.TikaConfigException;
-import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig;
-import org.apache.tika.pipes.api.pipesiterator.PipesIteratorConfig;
+import org.apache.tika.pipes.pipesiterator.PipesIteratorConfig;
-public class JDBCPipesIteratorConfig implements PipesIteratorConfig {
+public class JDBCPipesIteratorConfig extends PipesIteratorConfig {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
@@ -49,7 +48,6 @@ public static JDBCPipesIteratorConfig load(final String json)
private String select;
private int fetchSize = -1;
private int queryTimeoutSeconds = -1;
- private PipesIteratorBaseConfig baseConfig = null;
public String getIdColumn() {
return idColumn;
@@ -88,16 +86,13 @@ public int getQueryTimeoutSeconds() {
}
@Override
- public PipesIteratorBaseConfig getBaseConfig() {
- return baseConfig;
- }
-
- @Override
- public final boolean equals(Object o) {
+ public boolean equals(Object o) {
if (!(o instanceof JDBCPipesIteratorConfig that)) {
return false;
}
-
+ if (!super.equals(o)) {
+ return false;
+ }
return fetchSize == that.fetchSize &&
queryTimeoutSeconds == that.queryTimeoutSeconds &&
Objects.equals(idColumn, that.idColumn) &&
@@ -106,13 +101,13 @@ public final boolean equals(Object o) {
Objects.equals(fetchKeyRangeEndColumn, that.fetchKeyRangeEndColumn) &&
Objects.equals(emitKeyColumn, that.emitKeyColumn) &&
Objects.equals(connection, that.connection) &&
- Objects.equals(select, that.select) &&
- Objects.equals(baseConfig, that.baseConfig);
+ Objects.equals(select, that.select);
}
@Override
public int hashCode() {
- int result = Objects.hashCode(idColumn);
+ int result = super.hashCode();
+ result = 31 * result + Objects.hashCode(idColumn);
result = 31 * result + Objects.hashCode(fetchKeyColumn);
result = 31 * result + Objects.hashCode(fetchKeyRangeStartColumn);
result = 31 * result + Objects.hashCode(fetchKeyRangeEndColumn);
@@ -121,7 +116,6 @@ public int hashCode() {
result = 31 * result + Objects.hashCode(select);
result = 31 * result + fetchSize;
result = 31 * result + queryTimeoutSeconds;
- result = 31 * result + Objects.hashCode(baseConfig);
return result;
}
}
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/java/org/apache/tika/pipes/iterator/jdbc/TestJDBCPipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/java/org/apache/tika/pipes/iterator/jdbc/TestJDBCPipesIterator.java
index 30aac2d57f8..ce7c5989433 100644
--- a/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/java/org/apache/tika/pipes/iterator/jdbc/TestJDBCPipesIterator.java
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/test/java/org/apache/tika/pipes/iterator/jdbc/TestJDBCPipesIterator.java
@@ -163,12 +163,9 @@ private JDBCPipesIterator createIterator() throws Exception {
jsonConfig.put("fetchKeyColumn", "my_fetchkey");
jsonConfig.put("emitKeyColumn", "my_fetchkey");
- // Add baseConfig
- ObjectNode baseConfig = OBJECT_MAPPER.createObjectNode();
- baseConfig.put("fetcherId", "s3f");
- baseConfig.put("emitterId", "s3e");
- baseConfig.put("queueSize", 57);
- jsonConfig.set("baseConfig", baseConfig);
+ // Add fetcherId and emitterId at root level (not nested in baseConfig)
+ jsonConfig.put("fetcherId", "s3f");
+ jsonConfig.put("emitterId", "s3e");
ExtensionConfig extensionConfig = new ExtensionConfig("test-jdbc-iterator", "jdbc-pipes-iterator",
OBJECT_MAPPER.writeValueAsString(jsonConfig));
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/main/java/org/apache/tika/pipes/pipesiterator/json/JsonPipesIteratorConfig.java b/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/main/java/org/apache/tika/pipes/pipesiterator/json/JsonPipesIteratorConfig.java
index a9942a625ca..c3f6f53924b 100644
--- a/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/main/java/org/apache/tika/pipes/pipesiterator/json/JsonPipesIteratorConfig.java
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/main/java/org/apache/tika/pipes/pipesiterator/json/JsonPipesIteratorConfig.java
@@ -23,10 +23,9 @@
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.tika.exception.TikaConfigException;
-import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig;
-import org.apache.tika.pipes.api.pipesiterator.PipesIteratorConfig;
+import org.apache.tika.pipes.pipesiterator.PipesIteratorConfig;
-public class JsonPipesIteratorConfig implements PipesIteratorConfig {
+public class JsonPipesIteratorConfig extends PipesIteratorConfig {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
@@ -42,31 +41,26 @@ public static JsonPipesIteratorConfig load(final String json)
}
private Path jsonPath;
- private PipesIteratorBaseConfig baseConfig = null;
public Path getJsonPath() {
return jsonPath;
}
@Override
- public PipesIteratorBaseConfig getBaseConfig() {
- return baseConfig;
- }
-
- @Override
- public final boolean equals(Object o) {
+ public boolean equals(Object o) {
if (!(o instanceof JsonPipesIteratorConfig that)) {
return false;
}
-
- return Objects.equals(jsonPath, that.jsonPath) &&
- Objects.equals(baseConfig, that.baseConfig);
+ if (!super.equals(o)) {
+ return false;
+ }
+ return Objects.equals(jsonPath, that.jsonPath);
}
@Override
public int hashCode() {
- int result = Objects.hashCode(jsonPath);
- result = 31 * result + Objects.hashCode(baseConfig);
+ int result = super.hashCode();
+ result = 31 * result + Objects.hashCode(jsonPath);
return result;
}
}
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/java/org/apache/tika/pipes/pipesiterator/json/TestJsonPipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/java/org/apache/tika/pipes/pipesiterator/json/TestJsonPipesIterator.java
index 4211387888f..53aca506ffc 100644
--- a/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/java/org/apache/tika/pipes/pipesiterator/json/TestJsonPipesIterator.java
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/java/org/apache/tika/pipes/pipesiterator/json/TestJsonPipesIterator.java
@@ -71,34 +71,4 @@ private JsonPipesIterator createIterator(Path jsonPath) throws Exception {
OBJECT_MAPPER.writeValueAsString(jsonConfig));
return JsonPipesIterator.build(extensionConfig);
}
-
-
- /*
- //use this to generate test files
- public static void main(String[] args) throws Exception {
- Path p = Paths.get("/home/tallison/Intellij/tika-main/tika-pipes/tika-pipes-iterators" +
- "/tika-pipes-iterator-json/src/test/resources/test-documents/test-with-embedded" +
- "-bytes.json");
- try (BufferedWriter writer = Files.newBufferedWriter(p, StandardCharsets.UTF_8)) {
- HandlerConfig handlerConfig =
- new HandlerConfig(BasicContentHandlerFactory.HANDLER_TYPE.TEXT,
- HandlerConfig.PARSE_MODE.RMETA, -1, -1,
- false);
- EmbeddedDocumentBytesConfig config = new EmbeddedDocumentBytesConfig(true);
- for (int i = 0; i < 100; i++) {
- String id = "myid-"+i;
- FetchEmitTuple t = new FetchEmitTuple(
- id,
- new FetchKey("fs", i + ".xml"),
- new EmitKey("fs", i + ".xml.json"),
- new Metadata(),
- handlerConfig,
- FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT,
- config);
- String line = JsonFetchEmitTuple.toJson(t);
- writer.write(line);
- writer.newLine();
- }
- }
- }*/
}
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/resources/test-documents/test-with-embedded-bytes.json b/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/resources/test-documents/test-with-embedded-bytes.json
index 74883069062..daef89edaa6 100644
--- a/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/resources/test-documents/test-with-embedded-bytes.json
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/resources/test-documents/test-with-embedded-bytes.json
@@ -4,12 +4,6 @@
"fetchKey": "0.xml",
"emitter": "fs",
"emitKey": "0.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -25,12 +19,6 @@
"fetchKey": "1.xml",
"emitter": "fs",
"emitKey": "1.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -46,12 +34,6 @@
"fetchKey": "2.xml",
"emitter": "fs",
"emitKey": "2.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -67,12 +49,6 @@
"fetchKey": "3.xml",
"emitter": "fs",
"emitKey": "3.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -88,12 +64,6 @@
"fetchKey": "4.xml",
"emitter": "fs",
"emitKey": "4.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -109,12 +79,6 @@
"fetchKey": "5.xml",
"emitter": "fs",
"emitKey": "5.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -130,12 +94,6 @@
"fetchKey": "6.xml",
"emitter": "fs",
"emitKey": "6.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -151,12 +109,6 @@
"fetchKey": "7.xml",
"emitter": "fs",
"emitKey": "7.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -172,12 +124,6 @@
"fetchKey": "8.xml",
"emitter": "fs",
"emitKey": "8.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -193,12 +139,6 @@
"fetchKey": "9.xml",
"emitter": "fs",
"emitKey": "9.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -214,12 +154,6 @@
"fetchKey": "10.xml",
"emitter": "fs",
"emitKey": "10.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -235,12 +169,6 @@
"fetchKey": "11.xml",
"emitter": "fs",
"emitKey": "11.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -256,12 +184,6 @@
"fetchKey": "12.xml",
"emitter": "fs",
"emitKey": "12.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -277,12 +199,6 @@
"fetchKey": "13.xml",
"emitter": "fs",
"emitKey": "13.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -298,12 +214,6 @@
"fetchKey": "14.xml",
"emitter": "fs",
"emitKey": "14.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -319,12 +229,6 @@
"fetchKey": "15.xml",
"emitter": "fs",
"emitKey": "15.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -340,12 +244,6 @@
"fetchKey": "16.xml",
"emitter": "fs",
"emitKey": "16.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -361,12 +259,6 @@
"fetchKey": "17.xml",
"emitter": "fs",
"emitKey": "17.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -382,12 +274,6 @@
"fetchKey": "18.xml",
"emitter": "fs",
"emitKey": "18.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -403,12 +289,6 @@
"fetchKey": "19.xml",
"emitter": "fs",
"emitKey": "19.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -424,12 +304,6 @@
"fetchKey": "20.xml",
"emitter": "fs",
"emitKey": "20.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -445,12 +319,6 @@
"fetchKey": "21.xml",
"emitter": "fs",
"emitKey": "21.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -466,12 +334,6 @@
"fetchKey": "22.xml",
"emitter": "fs",
"emitKey": "22.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -487,12 +349,6 @@
"fetchKey": "23.xml",
"emitter": "fs",
"emitKey": "23.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -508,12 +364,6 @@
"fetchKey": "24.xml",
"emitter": "fs",
"emitKey": "24.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -529,12 +379,6 @@
"fetchKey": "25.xml",
"emitter": "fs",
"emitKey": "25.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -550,12 +394,6 @@
"fetchKey": "26.xml",
"emitter": "fs",
"emitKey": "26.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -571,12 +409,6 @@
"fetchKey": "27.xml",
"emitter": "fs",
"emitKey": "27.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -592,12 +424,6 @@
"fetchKey": "28.xml",
"emitter": "fs",
"emitKey": "28.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -613,12 +439,6 @@
"fetchKey": "29.xml",
"emitter": "fs",
"emitKey": "29.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -634,12 +454,6 @@
"fetchKey": "30.xml",
"emitter": "fs",
"emitKey": "30.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -655,12 +469,6 @@
"fetchKey": "31.xml",
"emitter": "fs",
"emitKey": "31.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -676,12 +484,6 @@
"fetchKey": "32.xml",
"emitter": "fs",
"emitKey": "32.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -697,12 +499,6 @@
"fetchKey": "33.xml",
"emitter": "fs",
"emitKey": "33.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -718,12 +514,6 @@
"fetchKey": "34.xml",
"emitter": "fs",
"emitKey": "34.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -739,12 +529,6 @@
"fetchKey": "35.xml",
"emitter": "fs",
"emitKey": "35.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -760,12 +544,6 @@
"fetchKey": "36.xml",
"emitter": "fs",
"emitKey": "36.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -781,12 +559,6 @@
"fetchKey": "37.xml",
"emitter": "fs",
"emitKey": "37.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -802,12 +574,6 @@
"fetchKey": "38.xml",
"emitter": "fs",
"emitKey": "38.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -823,12 +589,6 @@
"fetchKey": "39.xml",
"emitter": "fs",
"emitKey": "39.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -844,12 +604,6 @@
"fetchKey": "40.xml",
"emitter": "fs",
"emitKey": "40.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -865,12 +619,6 @@
"fetchKey": "41.xml",
"emitter": "fs",
"emitKey": "41.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -886,12 +634,6 @@
"fetchKey": "42.xml",
"emitter": "fs",
"emitKey": "42.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -907,12 +649,6 @@
"fetchKey": "43.xml",
"emitter": "fs",
"emitKey": "43.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -928,12 +664,6 @@
"fetchKey": "44.xml",
"emitter": "fs",
"emitKey": "44.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -949,12 +679,6 @@
"fetchKey": "45.xml",
"emitter": "fs",
"emitKey": "45.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -970,12 +694,6 @@
"fetchKey": "46.xml",
"emitter": "fs",
"emitKey": "46.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -991,12 +709,6 @@
"fetchKey": "47.xml",
"emitter": "fs",
"emitKey": "47.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1012,12 +724,6 @@
"fetchKey": "48.xml",
"emitter": "fs",
"emitKey": "48.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1033,12 +739,6 @@
"fetchKey": "49.xml",
"emitter": "fs",
"emitKey": "49.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1054,12 +754,6 @@
"fetchKey": "50.xml",
"emitter": "fs",
"emitKey": "50.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1075,12 +769,6 @@
"fetchKey": "51.xml",
"emitter": "fs",
"emitKey": "51.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1096,12 +784,6 @@
"fetchKey": "52.xml",
"emitter": "fs",
"emitKey": "52.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1117,12 +799,6 @@
"fetchKey": "53.xml",
"emitter": "fs",
"emitKey": "53.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1138,12 +814,6 @@
"fetchKey": "54.xml",
"emitter": "fs",
"emitKey": "54.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1159,12 +829,6 @@
"fetchKey": "55.xml",
"emitter": "fs",
"emitKey": "55.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1180,12 +844,6 @@
"fetchKey": "56.xml",
"emitter": "fs",
"emitKey": "56.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1201,12 +859,6 @@
"fetchKey": "57.xml",
"emitter": "fs",
"emitKey": "57.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1222,12 +874,6 @@
"fetchKey": "58.xml",
"emitter": "fs",
"emitKey": "58.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1243,12 +889,6 @@
"fetchKey": "59.xml",
"emitter": "fs",
"emitKey": "59.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1264,12 +904,6 @@
"fetchKey": "60.xml",
"emitter": "fs",
"emitKey": "60.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1285,12 +919,6 @@
"fetchKey": "61.xml",
"emitter": "fs",
"emitKey": "61.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1306,12 +934,6 @@
"fetchKey": "62.xml",
"emitter": "fs",
"emitKey": "62.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1327,12 +949,6 @@
"fetchKey": "63.xml",
"emitter": "fs",
"emitKey": "63.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1348,12 +964,6 @@
"fetchKey": "64.xml",
"emitter": "fs",
"emitKey": "64.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1369,12 +979,6 @@
"fetchKey": "65.xml",
"emitter": "fs",
"emitKey": "65.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1390,12 +994,6 @@
"fetchKey": "66.xml",
"emitter": "fs",
"emitKey": "66.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1411,12 +1009,6 @@
"fetchKey": "67.xml",
"emitter": "fs",
"emitKey": "67.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1432,12 +1024,6 @@
"fetchKey": "68.xml",
"emitter": "fs",
"emitKey": "68.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1453,12 +1039,6 @@
"fetchKey": "69.xml",
"emitter": "fs",
"emitKey": "69.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1474,12 +1054,6 @@
"fetchKey": "70.xml",
"emitter": "fs",
"emitKey": "70.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1495,12 +1069,6 @@
"fetchKey": "71.xml",
"emitter": "fs",
"emitKey": "71.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1516,12 +1084,6 @@
"fetchKey": "72.xml",
"emitter": "fs",
"emitKey": "72.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1537,12 +1099,6 @@
"fetchKey": "73.xml",
"emitter": "fs",
"emitKey": "73.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1558,12 +1114,6 @@
"fetchKey": "74.xml",
"emitter": "fs",
"emitKey": "74.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1579,12 +1129,6 @@
"fetchKey": "75.xml",
"emitter": "fs",
"emitKey": "75.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1600,12 +1144,6 @@
"fetchKey": "76.xml",
"emitter": "fs",
"emitKey": "76.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1621,12 +1159,6 @@
"fetchKey": "77.xml",
"emitter": "fs",
"emitKey": "77.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1642,12 +1174,6 @@
"fetchKey": "78.xml",
"emitter": "fs",
"emitKey": "78.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1663,12 +1189,6 @@
"fetchKey": "79.xml",
"emitter": "fs",
"emitKey": "79.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1684,12 +1204,6 @@
"fetchKey": "80.xml",
"emitter": "fs",
"emitKey": "80.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1705,12 +1219,6 @@
"fetchKey": "81.xml",
"emitter": "fs",
"emitKey": "81.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1726,12 +1234,6 @@
"fetchKey": "82.xml",
"emitter": "fs",
"emitKey": "82.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1747,12 +1249,6 @@
"fetchKey": "83.xml",
"emitter": "fs",
"emitKey": "83.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1768,12 +1264,6 @@
"fetchKey": "84.xml",
"emitter": "fs",
"emitKey": "84.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1789,12 +1279,6 @@
"fetchKey": "85.xml",
"emitter": "fs",
"emitKey": "85.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1810,12 +1294,6 @@
"fetchKey": "86.xml",
"emitter": "fs",
"emitKey": "86.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1831,12 +1309,6 @@
"fetchKey": "87.xml",
"emitter": "fs",
"emitKey": "87.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1852,12 +1324,6 @@
"fetchKey": "88.xml",
"emitter": "fs",
"emitKey": "88.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1873,12 +1339,6 @@
"fetchKey": "89.xml",
"emitter": "fs",
"emitKey": "89.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1894,12 +1354,6 @@
"fetchKey": "90.xml",
"emitter": "fs",
"emitKey": "90.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1915,12 +1369,6 @@
"fetchKey": "91.xml",
"emitter": "fs",
"emitKey": "91.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1936,12 +1384,6 @@
"fetchKey": "92.xml",
"emitter": "fs",
"emitKey": "92.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1957,12 +1399,6 @@
"fetchKey": "93.xml",
"emitter": "fs",
"emitKey": "93.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1978,12 +1414,6 @@
"fetchKey": "94.xml",
"emitter": "fs",
"emitKey": "94.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -1999,12 +1429,6 @@
"fetchKey": "95.xml",
"emitter": "fs",
"emitKey": "95.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -2020,12 +1444,6 @@
"fetchKey": "96.xml",
"emitter": "fs",
"emitKey": "96.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -2041,12 +1459,6 @@
"fetchKey": "97.xml",
"emitter": "fs",
"emitKey": "97.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -2062,12 +1474,6 @@
"fetchKey": "98.xml",
"emitter": "fs",
"emitKey": "98.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
@@ -2083,12 +1489,6 @@
"fetchKey": "99.xml",
"emitter": "fs",
"emitKey": "99.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit",
"embeddedDocumentBytesConfig": {
"extractEmbeddedDocumentBytes": true,
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/resources/test-documents/test.json b/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/resources/test-documents/test.json
index 721410fd3a8..e5199c6cbac 100644
--- a/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/resources/test-documents/test.json
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/test/resources/test-documents/test.json
@@ -4,12 +4,6 @@
"fetchKey": "0.xml",
"emitter": "fs",
"emitKey": "0.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -18,12 +12,6 @@
"fetchKey": "1.xml",
"emitter": "fs",
"emitKey": "1.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -32,12 +20,6 @@
"fetchKey": "2.xml",
"emitter": "fs",
"emitKey": "2.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -46,12 +28,6 @@
"fetchKey": "3.xml",
"emitter": "fs",
"emitKey": "3.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -60,12 +36,6 @@
"fetchKey": "4.xml",
"emitter": "fs",
"emitKey": "4.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -74,12 +44,6 @@
"fetchKey": "5.xml",
"emitter": "fs",
"emitKey": "5.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -88,12 +52,6 @@
"fetchKey": "6.xml",
"emitter": "fs",
"emitKey": "6.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -102,12 +60,6 @@
"fetchKey": "7.xml",
"emitter": "fs",
"emitKey": "7.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -116,12 +68,6 @@
"fetchKey": "8.xml",
"emitter": "fs",
"emitKey": "8.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -130,12 +76,6 @@
"fetchKey": "9.xml",
"emitter": "fs",
"emitKey": "9.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -144,12 +84,6 @@
"fetchKey": "10.xml",
"emitter": "fs",
"emitKey": "10.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -158,12 +92,6 @@
"fetchKey": "11.xml",
"emitter": "fs",
"emitKey": "11.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -172,12 +100,6 @@
"fetchKey": "12.xml",
"emitter": "fs",
"emitKey": "12.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -186,12 +108,6 @@
"fetchKey": "13.xml",
"emitter": "fs",
"emitKey": "13.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -200,12 +116,6 @@
"fetchKey": "14.xml",
"emitter": "fs",
"emitKey": "14.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -214,12 +124,6 @@
"fetchKey": "15.xml",
"emitter": "fs",
"emitKey": "15.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -228,12 +132,6 @@
"fetchKey": "16.xml",
"emitter": "fs",
"emitKey": "16.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -242,12 +140,6 @@
"fetchKey": "17.xml",
"emitter": "fs",
"emitKey": "17.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -256,12 +148,6 @@
"fetchKey": "18.xml",
"emitter": "fs",
"emitKey": "18.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -270,12 +156,6 @@
"fetchKey": "19.xml",
"emitter": "fs",
"emitKey": "19.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -284,12 +164,6 @@
"fetchKey": "20.xml",
"emitter": "fs",
"emitKey": "20.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -298,12 +172,6 @@
"fetchKey": "21.xml",
"emitter": "fs",
"emitKey": "21.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -312,12 +180,6 @@
"fetchKey": "22.xml",
"emitter": "fs",
"emitKey": "22.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -326,12 +188,6 @@
"fetchKey": "23.xml",
"emitter": "fs",
"emitKey": "23.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -340,12 +196,6 @@
"fetchKey": "24.xml",
"emitter": "fs",
"emitKey": "24.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -354,12 +204,6 @@
"fetchKey": "25.xml",
"emitter": "fs",
"emitKey": "25.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -368,12 +212,6 @@
"fetchKey": "26.xml",
"emitter": "fs",
"emitKey": "26.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -382,12 +220,6 @@
"fetchKey": "27.xml",
"emitter": "fs",
"emitKey": "27.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -396,12 +228,6 @@
"fetchKey": "28.xml",
"emitter": "fs",
"emitKey": "28.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -410,12 +236,6 @@
"fetchKey": "29.xml",
"emitter": "fs",
"emitKey": "29.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -424,12 +244,6 @@
"fetchKey": "30.xml",
"emitter": "fs",
"emitKey": "30.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -438,12 +252,6 @@
"fetchKey": "31.xml",
"emitter": "fs",
"emitKey": "31.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -452,12 +260,6 @@
"fetchKey": "32.xml",
"emitter": "fs",
"emitKey": "32.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -466,12 +268,6 @@
"fetchKey": "33.xml",
"emitter": "fs",
"emitKey": "33.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -480,12 +276,6 @@
"fetchKey": "34.xml",
"emitter": "fs",
"emitKey": "34.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -494,12 +284,6 @@
"fetchKey": "35.xml",
"emitter": "fs",
"emitKey": "35.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -508,12 +292,6 @@
"fetchKey": "36.xml",
"emitter": "fs",
"emitKey": "36.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -522,12 +300,6 @@
"fetchKey": "37.xml",
"emitter": "fs",
"emitKey": "37.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -536,12 +308,6 @@
"fetchKey": "38.xml",
"emitter": "fs",
"emitKey": "38.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -550,12 +316,6 @@
"fetchKey": "39.xml",
"emitter": "fs",
"emitKey": "39.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -564,12 +324,6 @@
"fetchKey": "40.xml",
"emitter": "fs",
"emitKey": "40.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -578,12 +332,6 @@
"fetchKey": "41.xml",
"emitter": "fs",
"emitKey": "41.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -592,12 +340,6 @@
"fetchKey": "42.xml",
"emitter": "fs",
"emitKey": "42.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -606,12 +348,6 @@
"fetchKey": "43.xml",
"emitter": "fs",
"emitKey": "43.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -620,12 +356,6 @@
"fetchKey": "44.xml",
"emitter": "fs",
"emitKey": "44.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -634,12 +364,6 @@
"fetchKey": "45.xml",
"emitter": "fs",
"emitKey": "45.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -648,12 +372,6 @@
"fetchKey": "46.xml",
"emitter": "fs",
"emitKey": "46.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -662,12 +380,6 @@
"fetchKey": "47.xml",
"emitter": "fs",
"emitKey": "47.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -676,12 +388,6 @@
"fetchKey": "48.xml",
"emitter": "fs",
"emitKey": "48.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -690,12 +396,6 @@
"fetchKey": "49.xml",
"emitter": "fs",
"emitKey": "49.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -704,12 +404,6 @@
"fetchKey": "50.xml",
"emitter": "fs",
"emitKey": "50.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -718,12 +412,6 @@
"fetchKey": "51.xml",
"emitter": "fs",
"emitKey": "51.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -732,12 +420,6 @@
"fetchKey": "52.xml",
"emitter": "fs",
"emitKey": "52.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -746,12 +428,6 @@
"fetchKey": "53.xml",
"emitter": "fs",
"emitKey": "53.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -760,12 +436,6 @@
"fetchKey": "54.xml",
"emitter": "fs",
"emitKey": "54.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -774,12 +444,6 @@
"fetchKey": "55.xml",
"emitter": "fs",
"emitKey": "55.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -788,12 +452,6 @@
"fetchKey": "56.xml",
"emitter": "fs",
"emitKey": "56.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -802,12 +460,6 @@
"fetchKey": "57.xml",
"emitter": "fs",
"emitKey": "57.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -816,12 +468,6 @@
"fetchKey": "58.xml",
"emitter": "fs",
"emitKey": "58.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -830,12 +476,6 @@
"fetchKey": "59.xml",
"emitter": "fs",
"emitKey": "59.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -844,12 +484,6 @@
"fetchKey": "60.xml",
"emitter": "fs",
"emitKey": "60.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -858,12 +492,6 @@
"fetchKey": "61.xml",
"emitter": "fs",
"emitKey": "61.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -872,12 +500,6 @@
"fetchKey": "62.xml",
"emitter": "fs",
"emitKey": "62.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -886,12 +508,6 @@
"fetchKey": "63.xml",
"emitter": "fs",
"emitKey": "63.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -900,12 +516,6 @@
"fetchKey": "64.xml",
"emitter": "fs",
"emitKey": "64.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -914,12 +524,6 @@
"fetchKey": "65.xml",
"emitter": "fs",
"emitKey": "65.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -928,12 +532,6 @@
"fetchKey": "66.xml",
"emitter": "fs",
"emitKey": "66.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -942,12 +540,6 @@
"fetchKey": "67.xml",
"emitter": "fs",
"emitKey": "67.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -956,12 +548,6 @@
"fetchKey": "68.xml",
"emitter": "fs",
"emitKey": "68.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -970,12 +556,6 @@
"fetchKey": "69.xml",
"emitter": "fs",
"emitKey": "69.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -984,12 +564,6 @@
"fetchKey": "70.xml",
"emitter": "fs",
"emitKey": "70.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -998,12 +572,6 @@
"fetchKey": "71.xml",
"emitter": "fs",
"emitKey": "71.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -1012,12 +580,6 @@
"fetchKey": "72.xml",
"emitter": "fs",
"emitKey": "72.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -1026,12 +588,6 @@
"fetchKey": "73.xml",
"emitter": "fs",
"emitKey": "73.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -1040,12 +596,6 @@
"fetchKey": "74.xml",
"emitter": "fs",
"emitKey": "74.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -1054,12 +604,6 @@
"fetchKey": "75.xml",
"emitter": "fs",
"emitKey": "75.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -1068,12 +612,6 @@
"fetchKey": "76.xml",
"emitter": "fs",
"emitKey": "76.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -1082,12 +620,6 @@
"fetchKey": "77.xml",
"emitter": "fs",
"emitKey": "77.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -1096,12 +628,6 @@
"fetchKey": "78.xml",
"emitter": "fs",
"emitKey": "78.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -1110,12 +636,6 @@
"fetchKey": "79.xml",
"emitter": "fs",
"emitKey": "79.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -1124,12 +644,6 @@
"fetchKey": "80.xml",
"emitter": "fs",
"emitKey": "80.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -1138,12 +652,6 @@
"fetchKey": "81.xml",
"emitter": "fs",
"emitKey": "81.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -1152,12 +660,6 @@
"fetchKey": "82.xml",
"emitter": "fs",
"emitKey": "82.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -1166,12 +668,6 @@
"fetchKey": "83.xml",
"emitter": "fs",
"emitKey": "83.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -1180,12 +676,6 @@
"fetchKey": "84.xml",
"emitter": "fs",
"emitKey": "84.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -1194,12 +684,6 @@
"fetchKey": "85.xml",
"emitter": "fs",
"emitKey": "85.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -1208,12 +692,6 @@
"fetchKey": "86.xml",
"emitter": "fs",
"emitKey": "86.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -1222,12 +700,6 @@
"fetchKey": "87.xml",
"emitter": "fs",
"emitKey": "87.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -1236,12 +708,6 @@
"fetchKey": "88.xml",
"emitter": "fs",
"emitKey": "88.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -1250,12 +716,6 @@
"fetchKey": "89.xml",
"emitter": "fs",
"emitKey": "89.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -1264,12 +724,6 @@
"fetchKey": "90.xml",
"emitter": "fs",
"emitKey": "90.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -1278,12 +732,6 @@
"fetchKey": "91.xml",
"emitter": "fs",
"emitKey": "91.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -1292,12 +740,6 @@
"fetchKey": "92.xml",
"emitter": "fs",
"emitKey": "92.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -1306,12 +748,6 @@
"fetchKey": "93.xml",
"emitter": "fs",
"emitKey": "93.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -1320,12 +756,6 @@
"fetchKey": "94.xml",
"emitter": "fs",
"emitKey": "94.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -1334,12 +764,6 @@
"fetchKey": "95.xml",
"emitter": "fs",
"emitKey": "95.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -1348,12 +772,6 @@
"fetchKey": "96.xml",
"emitter": "fs",
"emitKey": "96.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -1362,12 +780,6 @@
"fetchKey": "97.xml",
"emitter": "fs",
"emitKey": "97.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -1376,12 +788,6 @@
"fetchKey": "98.xml",
"emitter": "fs",
"emitKey": "98.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
{
@@ -1390,11 +796,5 @@
"fetchKey": "99.xml",
"emitter": "fs",
"emitKey": "99.xml.json",
- "handlerConfig": {
- "type": "text",
- "parseMode": "rmeta",
- "writeLimit": -1,
- "maxEmbeddedResources": -1
- },
"onParseException": "emit"
}
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/main/java/org/apache/tika/pipes/iterator/kafka/KafkaPipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/main/java/org/apache/tika/pipes/iterator/kafka/KafkaPipesIterator.java
index 285bc07188d..ccf9e2037f3 100644
--- a/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/main/java/org/apache/tika/pipes/iterator/kafka/KafkaPipesIterator.java
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/main/java/org/apache/tika/pipes/iterator/kafka/KafkaPipesIterator.java
@@ -35,10 +35,8 @@
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.api.FetchEmitTuple;
-import org.apache.tika.pipes.api.HandlerConfig;
import org.apache.tika.pipes.api.emitter.EmitKey;
import org.apache.tika.pipes.api.fetcher.FetchKey;
-import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig;
import org.apache.tika.pipes.pipesiterator.PipesIteratorBase;
import org.apache.tika.plugins.ExtensionConfig;
@@ -99,10 +97,8 @@ private Object serializerClass(String className, Class<?> defaultClass) {
@Override
protected void enqueue() throws InterruptedException, TimeoutException {
- PipesIteratorBaseConfig baseConfig = config.getBaseConfig();
- String fetcherId = baseConfig.fetcherId();
- String emitterId = baseConfig.emitterId();
- HandlerConfig handlerConfig = baseConfig.handlerConfig();
+ String fetcherId = config.getFetcherId();
+ String emitterId = config.getEmitterId();
long start = System.currentTimeMillis();
int count = 0;
@@ -117,10 +113,9 @@ protected void enqueue() throws InterruptedException, TimeoutException {
LOGGER.debug("adding ({}) {} in {} ms", count, r.key(), elapsed);
}
ParseContext parseContext = new ParseContext();
- parseContext.set(HandlerConfig.class, handlerConfig);
tryToAdd(new FetchEmitTuple(r.key(), new FetchKey(fetcherId, r.key()),
new EmitKey(emitterId, r.key()), new Metadata(), parseContext,
- baseConfig.onParseException()));
+ FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT));
++count;
}
} while ((emitMax < 0 || count < emitMax) && !records.isEmpty());
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/main/java/org/apache/tika/pipes/iterator/kafka/KafkaPipesIteratorConfig.java b/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/main/java/org/apache/tika/pipes/iterator/kafka/KafkaPipesIteratorConfig.java
index 53675ac233a..63342adbe31 100644
--- a/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/main/java/org/apache/tika/pipes/iterator/kafka/KafkaPipesIteratorConfig.java
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/main/java/org/apache/tika/pipes/iterator/kafka/KafkaPipesIteratorConfig.java
@@ -22,10 +22,9 @@
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.tika.exception.TikaConfigException;
-import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig;
-import org.apache.tika.pipes.api.pipesiterator.PipesIteratorConfig;
+import org.apache.tika.pipes.pipesiterator.PipesIteratorConfig;
-public class KafkaPipesIteratorConfig implements PipesIteratorConfig {
+public class KafkaPipesIteratorConfig extends PipesIteratorConfig {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
@@ -49,7 +48,6 @@ public static KafkaPipesIteratorConfig load(final String json)
private int pollDelayMs = 100;
private int emitMax = -1;
private int groupInitialRebalanceDelayMs = 3000;
- private PipesIteratorBaseConfig baseConfig = null;
public String getTopic() {
return topic;
@@ -88,16 +86,13 @@ public int getGroupInitialRebalanceDelayMs() {
}
@Override
- public PipesIteratorBaseConfig getBaseConfig() {
- return baseConfig;
- }
-
- @Override
- public final boolean equals(Object o) {
+ public boolean equals(Object o) {
if (!(o instanceof KafkaPipesIteratorConfig that)) {
return false;
}
-
+ if (!super.equals(o)) {
+ return false;
+ }
return pollDelayMs == that.pollDelayMs &&
emitMax == that.emitMax &&
groupInitialRebalanceDelayMs == that.groupInitialRebalanceDelayMs &&
@@ -106,13 +101,13 @@ public final boolean equals(Object o) {
Objects.equals(keySerializer, that.keySerializer) &&
Objects.equals(valueSerializer, that.valueSerializer) &&
Objects.equals(groupId, that.groupId) &&
- Objects.equals(autoOffsetReset, that.autoOffsetReset) &&
- Objects.equals(baseConfig, that.baseConfig);
+ Objects.equals(autoOffsetReset, that.autoOffsetReset);
}
@Override
public int hashCode() {
- int result = Objects.hashCode(topic);
+ int result = super.hashCode();
+ result = 31 * result + Objects.hashCode(topic);
result = 31 * result + Objects.hashCode(bootstrapServers);
result = 31 * result + Objects.hashCode(keySerializer);
result = 31 * result + Objects.hashCode(valueSerializer);
@@ -121,7 +116,6 @@ public int hashCode() {
result = 31 * result + pollDelayMs;
result = 31 * result + emitMax;
result = 31 * result + groupInitialRebalanceDelayMs;
- result = 31 * result + Objects.hashCode(baseConfig);
return result;
}
}
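Note: the shared PipesIteratorConfig base class that these iterator configs now extend is not included in this diff. A minimal sketch of the contract implied by the call sites above (getFetcherId()/getEmitterId(), super.equals()/super.hashCode()); the field and method names are inferred from the subclasses, not copied from the actual class:

    // Sketch only -- the real org.apache.tika.pipes.pipesiterator.PipesIteratorConfig is not shown here.
    public abstract class PipesIteratorConfigSketch {
        private String fetcherId;
        private String emitterId;

        public String getFetcherId() {
            return fetcherId;
        }

        public String getEmitterId() {
            return emitterId;
        }

        @Override
        public boolean equals(Object o) {
            if (!(o instanceof PipesIteratorConfigSketch that)) {
                return false;
            }
            return java.util.Objects.equals(fetcherId, that.fetcherId)
                    && java.util.Objects.equals(emitterId, that.emitterId);
        }

        @Override
        public int hashCode() {
            int result = java.util.Objects.hashCode(fetcherId);
            return 31 * result + java.util.Objects.hashCode(emitterId);
        }
    }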
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/test/java/org/apache/tika/pipes/iterator/kafka/TestKafkaPipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/test/java/org/apache/tika/pipes/iterator/kafka/TestKafkaPipesIterator.java
index f71c0b4db86..00e8e364203 100644
--- a/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/test/java/org/apache/tika/pipes/iterator/kafka/TestKafkaPipesIterator.java
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/test/java/org/apache/tika/pipes/iterator/kafka/TestKafkaPipesIterator.java
@@ -49,10 +49,9 @@ public void testSimple() throws Exception {
configNode.put("bootstrapServers", ""); // use one
configNode.put("groupId", ""); // find one
- ObjectNode baseConfigNode = MAPPER.createObjectNode();
- baseConfigNode.put("fetcherId", "kafka");
- baseConfigNode.put("emitterId", "test-emitter");
- configNode.set("baseConfig", baseConfigNode);
+ // Add fetcherId and emitterId at root level (not nested in baseConfig)
+ configNode.put("fetcherId", "kafka");
+ configNode.put("emitterId", "test-emitter");
ExtensionConfig extensionConfig = new ExtensionConfig("test-kafka", "kafka-pipes-iterator",
MAPPER.writeValueAsString(configNode));
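With baseConfig removed, the Kafka iterator's JSON configuration is flat. A minimal sketch of building such a config programmatically, mirroring the updated test; the topic, broker, and group values are placeholders, and only the flat fetcherId/emitterId layout is taken from the patch:

    import com.fasterxml.jackson.databind.ObjectMapper;
    import com.fasterxml.jackson.databind.node.ObjectNode;

    import org.apache.tika.plugins.ExtensionConfig;

    ObjectMapper mapper = new ObjectMapper();
    ObjectNode configNode = mapper.createObjectNode();
    configNode.put("topic", "tika-input");                 // placeholder topic
    configNode.put("bootstrapServers", "localhost:9092");  // placeholder broker
    configNode.put("groupId", "tika-pipes");                // placeholder consumer group
    configNode.put("fetcherId", "kafka");                   // root-level, no baseConfig wrapper
    configNode.put("emitterId", "test-emitter");
    ExtensionConfig extensionConfig = new ExtensionConfig("test-kafka", "kafka-pipes-iterator",
            mapper.writeValueAsString(configNode));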
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/main/java/org/apache/tika/pipes/iterator/s3/S3PipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/main/java/org/apache/tika/pipes/iterator/s3/S3PipesIterator.java
index 6a9539ca316..6e6daa11526 100644
--- a/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/main/java/org/apache/tika/pipes/iterator/s3/S3PipesIterator.java
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/main/java/org/apache/tika/pipes/iterator/s3/S3PipesIterator.java
@@ -46,10 +46,8 @@
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.api.FetchEmitTuple;
-import org.apache.tika.pipes.api.HandlerConfig;
import org.apache.tika.pipes.api.emitter.EmitKey;
import org.apache.tika.pipes.api.fetcher.FetchKey;
-import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig;
import org.apache.tika.pipes.pipesiterator.PipesIteratorBase;
import org.apache.tika.plugins.ExtensionConfig;
import org.apache.tika.utils.StringUtils;
@@ -125,12 +123,10 @@ public static S3PipesIterator build(ExtensionConfig extensionConfig) throws IOEx
@Override
protected void enqueue() throws InterruptedException, IOException, TimeoutException {
- PipesIteratorBaseConfig baseConfig = config.getBaseConfig();
- String fetcherPluginId = baseConfig.fetcherId();
- String emitterName = baseConfig.emitterId();
+ String fetcherId = config.getFetcherId();
+ String emitterId = config.getEmitterId();
long start = System.currentTimeMillis();
int count = 0;
- HandlerConfig handlerConfig = baseConfig.handlerConfig();
final Matcher fileNameMatcher;
if (fileNamePattern != null) {
fileNameMatcher = fileNamePattern.matcher("");
@@ -149,9 +145,8 @@ protected void enqueue() throws InterruptedException, IOException, TimeoutExcept
long elapsed = System.currentTimeMillis() - start;
LOGGER.debug("adding ({}) {} in {} ms", count, key, elapsed);
ParseContext parseContext = new ParseContext();
- parseContext.set(HandlerConfig.class, handlerConfig);
- tryToAdd(new FetchEmitTuple(key, new FetchKey(fetcherPluginId, key), new EmitKey(emitterName, key), new Metadata(), parseContext,
- baseConfig.onParseException()));
+ tryToAdd(new FetchEmitTuple(key, new FetchKey(fetcherId, key), new EmitKey(emitterId, key), new Metadata(), parseContext,
+ FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT));
count++;
}
long elapsed = System.currentTimeMillis() - start;
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/main/java/org/apache/tika/pipes/iterator/s3/S3PipesIteratorConfig.java b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/main/java/org/apache/tika/pipes/iterator/s3/S3PipesIteratorConfig.java
index dc4bd12c2e8..4e8cf3ef20d 100644
--- a/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/main/java/org/apache/tika/pipes/iterator/s3/S3PipesIteratorConfig.java
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/main/java/org/apache/tika/pipes/iterator/s3/S3PipesIteratorConfig.java
@@ -22,10 +22,9 @@
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.tika.exception.TikaConfigException;
-import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig;
-import org.apache.tika.pipes.api.pipesiterator.PipesIteratorConfig;
+import org.apache.tika.pipes.pipesiterator.PipesIteratorConfig;
-public class S3PipesIteratorConfig implements PipesIteratorConfig {
+public class S3PipesIteratorConfig extends PipesIteratorConfig {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
@@ -50,7 +49,6 @@ public static S3PipesIteratorConfig load(final String json)
private String fileNamePattern;
private int maxConnections = 50;
private boolean pathStyleAccessEnabled = false;
- private PipesIteratorBaseConfig baseConfig = null;
public String getPrefix() {
return prefix;
@@ -97,16 +95,13 @@ public boolean isPathStyleAccessEnabled() {
}
@Override
- public PipesIteratorBaseConfig getBaseConfig() {
- return baseConfig;
- }
-
- @Override
- public final boolean equals(Object o) {
+ public boolean equals(Object o) {
if (!(o instanceof S3PipesIteratorConfig that)) {
return false;
}
-
+ if (!super.equals(o)) {
+ return false;
+ }
return maxConnections == that.maxConnections &&
pathStyleAccessEnabled == that.pathStyleAccessEnabled &&
Objects.equals(prefix, that.prefix) &&
@@ -117,13 +112,13 @@ public final boolean equals(Object o) {
Objects.equals(credentialsProvider, that.credentialsProvider) &&
Objects.equals(profile, that.profile) &&
Objects.equals(bucket, that.bucket) &&
- Objects.equals(fileNamePattern, that.fileNamePattern) &&
- Objects.equals(baseConfig, that.baseConfig);
+ Objects.equals(fileNamePattern, that.fileNamePattern);
}
@Override
public int hashCode() {
- int result = Objects.hashCode(prefix);
+ int result = super.hashCode();
+ result = 31 * result + Objects.hashCode(prefix);
result = 31 * result + Objects.hashCode(region);
result = 31 * result + Objects.hashCode(accessKey);
result = 31 * result + Objects.hashCode(secretKey);
@@ -134,7 +129,6 @@ public int hashCode() {
result = 31 * result + Objects.hashCode(fileNamePattern);
result = 31 * result + maxConnections;
result = 31 * result + Boolean.hashCode(pathStyleAccessEnabled);
- result = 31 * result + Objects.hashCode(baseConfig);
return result;
}
}
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/java/org/apache/tika/pipes/iterator/s3/TestS3PipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/java/org/apache/tika/pipes/iterator/s3/TestS3PipesIterator.java
index d840fc29509..4104b54e698 100644
--- a/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/java/org/apache/tika/pipes/iterator/s3/TestS3PipesIterator.java
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/java/org/apache/tika/pipes/iterator/s3/TestS3PipesIterator.java
@@ -50,10 +50,9 @@ public void testSimple() throws Exception {
jsonConfig.put("profile", ""); // use one
jsonConfig.put("credentialsProvider", "profile");
- ObjectNode baseConfig = OBJECT_MAPPER.createObjectNode();
- baseConfig.put("fetcherId", "s3");
- baseConfig.put("emitterId", "fs");
- jsonConfig.set("baseConfig", baseConfig);
+ // Add fetcherId and emitterId at root level (not nested in baseConfig)
+ jsonConfig.put("fetcherId", "s3");
+ jsonConfig.put("emitterId", "fs");
ExtensionConfig extensionConfig = new ExtensionConfig("test-s3-iterator", "s3-pipes-iterator",
OBJECT_MAPPER.writeValueAsString(jsonConfig));
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/main/java/org/apache/tika/pipes/iterator/solr/SolrPipesIterator.java b/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/main/java/org/apache/tika/pipes/iterator/solr/SolrPipesIterator.java
index 6be72029b8c..02615bf12e6 100644
--- a/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/main/java/org/apache/tika/pipes/iterator/solr/SolrPipesIterator.java
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/main/java/org/apache/tika/pipes/iterator/solr/SolrPipesIterator.java
@@ -42,10 +42,8 @@
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.api.FetchEmitTuple;
-import org.apache.tika.pipes.api.HandlerConfig;
import org.apache.tika.pipes.api.emitter.EmitKey;
import org.apache.tika.pipes.api.fetcher.FetchKey;
-import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig;
import org.apache.tika.pipes.pipesiterator.PipesIteratorBase;
import org.apache.tika.plugins.ExtensionConfig;
import org.apache.tika.utils.StringUtils;
@@ -119,9 +117,8 @@ private void configure() throws IOException, TikaConfigException {
@Override
protected void enqueue() throws InterruptedException, IOException, TimeoutException {
- PipesIteratorBaseConfig baseConfig = config.getBaseConfig();
- String fetcherId = baseConfig.fetcherId();
- String emitterId = baseConfig.emitterId();
+ String fetcherId = config.getFetcherId();
+ String emitterId = config.getEmitterId();
try (SolrClient solrClient = createSolrClient()) {
int fileCount = 0;
@@ -145,8 +142,6 @@ protected void enqueue() throws InterruptedException, IOException, TimeoutExcept
List<String> filters = config.getFilters() != null ? config.getFilters() : Collections.emptyList();
query.setFilterQueries(filters.toArray(new String[]{}));
- HandlerConfig handlerConfig = baseConfig.handlerConfig();
-
String cursorMark = CursorMarkParams.CURSOR_MARK_START;
boolean done = false;
while (!done) {
@@ -167,9 +162,8 @@ protected void enqueue() throws InterruptedException, IOException, TimeoutExcept
}
LOGGER.info("iterator doc: {}, idField={}, fetchKey={}", sd, config.getIdField(), fetchKey);
ParseContext parseContext = new ParseContext();
- parseContext.set(HandlerConfig.class, handlerConfig);
tryToAdd(new FetchEmitTuple(fetchKey, new FetchKey(fetcherId, fetchKey), new EmitKey(emitterId, emitKey), new Metadata(), parseContext,
- baseConfig.onParseException()));
+ FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT));
}
if (cursorMark.equals(nextCursorMark)) {
done = true;
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/main/java/org/apache/tika/pipes/iterator/solr/SolrPipesIteratorConfig.java b/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/main/java/org/apache/tika/pipes/iterator/solr/SolrPipesIteratorConfig.java
index 60211ed9ac9..9c37a52819c 100644
--- a/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/main/java/org/apache/tika/pipes/iterator/solr/SolrPipesIteratorConfig.java
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/main/java/org/apache/tika/pipes/iterator/solr/SolrPipesIteratorConfig.java
@@ -24,10 +24,9 @@
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.tika.exception.TikaConfigException;
-import org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig;
-import org.apache.tika.pipes.api.pipesiterator.PipesIteratorConfig;
+import org.apache.tika.pipes.pipesiterator.PipesIteratorConfig;
-public class SolrPipesIteratorConfig implements PipesIteratorConfig {
+public class SolrPipesIteratorConfig extends PipesIteratorConfig {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
@@ -60,7 +59,6 @@ public static SolrPipesIteratorConfig load(final String json)
private String authScheme;
private String proxyHost;
private int proxyPort = 0;
- private PipesIteratorBaseConfig baseConfig = null;
public String getSolrCollection() {
return solrCollection;
@@ -135,16 +133,13 @@ public int getProxyPort() {
}
@Override
- public PipesIteratorBaseConfig getBaseConfig() {
- return baseConfig;
- }
-
- @Override
- public final boolean equals(Object o) {
+ public boolean equals(Object o) {
if (!(o instanceof SolrPipesIteratorConfig that)) {
return false;
}
-
+ if (!super.equals(o)) {
+ return false;
+ }
return rows == that.rows &&
connectionTimeout == that.connectionTimeout &&
socketTimeout == that.socketTimeout &&
@@ -162,13 +157,13 @@ public final boolean equals(Object o) {
Objects.equals(userName, that.userName) &&
Objects.equals(password, that.password) &&
Objects.equals(authScheme, that.authScheme) &&
- Objects.equals(proxyHost, that.proxyHost) &&
- Objects.equals(baseConfig, that.baseConfig);
+ Objects.equals(proxyHost, that.proxyHost);
}
@Override
public int hashCode() {
- int result = Objects.hashCode(solrCollection);
+ int result = super.hashCode();
+ result = 31 * result + Objects.hashCode(solrCollection);
result = 31 * result + Objects.hashCode(solrUrls);
result = 31 * result + Objects.hashCode(solrZkHosts);
result = 31 * result + Objects.hashCode(solrZkChroot);
@@ -186,7 +181,6 @@ public int hashCode() {
result = 31 * result + Objects.hashCode(authScheme);
result = 31 * result + Objects.hashCode(proxyHost);
result = 31 * result + proxyPort;
- result = 31 * result + Objects.hashCode(baseConfig);
return result;
}
}
diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentRegistry.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentRegistry.java
index 1ab7014ed8e..5ecfffecb5f 100644
--- a/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentRegistry.java
+++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentRegistry.java
@@ -48,6 +48,20 @@
*/
public class ComponentRegistry {
+ /**
+ * Built-in aliases for external dependencies.
+ * Maps component names to fully qualified class names.
+ */
+ private static final Map<String, String> BUILTIN_ALIASES = createBuiltinAliases();
+
+ private static Map<String, String> createBuiltinAliases() {
+ Map<String, String> aliases = new HashMap<>();
+ // EmbeddedDocumentBytesConfig is in tika-pipes-core which can't depend on tika-core for @TikaComponent
+ aliases.put("embedded-document-bytes-config",
+ "org.apache.tika.pipes.core.extractor.EmbeddedDocumentBytesConfig");
+ return Collections.unmodifiableMap(aliases);
+ }
+
private final Map<String, ComponentInfo> components;
private final Map<String, String> classNameToFriendlyName; // Reverse lookup by class name
private final ClassLoader classLoader;
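BUILTIN_ALIASES exists for components that cannot carry @TikaComponent because of the module dependency direction, so they never appear in a generated .idx file. The diff only adds the alias table; a hedged sketch of the kind of fallback lookup it presumably feeds (the method and the indexedNames parameter are illustrative, not the actual ComponentRegistry API):

    // Hypothetical fallback: entries from the generated .idx files win; the built-in
    // alias table is only consulted for names the annotation processor never saw.
    static String resolveClassName(String friendlyName, java.util.Map<String, String> indexedNames) {
        String className = indexedNames.get(friendlyName);
        return className != null ? className : BUILTIN_ALIASES.get(friendlyName);
    }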
diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/ConfigLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/ConfigLoader.java
index b57aae89ee9..e262bd64129 100644
--- a/tika-serialization/src/main/java/org/apache/tika/config/loader/ConfigLoader.java
+++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/ConfigLoader.java
@@ -40,10 +40,10 @@
* TikaLoader loader = TikaLoader.load(configPath);
*
* // Load by explicit key
- * HandlerConfig config = loader.configs().load("handler-config", HandlerConfig.class);
+ * MyConfig config = loader.configs().load("my-config", MyConfig.class);
*
* // Load by class name (auto-converts to kebab-case)
- * HandlerConfig config = loader.configs().load(HandlerConfig.class);
+ * MyConfig config = loader.configs().load(MyConfig.class);
*
*
* JSON configuration example:
@@ -57,7 +57,7 @@
*
* // Custom configs MUST be in "other-configs" (loaded via configs())
* "other-configs": {
- * "handler-config": {
+ * "my-config": {
* "timeout": 5000,
* "retries": 3
* },
@@ -93,7 +93,7 @@ public class ConfigLoader {
/**
* Loads a configuration object using the class name converted to kebab-case.
*
- * For example, {@code HandlerConfig.class} will look for key "handler-config".
+ * For example, {@code MyAppConfig.class} will look for key "my-app-config".
* Class name suffixes like "Config", "Configuration", "Settings" are stripped first.
*
* For interfaces, the JSON must specify the implementation (see {@link #load(String, Class)}).
@@ -213,7 +213,7 @@ public <T> T load(String key, Class<T> clazz, T defaultValue) throws TikaConfigE
*
* Example:
*
- * HandlerConfig defaults = new HandlerConfig();
+ * MyConfig defaults = new MyConfig();
* defaults.setTimeout(30000);
* defaults.setRetries(2);
* defaults.setEnabled(false);
@@ -221,9 +221,9 @@ public <T> T load(String key, Class<T> clazz, T defaultValue) throws TikaConfigE
* // JSON: { "enabled": true }
* // Result: timeout=30000, retries=2, enabled=true (merged!)
* // Note: 'defaults' object remains unchanged
- * HandlerConfig config = loader.configs().loadWithDefaults("handler-config",
- * HandlerConfig.class,
- * defaults);
+ * MyConfig config = loader.configs().loadWithDefaults("my-config",
+ * MyConfig.class,
+ * defaults);
*
*
* @param key The JSON key to load from
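The javadoc examples above reference a plain MyConfig bean without showing it. A hypothetical bean matching the timeout/retries/enabled fields used in those examples (Jackson binds the JSON properties by name):

    // Hypothetical example bean for the ConfigLoader javadoc; not part of the patch.
    public class MyConfig {
        private long timeout = 30000;
        private int retries = 2;
        private boolean enabled = false;

        public long getTimeout() { return timeout; }
        public void setTimeout(long timeout) { this.timeout = timeout; }

        public int getRetries() { return retries; }
        public void setRetries(int retries) { this.retries = retries; }

        public boolean isEnabled() { return enabled; }
        public void setEnabled(boolean enabled) { this.enabled = enabled; }
    }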
diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java
index 2d9243b81a4..3d6a1ba4735 100644
--- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java
+++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java
@@ -111,6 +111,7 @@ public class TikaJsonConfig {
"detectors",
"encoding-detectors",
"metadata-filters",
+ "content-handler-factory",
"renderers",
"translator",
"auto-detect-parser",
diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
index 5bc29d88f99..b527532e5ba 100644
--- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
+++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
@@ -50,6 +50,8 @@
import org.apache.tika.parser.Parser;
import org.apache.tika.renderer.CompositeRenderer;
import org.apache.tika.renderer.Renderer;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.ContentHandlerFactory;
import org.apache.tika.serialization.ComponentConfig;
import org.apache.tika.serialization.ComponentNameResolver;
import org.apache.tika.serialization.JsonMetadata;
@@ -143,6 +145,12 @@ private static void registerComponentConfigs() {
// Special cached instances that aren't standard components
private Parser autoDetectParser;
+ private Detector detectors;
+ private EncodingDetector encodingDetectors;
+ private MetadataFilter metadataFilter;
+ private ContentHandlerFactory contentHandlerFactory;
+ private Renderer renderers;
+ private Translator translator;
private ConfigLoader configLoader;
private GlobalSettings globalSettings;
@@ -272,6 +280,47 @@ public MetadataFilter loadMetadataFilters() throws TikaConfigException {
return get(MetadataFilter.class);
}
+ /**
+ * Loads and returns the content handler factory.
+ * If "content-handler-factory" section exists in config, uses that factory.
+ * If section missing, returns a default BasicContentHandlerFactory with TEXT handler.
+ * Results are cached - subsequent calls return the same instance.
+ *
+ * Example JSON:
+ *
+ * {
+ * "content-handler-factory": {
+ * "basic-content-handler-factory": {
+ * "type": "HTML",
+ * "writeLimit": 100000
+ * }
+ * }
+ * }
+ *
+ *
+ * @return the content handler factory
+ * @throws TikaConfigException if loading fails
+ */
+ public synchronized ContentHandlerFactory loadContentHandlerFactory() throws TikaConfigException {
+ if (contentHandlerFactory == null) {
+ // Check if content-handler-factory section exists in config
+ if (config.hasComponentSection("content-handler-factory")) {
+ try {
+ contentHandlerFactory = config.deserialize("content-handler-factory",
+ ContentHandlerFactory.class);
+ } catch (IOException e) {
+ throw new TikaConfigException("Failed to load content-handler-factory", e);
+ }
+ }
+ // Default to BasicContentHandlerFactory with TEXT handler if not configured
+ if (contentHandlerFactory == null) {
+ contentHandlerFactory = new BasicContentHandlerFactory(
+ BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1);
+ }
+ }
+ return contentHandlerFactory;
+ }
+
/**
* Loads and returns all renderers.
* Syntactic sugar for {@code get(Renderer.class)}.
@@ -335,9 +384,9 @@ public synchronized Parser loadAutoDetectParser() throws TikaConfigException, IO
*
* Usage:
*
- * HandlerConfig config = loader.configs().load("handler-config", HandlerConfig.class);
+ * MyConfig config = loader.configs().load("my-config", MyConfig.class);
* // Or use kebab-case auto-conversion:
- * HandlerConfig config = loader.configs().load(HandlerConfig.class);
+ * MyConfig config = loader.configs().load(MyConfig.class);
*
*
* @return the ConfigLoader instance
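
A minimal usage sketch (illustrative, not part of the patch) of the new loadContentHandlerFactory() accessor; the config file name is an assumption, and the calls follow the javadoc above and the tests later in this diff.

    import java.nio.file.Path;
    import java.nio.file.Paths;

    import org.xml.sax.ContentHandler;

    import org.apache.tika.config.loader.TikaLoader;
    import org.apache.tika.sax.ContentHandlerFactory;

    public class LoadContentHandlerFactoryExample {
        public static void main(String[] args) throws Exception {
            // tika-config.json may contain a "content-handler-factory" section;
            // if it does not, the default BasicContentHandlerFactory(TEXT, -1) is returned.
            Path configPath = Paths.get("tika-config.json");
            TikaLoader loader = TikaLoader.load(configPath);
            ContentHandlerFactory factory = loader.loadContentHandlerFactory();
            ContentHandler handler = factory.createHandler();
            System.out.println("Loaded factory: " + factory.getClass().getName());
        }
    }
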
diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java
index 6c6521a8716..39849524858 100644
--- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java
+++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java
@@ -47,6 +47,7 @@ public class TikaObjectMapperFactory {
"renderers",
"translators",
"digester-factories",
+ "content-handler-factories",
"other-configs"
};
diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/ComponentNameResolver.java b/tika-serialization/src/main/java/org/apache/tika/serialization/ComponentNameResolver.java
index 80c54c0178f..195cfd6df05 100644
--- a/tika-serialization/src/main/java/org/apache/tika/serialization/ComponentNameResolver.java
+++ b/tika-serialization/src/main/java/org/apache/tika/serialization/ComponentNameResolver.java
@@ -203,4 +203,26 @@ public static boolean hasComponentConfig(Class<?> componentClass) {
public static Set<String> getComponentFields() {
return Collections.unmodifiableSet(FIELD_TO_CONFIG.keySet());
}
+
+ /**
+ * Gets the contextKey for a class from the component registry.
+ * The contextKey is recorded in the .idx file by the annotation processor.
+ *
+ * @param clazz the class to check
+ * @return the contextKey class if specified, or null if the class is not registered or declares no contextKey
+ */
+ public static Class<?> getContextKey(Class<?> clazz) {
+ for (ComponentRegistry registry : REGISTRIES.values()) {
+ String friendlyName = registry.getFriendlyName(clazz);
+ if (friendlyName != null) {
+ try {
+ ComponentInfo info = registry.getComponentInfo(friendlyName);
+ return info.contextKey();
+ } catch (TikaConfigException e) {
+ // continue to next registry
+ }
+ }
+ }
+ return null;
+ }
}
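
A small sketch (not from the patch) of how the new getContextKey() lookup can be exercised; SimplePasswordProvider is the component used by the test added later in this diff, and the printed class name assumes it is registered with contextKey = PasswordProvider.class.

    import org.apache.tika.parser.SimplePasswordProvider;
    import org.apache.tika.serialization.ComponentNameResolver;

    public class ContextKeyLookupExample {
        public static void main(String[] args) {
            // Returns the contextKey recorded in the .idx file, or null if the class
            // is not in any component registry or declares no contextKey.
            Class<?> key = ComponentNameResolver.getContextKey(SimplePasswordProvider.class);
            System.out.println(key == null ? "no contextKey" : key.getName());
        }
    }
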
diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadata.java b/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadata.java
index 16607e2ade3..049d9d0327e 100644
--- a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadata.java
+++ b/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadata.java
@@ -27,7 +27,6 @@
import com.fasterxml.jackson.databind.module.SimpleModule;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.serialization.serdes.MetadataDeserializer;
import org.apache.tika.serialization.serdes.MetadataSerializer;
public class JsonMetadata {
@@ -56,13 +55,12 @@ private static void rebuildObjectMappers() {
JsonFactory factory = new JsonFactory();
factory.setStreamReadConstraints(streamReadConstraints);
+ // Use TikaModule which includes Metadata serializers
ObjectMapper mapper = new ObjectMapper(factory);
- SimpleModule baseModule = new SimpleModule();
- baseModule.addDeserializer(Metadata.class, new MetadataDeserializer());
- baseModule.addSerializer(Metadata.class, new MetadataSerializer());
- mapper.registerModule(baseModule);
+ mapper.registerModule(new TikaModule());
OBJECT_MAPPER = mapper;
+ // Pretty printer needs custom serializer with sort flag
ObjectMapper prettyMapper = new ObjectMapper(factory);
SimpleModule prettySerializerModule = new SimpleModule();
prettySerializerModule.addSerializer(Metadata.class, new MetadataSerializer(true));
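
A round-trip sketch of what this hunk preserves: Metadata (de)serialization now comes from TikaModule instead of an ad-hoc SimpleModule. The JsonMetadata entry points used here (toJson/fromJson) are assumed from the existing class and are not changed by this patch.

    import java.io.StringReader;
    import java.io.StringWriter;

    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.serialization.JsonMetadata;

    public class JsonMetadataRoundTrip {
        public static void main(String[] args) throws Exception {
            Metadata metadata = new Metadata();
            metadata.set("dc:title", "hello world");

            StringWriter writer = new StringWriter();
            JsonMetadata.toJson(metadata, writer);

            // Both directions now go through the serializers registered by TikaModule.
            Metadata roundTripped = JsonMetadata.fromJson(new StringReader(writer.toString()));
            System.out.println(roundTripped.get("dc:title"));
        }
    }
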
diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java b/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java
index 2571c4c4b95..21f413087fd 100644
--- a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java
+++ b/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java
@@ -28,7 +28,6 @@
import com.fasterxml.jackson.databind.module.SimpleModule;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.serialization.serdes.MetadataDeserializer;
import org.apache.tika.serialization.serdes.MetadataSerializer;
public class JsonMetadataList {
@@ -57,13 +56,12 @@ private static void rebuildObjectMappers() {
JsonFactory factory = new JsonFactory();
factory.setStreamReadConstraints(streamReadConstraints);
+ // Use TikaModule which includes Metadata serializers
ObjectMapper mapper = new ObjectMapper(factory);
- SimpleModule baseModule = new SimpleModule();
- baseModule.addDeserializer(Metadata.class, new MetadataDeserializer());
- baseModule.addSerializer(Metadata.class, new MetadataSerializer());
- mapper.registerModule(baseModule);
+ mapper.registerModule(new TikaModule());
OBJECT_MAPPER = mapper;
+ // Pretty printer needs custom serializer with sort flag
ObjectMapper prettyMapper = new ObjectMapper(factory);
SimpleModule prettySerializerModule = new SimpleModule();
prettySerializerModule.addSerializer(Metadata.class, new MetadataSerializer(true));
diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java b/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java
index b50709702f1..249f7f71cfb 100644
--- a/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java
+++ b/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java
@@ -53,17 +53,24 @@
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.extractor.EmbeddedDocumentExtractorFactory;
import org.apache.tika.language.translate.Translator;
+import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.filter.MetadataFilter;
import org.apache.tika.metadata.writefilter.MetadataWriteFilterFactory;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.parser.DefaultParser;
+import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.renderer.Renderer;
import org.apache.tika.sax.ContentHandlerDecoratorFactory;
+import org.apache.tika.sax.ContentHandlerFactory;
import org.apache.tika.serialization.serdes.DefaultDetectorSerializer;
import org.apache.tika.serialization.serdes.DefaultParserSerializer;
+import org.apache.tika.serialization.serdes.MetadataDeserializer;
+import org.apache.tika.serialization.serdes.MetadataSerializer;
+import org.apache.tika.serialization.serdes.ParseContextDeserializer;
+import org.apache.tika.serialization.serdes.ParseContextSerializer;
/**
* Jackson module that provides compact serialization for Tika components.
@@ -103,6 +110,7 @@ public class TikaModule extends SimpleModule {
COMPACT_FORMAT_INTERFACES.add(EmbeddedDocumentExtractorFactory.class);
COMPACT_FORMAT_INTERFACES.add(MetadataWriteFilterFactory.class);
COMPACT_FORMAT_INTERFACES.add(ContentHandlerDecoratorFactory.class);
+ COMPACT_FORMAT_INTERFACES.add(ContentHandlerFactory.class);
}
/**
@@ -120,6 +128,14 @@ private static boolean usesCompactFormat(Class<?> type) {
public TikaModule() {
super("TikaModule");
+
+ // Register Metadata serializers
+ addSerializer(Metadata.class, new MetadataSerializer());
+ addDeserializer(Metadata.class, new MetadataDeserializer());
+
+ // Register ParseContext serializers
+ addSerializer(ParseContext.class, new ParseContextSerializer());
+ addDeserializer(ParseContext.class, new ParseContextDeserializer());
}
/**
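
With the constructor changes above, registering TikaModule on a plain ObjectMapper is enough to (de)serialize Metadata and ParseContext. A minimal sketch, assuming no extra mapper configuration is required:

    import com.fasterxml.jackson.databind.ObjectMapper;

    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.serialization.TikaModule;

    public class TikaModuleExample {
        public static void main(String[] args) throws Exception {
            ObjectMapper mapper = new ObjectMapper().registerModule(new TikaModule());

            ParseContext context = new ParseContext();
            String json = mapper.writeValueAsString(context);
            ParseContext back = mapper.readValue(json, ParseContext.class);
            System.out.println(json + " -> " + (back != null));
        }
    }
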
diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java
index 997fe6e4fed..2dcf2961042 100644
--- a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java
+++ b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java
@@ -21,6 +21,7 @@
import java.io.IOException;
import java.util.Iterator;
+import java.util.Optional;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.databind.DeserializationContext;
@@ -30,6 +31,7 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import org.apache.tika.config.loader.ComponentInfo;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.serialization.ComponentNameResolver;
@@ -122,19 +124,22 @@ private static void deserializeTypedObjects(JsonNode typedNode, ParseContext par
JsonNode configNode = typedNode.get(componentName);
Class<?> configClass = null;
+ Class<?> contextKeyClass = null;
// First, try component registry lookup (for friendly names like "pdf-parser-config")
- try {
- configClass = ComponentNameResolver.resolveClass(
- componentName, ParseContextDeserializer.class.getClassLoader());
- } catch (ClassNotFoundException e) {
- // Not in registry, try as FQCN
+ Optional<ComponentInfo> infoOpt = ComponentNameResolver.getComponentInfo(componentName);
+ if (infoOpt.isPresent()) {
+ ComponentInfo info = infoOpt.get();
+ configClass = info.componentClass();
+ contextKeyClass = info.contextKey();
}
// If not found in registry, try as fully qualified class name
if (configClass == null) {
try {
configClass = Class.forName(componentName);
+ // Check if the class has a contextKey via its annotation
+ contextKeyClass = ComponentNameResolver.getContextKey(configClass);
} catch (ClassNotFoundException e) {
LOG.warn("Could not find class for typed component '{}', storing as JSON config",
componentName);
@@ -144,11 +149,15 @@ private static void deserializeTypedObjects(JsonNode typedNode, ParseContext par
}
}
+ // Use contextKey if available, otherwise use the config class itself
+ Class<?> parseContextKey = (contextKeyClass != null) ? contextKeyClass : configClass;
+
// Deserialize and add to context
try {
Object config = mapper.treeToValue(configNode, configClass);
- parseContext.set((Class) configClass, config);
- LOG.debug("Deserialized typed object '{}' -> {}", componentName, configClass.getName());
+ parseContext.set((Class) parseContextKey, config);
+ LOG.debug("Deserialized typed object '{}' -> {} (contextKey={})",
+ componentName, configClass.getName(), parseContextKey.getName());
} catch (Exception e) {
LOG.warn("Failed to deserialize typed component '{}' as {}, storing as JSON config",
componentName, configClass.getName(), e);
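
A hedged sketch of the lookup order this hunk implements: friendly name in the component registry first, then FQCN, with the entry stored under the component's contextKey when one is declared. Whether the typed lookup resolves without an explicit ParseContextUtils.resolveAll(...) pass is an assumption here; the new test later in this diff performs that pass explicitly.

    import com.fasterxml.jackson.databind.ObjectMapper;

    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.parser.PasswordProvider;
    import org.apache.tika.serialization.TikaModule;

    public class ContextKeyDeserializationSketch {
        public static void main(String[] args) throws Exception {
            // "simple-password-provider" resolves via the registry; its contextKey
            // (PasswordProvider.class) decides the ParseContext key it is stored under.
            String json = """
                {
                  "simple-password-provider": {
                    "password": "secret123"
                  }
                }
                """;
            ObjectMapper mapper = new ObjectMapper().registerModule(new TikaModule());
            ParseContext context = mapper.readValue(json, ParseContext.class);
            PasswordProvider provider = context.get(PasswordProvider.class);
            System.out.println(provider == null ? "unresolved" : "resolved via contextKey");
        }
    }
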
diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java
index 903c48f3e7c..e2545d4033a 100644
--- a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java
+++ b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java
@@ -49,19 +49,25 @@ public class ParseContextSerializer extends JsonSerializer<ParseContext> {
public static final String PARSE_CONTEXT = "parseContext";
public static final String TYPED = "typed";
+ // Plain mapper for serializing values without TikaModule's component wrapping
+ private static final ObjectMapper PLAIN_MAPPER = new ObjectMapper();
+
+ static {
+ // Allow serialization of classes with no properties
+ PLAIN_MAPPER.disable(com.fasterxml.jackson.databind.SerializationFeature.FAIL_ON_EMPTY_BEANS);
+ }
+
@Override
public void serialize(ParseContext parseContext, JsonGenerator gen,
SerializerProvider serializers) throws IOException {
gen.writeStartObject();
- ObjectMapper mapper = (ObjectMapper) gen.getCodec();
-
// First, serialize typed objects from the context map under "typed" key
Map<String, Object> contextMap = parseContext.getContextMap();
boolean hasTypedObjects = false;
for (Map.Entry<String, Object> entry : contextMap.entrySet()) {
- String className = entry.getKey();
+ String keyClassName = entry.getKey();
Object value = entry.getValue();
// Skip null values
@@ -69,10 +75,14 @@ public void serialize(ParseContext parseContext, JsonGenerator gen,
continue;
}
- // Try to find a friendly component name, otherwise use FQCN
- String keyName = findComponentName(className);
+ // Use the actual value's class for serialization, not the key class (which may be an interface)
+ // This ensures we can deserialize back to the concrete class
+ String valueClassName = value.getClass().getName();
+
+ // Try to find a friendly component name for the value's class, otherwise use FQCN
+ String keyName = findComponentName(valueClassName);
if (keyName == null) {
- keyName = className;
+ keyName = valueClassName;
}
if (!hasTypedObjects) {
@@ -81,7 +91,7 @@ public void serialize(ParseContext parseContext, JsonGenerator gen,
hasTypedObjects = true;
}
gen.writeFieldName(keyName);
- gen.writeRawValue(mapper.writeValueAsString(value));
+ gen.writeRawValue(PLAIN_MAPPER.writeValueAsString(value));
}
if (hasTypedObjects) {
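
A short sketch of the serializer behavior after this change: an entry keyed by an interface (here ContentHandlerFactory) is written under the value's concrete class, using its friendly name when one is registered, so it can be deserialized back to the concrete type. The expected output shape is an assumption based on the config example earlier in this patch.

    import com.fasterxml.jackson.databind.ObjectMapper;

    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.sax.BasicContentHandlerFactory;
    import org.apache.tika.sax.ContentHandlerFactory;
    import org.apache.tika.serialization.TikaModule;

    public class ParseContextSerializationSketch {
        public static void main(String[] args) throws Exception {
            ParseContext context = new ParseContext();
            // The key is the interface; the serializer records the value's concrete class.
            context.set(ContentHandlerFactory.class,
                    new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));

            ObjectMapper mapper = new ObjectMapper().registerModule(new TikaModule());
            // Roughly: {"typed":{"basic-content-handler-factory":{...}}}
            System.out.println(mapper.writeValueAsString(context));
        }
    }
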
diff --git a/tika-serialization/src/test/java/org/apache/tika/config/loader/ConfigLoaderTest.java b/tika-serialization/src/test/java/org/apache/tika/config/loader/ConfigLoaderTest.java
index 80063b151d5..1db87866e7f 100644
--- a/tika-serialization/src/test/java/org/apache/tika/config/loader/ConfigLoaderTest.java
+++ b/tika-serialization/src/test/java/org/apache/tika/config/loader/ConfigLoaderTest.java
@@ -51,9 +51,9 @@ public void setUp() throws Exception {
// ==================== Test POJOs ====================
/**
- * Simple config POJO with properties.
+ * Simple config POJO with properties for testing config loading.
*/
- public static class HandlerConfig {
+ public static class RetryConfig {
private int timeout;
private int retries;
private boolean enabled;
@@ -185,7 +185,7 @@ public abstract static class AbstractHandler implements TestHandler {
@Test
public void testLoadByExplicitKey() throws Exception {
- HandlerConfig config = configLoader.load("handler-config", HandlerConfig.class);
+ RetryConfig config = configLoader.load("retry-config", RetryConfig.class);
assertNotNull(config);
assertEquals(5000, config.getTimeout());
@@ -195,7 +195,7 @@ public void testLoadByExplicitKey() throws Exception {
@Test
public void testLoadByClassNameKebabCase() throws Exception {
- HandlerConfig config = configLoader.load(HandlerConfig.class);
+ RetryConfig config = configLoader.load(RetryConfig.class);
assertNotNull(config);
assertEquals(5000, config.getTimeout());
@@ -224,20 +224,20 @@ public void testLoadByClassNameMyFeatureSettings() throws Exception {
@Test
public void testLoadWithDefaultValue() throws Exception {
- HandlerConfig config = configLoader.load("handler-config", HandlerConfig.class);
+ RetryConfig config = configLoader.load("retry-config", RetryConfig.class);
assertNotNull(config);
// Non-existent key with default
- HandlerConfig defaultConfig = new HandlerConfig();
+ RetryConfig defaultConfig = new RetryConfig();
defaultConfig.setTimeout(9999);
- HandlerConfig result = configLoader.load("non-existent", HandlerConfig.class, defaultConfig);
+ RetryConfig result = configLoader.load("non-existent", RetryConfig.class, defaultConfig);
assertEquals(9999, result.getTimeout());
}
@Test
public void testLoadMissingKeyReturnsNull() throws Exception {
- HandlerConfig config = configLoader.load("non-existent-key", HandlerConfig.class);
+ RetryConfig config = configLoader.load("non-existent-key", RetryConfig.class);
assertNull(config);
}
@@ -312,7 +312,7 @@ public void testLoadProhibitedKeyMetadataFilters() throws Exception {
@Test
public void testHasKey() throws Exception {
- assertTrue(configLoader.hasKey("handler-config"));
+ assertTrue(configLoader.hasKey("retry-config"));
assertTrue(configLoader.hasKey("simple-handler"));
assertFalse(configLoader.hasKey("non-existent"));
}
@@ -350,10 +350,10 @@ public void testLoadWithUnexpectedFieldFails() throws Exception {
TikaLoader loader = TikaLoader.load(configPath);
TikaConfigException ex = assertThrows(TikaConfigException.class, () ->
- loader.configs().load("handler-config", HandlerConfig.class));
+ loader.configs().load("retry-config", RetryConfig.class));
// Should contain information about the unrecognized field
- assertTrue(ex.getMessage().contains("handler-config") ||
+ assertTrue(ex.getMessage().contains("retry-config") ||
ex.getCause().getMessage().contains("Unrecognized") ||
ex.getCause().getMessage().contains("unexpectedField"),
"Exception should mention the unrecognized field");
@@ -370,7 +370,7 @@ public void testKebabCaseConversion() throws Exception {
@Test
public void testLoadByClassWithDefault() throws Exception {
- HandlerConfig config = configLoader.load(HandlerConfig.class);
+ RetryConfig config = configLoader.load(RetryConfig.class);
assertNotNull(config);
// Non-existent class
@@ -394,14 +394,14 @@ public void testLoadWithDefaultsPartialConfig() throws Exception {
TikaLoader loader = TikaLoader.load(configPath);
// Set up defaults
- HandlerConfig defaults = new HandlerConfig();
+ RetryConfig defaults = new RetryConfig();
defaults.setTimeout(30000);
defaults.setRetries(2);
defaults.setEnabled(false);
// JSON only has: { "enabled": true }
- HandlerConfig config = loader.configs().loadWithDefaults("handler-config",
- HandlerConfig.class,
+ RetryConfig config = loader.configs().loadWithDefaults("retry-config",
+ RetryConfig.class,
defaults);
assertNotNull(config);
@@ -417,14 +417,14 @@ public void testLoadWithDefaultsFullOverride() throws Exception {
getClass().getResource("/configs/test-partial-config.json").toURI());
TikaLoader loader = TikaLoader.load(configPath);
- HandlerConfig defaults = new HandlerConfig();
+ RetryConfig defaults = new RetryConfig();
defaults.setTimeout(30000);
defaults.setRetries(2);
defaults.setEnabled(false);
// JSON has: { "timeout": 10000, "retries": 5, "enabled": false }
- HandlerConfig config = loader.configs().loadWithDefaults("handler-config-full",
- HandlerConfig.class,
+ RetryConfig config = loader.configs().loadWithDefaults("retry-config-full",
+ RetryConfig.class,
defaults);
assertNotNull(config);
@@ -436,13 +436,13 @@ public void testLoadWithDefaultsFullOverride() throws Exception {
@Test
public void testLoadWithDefaultsMissingKey() throws Exception {
// When key doesn't exist, should return original defaults unchanged
- HandlerConfig defaults = new HandlerConfig();
+ RetryConfig defaults = new RetryConfig();
defaults.setTimeout(30000);
defaults.setRetries(2);
defaults.setEnabled(false);
- HandlerConfig config = configLoader.loadWithDefaults("non-existent-key",
- HandlerConfig.class,
+ RetryConfig config = configLoader.loadWithDefaults("non-existent-key",
+ RetryConfig.class,
defaults);
assertNotNull(config);
@@ -458,13 +458,13 @@ public void testLoadWithDefaultsByClass() throws Exception {
getClass().getResource("/configs/test-partial-config.json").toURI());
TikaLoader loader = TikaLoader.load(configPath);
- HandlerConfig defaults = new HandlerConfig();
+ RetryConfig defaults = new RetryConfig();
defaults.setTimeout(30000);
defaults.setRetries(2);
defaults.setEnabled(false);
- // Uses kebab-case: HandlerConfig -> "handler-config"
- HandlerConfig config = loader.configs().loadWithDefaults(HandlerConfig.class, defaults);
+ // Uses kebab-case: RetryConfig -> "retry-config"
+ RetryConfig config = loader.configs().loadWithDefaults(RetryConfig.class, defaults);
assertNotNull(config);
assertEquals(30000, config.getTimeout());
@@ -479,20 +479,20 @@ public void testLoadVsLoadWithDefaults() throws Exception {
getClass().getResource("/configs/test-partial-config.json").toURI());
TikaLoader loader = TikaLoader.load(configPath);
- HandlerConfig defaults = new HandlerConfig();
+ RetryConfig defaults = new RetryConfig();
defaults.setTimeout(30000);
defaults.setRetries(2);
defaults.setEnabled(false);
// Using load() - creates new object, loses defaults
- HandlerConfig config1 = loader.configs().load("handler-config", HandlerConfig.class);
+ RetryConfig config1 = loader.configs().load("retry-config", RetryConfig.class);
assertEquals(0, config1.getTimeout()); // ❌ Lost default!
assertEquals(0, config1.getRetries()); // ❌ Lost default!
assertTrue(config1.isEnabled()); // ✅ From JSON
// Using loadWithDefaults() - merges into defaults
- HandlerConfig config2 = loader.configs().loadWithDefaults("handler-config",
- HandlerConfig.class,
+ RetryConfig config2 = loader.configs().loadWithDefaults("retry-config",
+ RetryConfig.class,
defaults);
assertEquals(30000, config2.getTimeout()); // ✅ Kept default!
assertEquals(2, config2.getRetries()); // ✅ Kept default!
@@ -508,14 +508,14 @@ public void testLoadWithDefaultsDoesNotMutateOriginal() throws Exception {
getClass().getResource("/configs/test-partial-config.json").toURI());
TikaLoader loader = TikaLoader.load(configPath);
- HandlerConfig defaults = new HandlerConfig();
+ RetryConfig defaults = new RetryConfig();
defaults.setTimeout(30000);
defaults.setRetries(2);
defaults.setEnabled(false);
// Load config with partial override (JSON only has "enabled": true)
- HandlerConfig result = loader.configs().loadWithDefaults("handler-config",
- HandlerConfig.class,
+ RetryConfig result = loader.configs().loadWithDefaults("retry-config",
+ RetryConfig.class,
defaults);
// Verify result has merged values
@@ -541,17 +541,17 @@ public void testLoadWithDefaultsReusableDefaults() throws Exception {
getClass().getResource("/configs/test-partial-config.json").toURI());
TikaLoader loader = TikaLoader.load(configPath);
- HandlerConfig defaults = new HandlerConfig();
+ RetryConfig defaults = new RetryConfig();
defaults.setTimeout(30000);
defaults.setRetries(2);
defaults.setEnabled(false);
// Load multiple times with same defaults
- HandlerConfig config1 = loader.configs().loadWithDefaults("handler-config",
- HandlerConfig.class,
+ RetryConfig config1 = loader.configs().loadWithDefaults("retry-config",
+ RetryConfig.class,
defaults);
- HandlerConfig config2 = loader.configs().loadWithDefaults("handler-config-full",
- HandlerConfig.class,
+ RetryConfig config2 = loader.configs().loadWithDefaults("retry-config-full",
+ RetryConfig.class,
defaults);
// Verify results are different
@@ -564,8 +564,8 @@ public void testLoadWithDefaultsReusableDefaults() throws Exception {
assertFalse(defaults.isEnabled());
// Use defaults one more time
- HandlerConfig config3 = loader.configs().loadWithDefaults("non-existent",
- HandlerConfig.class,
+ RetryConfig config3 = loader.configs().loadWithDefaults("non-existent",
+ RetryConfig.class,
defaults);
assertEquals(defaults, config3); // Should return original when key missing
}
@@ -595,13 +595,13 @@ public void testLoadWithDefaultsComplexObjectImmutability() throws Exception {
@Test
public void testLoadWithDefaultsMissingKeyDoesNotClone() throws Exception {
// When key is missing, should return the original object (no unnecessary cloning)
- HandlerConfig defaults = new HandlerConfig();
+ RetryConfig defaults = new RetryConfig();
defaults.setTimeout(30000);
defaults.setRetries(2);
defaults.setEnabled(false);
- HandlerConfig result = configLoader.loadWithDefaults("non-existent-key",
- HandlerConfig.class,
+ RetryConfig result = configLoader.loadWithDefaults("non-existent-key",
+ RetryConfig.class,
defaults);
// Should return the exact same object when key is missing
@@ -619,17 +619,17 @@ public void testLoadWithDefaultsThreadSafety() throws Exception {
TikaLoader loader = TikaLoader.load(configPath);
// Shared defaults object
- HandlerConfig sharedDefaults = new HandlerConfig();
+ RetryConfig sharedDefaults = new RetryConfig();
sharedDefaults.setTimeout(30000);
sharedDefaults.setRetries(2);
sharedDefaults.setEnabled(false);
// Simulate concurrent usage (not a real concurrency test, just demonstrates safety)
- HandlerConfig result1 = loader.configs().loadWithDefaults("handler-config",
- HandlerConfig.class,
+ RetryConfig result1 = loader.configs().loadWithDefaults("retry-config",
+ RetryConfig.class,
sharedDefaults);
- HandlerConfig result2 = loader.configs().loadWithDefaults("handler-config-full",
- HandlerConfig.class,
+ RetryConfig result2 = loader.configs().loadWithDefaults("retry-config-full",
+ RetryConfig.class,
sharedDefaults);
// Both results should be valid
diff --git a/tika-serialization/src/test/java/org/apache/tika/sax/UppercasingContentHandlerFactory.java b/tika-serialization/src/test/java/org/apache/tika/sax/UppercasingContentHandlerFactory.java
index cf0c56043f8..2826320979e 100644
--- a/tika-serialization/src/test/java/org/apache/tika/sax/UppercasingContentHandlerFactory.java
+++ b/tika-serialization/src/test/java/org/apache/tika/sax/UppercasingContentHandlerFactory.java
@@ -16,9 +16,6 @@
*/
package org.apache.tika.sax;
-import java.io.OutputStream;
-import java.nio.charset.Charset;
-
import org.xml.sax.ContentHandler;
import org.apache.tika.config.TikaComponent;
@@ -36,17 +33,7 @@ public class UppercasingContentHandlerFactory implements ContentHandlerFactory {
private static final long serialVersionUID = 1L;
@Override
- public ContentHandler getNewContentHandler() {
+ public ContentHandler createHandler() {
return new UppercasingContentHandler(new ToTextContentHandler());
}
-
- @Override
- public ContentHandler getNewContentHandler(OutputStream os, Charset charset) {
- try {
- return new UppercasingContentHandler(new ToTextContentHandler(os, charset.name()));
- } catch (java.io.UnsupportedEncodingException e) {
- // Should never happen since we're using a valid Charset
- throw new RuntimeException("Unexpected encoding error", e);
- }
- }
}
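
With the OutputStream/Charset overload removed, createHandler() is the whole ContentHandlerFactory contract. A wiring sketch (not from the patch) showing a factory handed to RecursiveParserWrapperHandler, as the server code below now does:

    import org.apache.tika.sax.BasicContentHandlerFactory;
    import org.apache.tika.sax.ContentHandlerFactory;
    import org.apache.tika.sax.RecursiveParserWrapperHandler;

    public class FactoryWiringSketch {
        public static void main(String[] args) {
            ContentHandlerFactory factory =
                    new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1);
            // -1 = no limit on embedded resources.
            RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(factory, -1);
            System.out.println(handler.getClass().getSimpleName()
                    + " uses " + factory.getClass().getSimpleName());
        }
    }
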
diff --git a/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java b/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java
index 43c2b9cd74a..c8fd0e42210 100644
--- a/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java
+++ b/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java
@@ -39,6 +39,8 @@
import org.apache.tika.metadata.filter.MetadataFilter;
import org.apache.tika.metadata.filter.MockUpperCaseFilter;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.SimplePasswordProvider;
import org.apache.tika.serialization.serdes.ParseContextDeserializer;
import org.apache.tika.serialization.serdes.ParseContextSerializer;
@@ -313,4 +315,31 @@ public void testContextKeyDeserialization() throws Exception {
assertFalse(selector.select(new org.apache.tika.metadata.Metadata()),
"SkipEmbeddedDocumentSelector should return false for all documents");
}
+
+ @Test
+ public void testSimplePasswordProviderDeserialization() throws Exception {
+ // Test that SimplePasswordProvider with contextKey=PasswordProvider.class
+ // is stored in ParseContext with the contextKey
+ String json = """
+ {
+ "simple-password-provider": {
+ "password": "secret123"
+ }
+ }
+ """;
+
+ ObjectMapper mapper = createMapper();
+ ParseContext deserialized = mapper.readValue(json, ParseContext.class);
+
+ // Resolve the config
+ ParseContextUtils.resolveAll(deserialized, Thread.currentThread().getContextClassLoader());
+
+ // Should be accessible via PasswordProvider.class (the contextKey)
+ PasswordProvider provider = deserialized.get(PasswordProvider.class);
+ assertNotNull(provider, "PasswordProvider should be found via contextKey");
+ assertTrue(provider instanceof SimplePasswordProvider,
+ "Should be SimplePasswordProvider instance");
+ assertEquals("secret123", provider.getPassword(null),
+ "Password should match the configured value");
+ }
}
diff --git a/tika-serialization/src/test/resources/configs/test-config-loader.json b/tika-serialization/src/test/resources/configs/test-config-loader.json
index 5305f2a43a9..dd657c81e05 100644
--- a/tika-serialization/src/test/resources/configs/test-config-loader.json
+++ b/tika-serialization/src/test/resources/configs/test-config-loader.json
@@ -4,7 +4,7 @@
],
"other-configs": {
- "handler-config": {
+ "retry-config": {
"timeout": 5000,
"retries": 3,
"enabled": true
diff --git a/tika-serialization/src/test/resources/configs/test-partial-config.json b/tika-serialization/src/test/resources/configs/test-partial-config.json
index 866f2594b7c..5c5eab6992a 100644
--- a/tika-serialization/src/test/resources/configs/test-partial-config.json
+++ b/tika-serialization/src/test/resources/configs/test-partial-config.json
@@ -1,10 +1,10 @@
{
"other-configs": {
- "handler-config": {
+ "retry-config": {
"enabled": true
},
- "handler-config-full": {
+ "retry-config-full": {
"timeout": 10000,
"retries": 5,
"enabled": false
diff --git a/tika-serialization/src/test/resources/configs/test-unexpected-field.json b/tika-serialization/src/test/resources/configs/test-unexpected-field.json
index d250d5fa1d3..5946b399ea9 100644
--- a/tika-serialization/src/test/resources/configs/test-unexpected-field.json
+++ b/tika-serialization/src/test/resources/configs/test-unexpected-field.json
@@ -1,6 +1,6 @@
{
"other-configs": {
- "handler-config": {
+ "retry-config": {
"timeout": 5000,
"retries": 3,
"enabled": true,
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
index d215552db17..698241cc3bb 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
@@ -45,8 +45,9 @@
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.RecursiveParserWrapper;
-import org.apache.tika.pipes.api.HandlerConfig;
+import org.apache.tika.pipes.api.ParseMode;
import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.ContentHandlerFactory;
import org.apache.tika.sax.RecursiveParserWrapperHandler;
import org.apache.tika.server.core.MetadataList;
import org.apache.tika.server.core.TikaServerParseException;
@@ -59,7 +60,7 @@ public class RecursiveMetadataResource {
private static final Logger LOG = LoggerFactory.getLogger(RecursiveMetadataResource.class);
public static List<Metadata> parseMetadata(TikaInputStream tis, Metadata metadata, MultivaluedMap<String, String> httpHeaders,
- UriInfo info, HandlerConfig handlerConfig)
+ UriInfo info, ServerHandlerConfig handlerConfig)
throws Exception {
final ParseContext context = new ParseContext();
@@ -69,10 +70,16 @@ public static List parseMetadata(TikaInputStream tis, Metadata metadat
fillMetadata(parser, metadata, httpHeaders);
TikaResource.logRequest(LOG, "/rmeta", metadata);
- BasicContentHandlerFactory.HANDLER_TYPE type = handlerConfig.getType();
+ // Check if a ContentHandlerFactory was provided in ParseContext
+ ContentHandlerFactory factory = context.get(ContentHandlerFactory.class);
+ if (factory == null) {
+ // Fall back to creating one from HTTP headers
+ BasicContentHandlerFactory.HANDLER_TYPE type = handlerConfig.type();
+ factory = new BasicContentHandlerFactory(type, handlerConfig.writeLimit(), handlerConfig.throwOnWriteLimitReached(), context);
+ }
RecursiveParserWrapperHandler handler =
- new RecursiveParserWrapperHandler(new BasicContentHandlerFactory(type, handlerConfig.getWriteLimit(), handlerConfig.isThrowOnWriteLimitReached(), context),
- handlerConfig.getMaxEmbeddedResources());
+ new RecursiveParserWrapperHandler(factory,
+ handlerConfig.maxEmbeddedResources());
try {
TikaResource.parse(wrapper, LOG, "/rmeta", tis, handler, metadata, context);
} catch (TikaServerParseException e) {
@@ -90,7 +97,7 @@ public static List parseMetadata(TikaInputStream tis, Metadata metadat
return metadataList;
}
- static HandlerConfig buildHandlerConfig(MultivaluedMap<String, String> httpHeaders, String handlerTypeName, HandlerConfig.PARSE_MODE parseMode) {
+ static ServerHandlerConfig buildHandlerConfig(MultivaluedMap<String, String> httpHeaders, String handlerTypeName, ParseMode parseMode) {
int writeLimit = -1;
if (httpHeaders.containsKey("writeLimit")) {
writeLimit = Integer.parseInt(httpHeaders.getFirst("writeLimit"));
@@ -100,7 +107,7 @@ static HandlerConfig buildHandlerConfig(MultivaluedMap httpHeade
if (httpHeaders.containsKey("maxEmbeddedResources")) {
maxEmbeddedResources = Integer.parseInt(httpHeaders.getFirst("maxEmbeddedResources"));
}
- return new HandlerConfig(BasicContentHandlerFactory.parseHandlerType(handlerTypeName, DEFAULT_HANDLER_TYPE), parseMode, writeLimit, maxEmbeddedResources,
+ return new ServerHandlerConfig(BasicContentHandlerFactory.parseHandlerType(handlerTypeName, DEFAULT_HANDLER_TYPE), parseMode, writeLimit, maxEmbeddedResources,
TikaResource.getThrowOnWriteLimitReached(httpHeaders));
}
@@ -136,7 +143,7 @@ public Response getMetadataFromMultipart(Attachment att, @Context UriInfo info,
try (TikaInputStream tis = TikaInputStream.get(att.getObject(InputStream.class))) {
return Response
.ok(parseMetadataToMetadataList(tis, new Metadata(), att.getHeaders(), info,
- buildHandlerConfig(att.getHeaders(), handlerTypeName, HandlerConfig.PARSE_MODE.RMETA)))
+ buildHandlerConfig(att.getHeaders(), handlerTypeName, ParseMode.RMETA)))
.build();
}
}
@@ -163,21 +170,27 @@ public Response getMetadataWithConfig(
return Response
.ok(parseMetadataWithContext(tis, metadata, httpHeaders.getRequestHeaders(), info,
- buildHandlerConfig(httpHeaders.getRequestHeaders(), handlerTypeName != null ? handlerTypeName.substring(1) : null, HandlerConfig.PARSE_MODE.RMETA),
+ buildHandlerConfig(httpHeaders.getRequestHeaders(), handlerTypeName != null ? handlerTypeName.substring(1) : null, ParseMode.RMETA),
context))
.build();
}
}
private MetadataList parseMetadataWithContext(TikaInputStream tis, Metadata metadata, MultivaluedMap<String, String> httpHeaders,
- UriInfo info, HandlerConfig handlerConfig, ParseContext context) throws Exception {
+ UriInfo info, ServerHandlerConfig handlerConfig, ParseContext context) throws Exception {
Parser parser = TikaResource.createParser();
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser);
- BasicContentHandlerFactory.HANDLER_TYPE type = handlerConfig.getType();
+ // Check if a ContentHandlerFactory was provided in ParseContext (e.g., from config JSON)
+ ContentHandlerFactory factory = context.get(ContentHandlerFactory.class);
+ if (factory == null) {
+ // Fall back to creating one from HTTP headers
+ BasicContentHandlerFactory.HANDLER_TYPE type = handlerConfig.type();
+ factory = new BasicContentHandlerFactory(type, handlerConfig.writeLimit(), handlerConfig.throwOnWriteLimitReached(), context);
+ }
RecursiveParserWrapperHandler handler =
- new RecursiveParserWrapperHandler(new BasicContentHandlerFactory(type, handlerConfig.getWriteLimit(), handlerConfig.isThrowOnWriteLimitReached(), context),
- handlerConfig.getMaxEmbeddedResources());
+ new RecursiveParserWrapperHandler(factory,
+ handlerConfig.maxEmbeddedResources());
try {
TikaResource.parse(wrapper, LOG, "/rmeta/config", tis, handler, metadata, context);
} catch (TikaServerParseException e) {
@@ -225,12 +238,13 @@ public Response getMetadata(InputStream is, @Context HttpHeaders httpHeaders, @C
try (TikaInputStream tis = TikaResource.getInputStream(is, metadata, httpHeaders, info)) {
return Response
.ok(parseMetadataToMetadataList(tis, metadata, httpHeaders.getRequestHeaders(), info,
- buildHandlerConfig(httpHeaders.getRequestHeaders(), handlerTypeName, HandlerConfig.PARSE_MODE.RMETA)))
+ buildHandlerConfig(httpHeaders.getRequestHeaders(), handlerTypeName, ParseMode.RMETA)))
.build();
}
}
- private MetadataList parseMetadataToMetadataList(TikaInputStream tis, Metadata metadata, MultivaluedMap<String, String> httpHeaders, UriInfo info, HandlerConfig handlerConfig)
+ private MetadataList parseMetadataToMetadataList(TikaInputStream tis, Metadata metadata,
+ MultivaluedMap<String, String> httpHeaders, UriInfo info, ServerHandlerConfig handlerConfig)
throws Exception {
return new MetadataList(parseMetadata(tis, metadata, httpHeaders, info, handlerConfig));
}
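
A sketch of the precedence these handlers now follow: a ContentHandlerFactory already present in the ParseContext (for example contributed by posted config JSON) wins; otherwise one is built from the writeLimit/maxEmbeddedResources headers. Whether the "basic-content-handler-factory" section below lands under ContentHandlerFactory.class depends on its registered contextKey, which this sketch assumes rather than verifies.

    import com.fasterxml.jackson.databind.ObjectMapper;

    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.sax.BasicContentHandlerFactory;
    import org.apache.tika.sax.ContentHandlerFactory;
    import org.apache.tika.serialization.TikaModule;

    public class RmetaFactoryPrecedenceSketch {
        public static void main(String[] args) throws Exception {
            String configJson = """
                {
                  "basic-content-handler-factory": {
                    "type": "HTML",
                    "writeLimit": 100000
                  }
                }
                """;
            ObjectMapper mapper = new ObjectMapper().registerModule(new TikaModule());
            ParseContext context = mapper.readValue(configJson, ParseContext.class);

            ContentHandlerFactory factory = context.get(ContentHandlerFactory.class);
            if (factory == null) {
                // Header-driven fallback, mirroring parseMetadata()/parseMetadataWithContext().
                factory = new BasicContentHandlerFactory(
                        BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1);
            }
            System.out.println("Using " + factory.getClass().getSimpleName());
        }
    }
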
diff --git a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/pipesiterator/PipesIteratorConfig.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/ServerHandlerConfig.java
similarity index 58%
rename from tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/pipesiterator/PipesIteratorConfig.java
rename to tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/ServerHandlerConfig.java
index 09a9ab4abb3..b46802aecd1 100644
--- a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/pipesiterator/PipesIteratorConfig.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/ServerHandlerConfig.java
@@ -14,8 +14,21 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.pipes.api.pipesiterator;
+package org.apache.tika.server.core.resource;
-public interface PipesIteratorConfig {
- PipesIteratorBaseConfig getBaseConfig();
+import org.apache.tika.pipes.api.ParseMode;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+
+/**
+ * Server-internal configuration for request handlers.
+ * It holds the settings parsed from HTTP headers for a single request,
+ * i.e. the values that would otherwise be passed to a BasicContentHandlerFactory.
+ */
+public record ServerHandlerConfig(
+ BasicContentHandlerFactory.HANDLER_TYPE type,
+ ParseMode parseMode,
+ int writeLimit,
+ int maxEmbeddedResources,
+ boolean throwOnWriteLimitReached
+) {
}
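
A trivial construction sketch of the new record, using the same argument order buildHandlerConfig() assembles from HTTP headers; the literal values are placeholders, not defaults mandated by the patch.

    import org.apache.tika.pipes.api.ParseMode;
    import org.apache.tika.sax.BasicContentHandlerFactory;
    import org.apache.tika.server.core.resource.ServerHandlerConfig;

    public class ServerHandlerConfigSketch {
        public static void main(String[] args) {
            ServerHandlerConfig config = new ServerHandlerConfig(
                    BasicContentHandlerFactory.HANDLER_TYPE.TEXT,
                    ParseMode.RMETA,
                    -1,     // writeLimit: unlimited
                    -1,     // maxEmbeddedResources: unlimited
                    true);  // throwOnWriteLimitReached
            System.out.println(config.type() + " / " + config.parseMode());
        }
    }
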
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
index 9b312606b21..ad379252491 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
@@ -17,7 +17,6 @@
package org.apache.tika.server.core.resource;
import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig.DEFAULT_HANDLER_CONFIG;
import static org.apache.tika.server.core.resource.RecursiveMetadataResource.DEFAULT_HANDLER_TYPE;
import static org.apache.tika.server.core.resource.RecursiveMetadataResource.HANDLER_TYPE_PARAM;
@@ -77,6 +76,7 @@
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.ContentHandlerFactory;
import org.apache.tika.sax.ExpandedTitleContentHandler;
import org.apache.tika.sax.RichTextContentHandler;
import org.apache.tika.sax.boilerpipe.BoilerpipeContentHandler;
@@ -151,24 +151,22 @@ public static void mergeParseContextFromConfig(String configJson, ParseContext c
JsonNode root = mapper.readTree(configJson);
// Use root directly - the JSON should contain parser configs at the top level
ParseContext configuredContext = ParseContextDeserializer.readParseContext(root, mapper);
-
- // Copy jsonConfigs first (for SelfConfiguring parsers like PDFParser)
- for (Map.Entry entry : configuredContext.getJsonConfigs().entrySet()) {
- context.setJsonConfig(entry.getKey(), entry.getValue());
- }
-
- // Then resolve all configs to typed objects
ParseContextUtils.resolveAll(configuredContext, Thread.currentThread().getContextClassLoader());
-
- // Copy resolved typed objects from contextMap
+ // Copy resolved context entries
+ for (Map.Entry<String, Object> entry : configuredContext.getContextMap().entrySet()) {
try {
Class> clazz = Class.forName(entry.getKey());
context.set((Class) clazz, entry.getValue());
+ LOG.debug("Merged contextMap entry {} into context", entry.getKey());
} catch (ClassNotFoundException e) {
LOG.warn("Could not load class for parseContext entry: {}", entry.getKey());
}
}
+ // Copy jsonConfigs for lazy resolution by parsers (e.g., pdf-parser config)
+ for (Map.Entry entry : configuredContext.getJsonConfigs().entrySet()) {
+ context.setJsonConfig(entry.getKey(), entry.getValue().json());
+ LOG.debug("Merged jsonConfig entry {} into context", entry.getKey());
+ }
}
public static TikaInputStream getInputStream(InputStream is, Metadata metadata, HttpHeaders headers, UriInfo uriInfo) {
@@ -352,7 +350,8 @@ public static boolean getThrowOnWriteLimitReached(MultivaluedMap
throw new IllegalArgumentException("'throwOnWriteLimitReached' must be either 'true' or 'false'");
}
}
- return DEFAULT_HANDLER_CONFIG.isThrowOnWriteLimitReached();
+ // Default: throw on write limit reached
+ return true;
}
public static long getTaskTimeout(ParseContext parseContext) {
@@ -542,9 +541,14 @@ private void parseToMetadata(TikaInputStream tis, Metadata metadata, Multivalued
writeLimit = Integer.parseInt(httpHeaders.getFirst("writeLimit"));
}
- BasicContentHandlerFactory.HANDLER_TYPE type = BasicContentHandlerFactory.parseHandlerType(handlerTypeName, DEFAULT_HANDLER_TYPE);
- BasicContentHandlerFactory fact = new BasicContentHandlerFactory(type, writeLimit, throwOnWriteLimitReached, context);
- ContentHandler contentHandler = fact.getNewContentHandler();
+ // Check if a ContentHandlerFactory was provided in ParseContext (e.g., from config JSON)
+ ContentHandlerFactory fact = context.get(ContentHandlerFactory.class);
+ if (fact == null) {
+ // Fall back to creating one from HTTP headers
+ BasicContentHandlerFactory.HANDLER_TYPE type = BasicContentHandlerFactory.parseHandlerType(handlerTypeName, DEFAULT_HANDLER_TYPE);
+ fact = new BasicContentHandlerFactory(type, writeLimit, throwOnWriteLimitReached, context);
+ }
+ ContentHandler contentHandler = fact.createHandler();
try {
parse(parser, LOG, info.getPath(), tis, contentHandler, metadata, context);
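
A sketch of the merge order mergeParseContextFromConfig() now uses: typed entries are resolved and copied into the live context first, then raw jsonConfigs are copied for lazy, parser-side resolution. The "pdf-parser"/"extractInlineImages" section is only an illustrative payload, not something this patch defines.

    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.server.core.resource.TikaResource;

    public class MergeParseContextSketch {
        public static void main(String[] args) throws Exception {
            String configJson = """
                {
                  "pdf-parser": {
                    "extractInlineImages": true
                  }
                }
                """;
            ParseContext context = new ParseContext();
            // Typed entries land in the context map; unresolved sections remain as
            // jsonConfigs for the owning parser to consume lazily.
            TikaResource.mergeParseContextFromConfig(configJson, context);
            System.out.println(context.getJsonConfigs().keySet());
        }
    }
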
diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaPipesTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaPipesTest.java
index 7bff9149c01..ee899ab180e 100644
--- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaPipesTest.java
+++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaPipesTest.java
@@ -54,13 +54,14 @@
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.api.FetchEmitTuple;
-import org.apache.tika.pipes.api.HandlerConfig;
+import org.apache.tika.pipes.api.ParseMode;
import org.apache.tika.pipes.api.emitter.EmitKey;
import org.apache.tika.pipes.api.fetcher.FetchKey;
import org.apache.tika.pipes.core.fetcher.FetcherManager;
import org.apache.tika.pipes.core.serialization.JsonFetchEmitTuple;
import org.apache.tika.plugins.TikaPluginManager;
import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.ContentHandlerFactory;
import org.apache.tika.serialization.JsonMetadataList;
import org.apache.tika.server.core.resource.PipesResource;
import org.apache.tika.server.core.writer.JSONObjWriter;
@@ -203,8 +204,9 @@ public void testPostXML() throws Exception {
userMetadata.add("my-key-multi", s);
}
ParseContext parseContext = new ParseContext();
- HandlerConfig handlerConfig = new HandlerConfig(BasicContentHandlerFactory.HANDLER_TYPE.XML, HandlerConfig.PARSE_MODE.RMETA, -1, -1, true);
- parseContext.set(HandlerConfig.class, handlerConfig);
+ parseContext.set(ContentHandlerFactory.class,
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
+ parseContext.set(ParseMode.class, ParseMode.RMETA);
FetchEmitTuple t =
new FetchEmitTuple("myId", new FetchKey(FETCHER_ID, "hello_world.xml"),
new EmitKey(EMITTER_JSON_ID, ""), userMetadata, parseContext, FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT);
diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerAsyncIntegrationTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerAsyncIntegrationTest.java
index 97e10201ac6..fb7aaf554bd 100644
--- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerAsyncIntegrationTest.java
+++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerAsyncIntegrationTest.java
@@ -18,7 +18,6 @@
import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig.DEFAULT_HANDLER_CONFIG;
import static org.apache.tika.server.core.CXFTestBase.EMITTER_JSON_ID;
import static org.apache.tika.server.core.CXFTestBase.FETCHER_ID;
import static org.junit.jupiter.api.Assertions.assertFalse;
@@ -49,10 +48,12 @@
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.api.FetchEmitTuple;
-import org.apache.tika.pipes.api.HandlerConfig;
+import org.apache.tika.pipes.api.ParseMode;
import org.apache.tika.pipes.api.emitter.EmitKey;
import org.apache.tika.pipes.api.fetcher.FetchKey;
import org.apache.tika.pipes.core.serialization.JsonFetchEmitTupleList;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.ContentHandlerFactory;
@Disabled("useful for development...need to turn it into a real unit test")
public class TikaServerAsyncIntegrationTest extends IntegrationTestBase {
@@ -170,7 +171,9 @@ private JsonNode sendAsync(List fileNames) throws Exception {
private FetchEmitTuple getFetchEmitTuple(String fileName) throws IOException {
ParseContext parseContext = new ParseContext();
- parseContext.set(HandlerConfig.class, DEFAULT_HANDLER_CONFIG);
+ parseContext.set(ContentHandlerFactory.class,
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
+ parseContext.set(ParseMode.class, ParseMode.RMETA);
return new FetchEmitTuple(fileName, new FetchKey(FETCHER_ID, fileName), new EmitKey(EMITTER_JSON_ID, ""), new Metadata(), parseContext, ON_PARSE_EXCEPTION);
}
diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerPipesIntegrationTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerPipesIntegrationTest.java
index e7957c9df2d..bcbe5251c72 100644
--- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerPipesIntegrationTest.java
+++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerPipesIntegrationTest.java
@@ -18,7 +18,6 @@
import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig.DEFAULT_HANDLER_CONFIG;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -43,10 +42,12 @@
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.api.FetchEmitTuple;
-import org.apache.tika.pipes.api.HandlerConfig;
+import org.apache.tika.pipes.api.ParseMode;
import org.apache.tika.pipes.api.emitter.EmitKey;
import org.apache.tika.pipes.api.fetcher.FetchKey;
import org.apache.tika.pipes.core.serialization.JsonFetchEmitTuple;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.ContentHandlerFactory;
import org.apache.tika.utils.ProcessUtils;
public class TikaServerPipesIntegrationTest extends IntegrationTestBase {
@@ -221,7 +222,9 @@ private JsonNode testOneWithPerRequestTimeout(String fileName, long timeoutMilli
private String getJsonStringWithTimeout(String fileName, long timeoutMillis) throws IOException {
ParseContext parseContext = new ParseContext();
- parseContext.set(HandlerConfig.class, DEFAULT_HANDLER_CONFIG);
+ parseContext.set(ContentHandlerFactory.class,
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
+ parseContext.set(ParseMode.class, ParseMode.RMETA);
parseContext.setJsonConfig("tika-task-timeout", "{\"timeoutMillis\":" + timeoutMillis + "}");
FetchEmitTuple t = new FetchEmitTuple(fileName,
@@ -259,7 +262,9 @@ private JsonNode testOne(String fileName, boolean shouldFileExist, FetchEmitTupl
private String getJsonString(String fileName, FetchEmitTuple.ON_PARSE_EXCEPTION onParseException) throws IOException {
ParseContext parseContext = new ParseContext();
- parseContext.set(HandlerConfig.class, DEFAULT_HANDLER_CONFIG);
+ parseContext.set(ContentHandlerFactory.class,
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
+ parseContext.set(ParseMode.class, ParseMode.RMETA);
FetchEmitTuple t = new FetchEmitTuple(fileName, new FetchKey(CXFTestBase.FETCHER_ID, fileName),
new EmitKey(CXFTestBase.EMITTER_JSON_ID, ""), new Metadata(), parseContext, onParseException);
return JsonFetchEmitTuple.toJson(t);
diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java
index 8dcd90a296b..4946191552d 100644
--- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java
+++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java
@@ -427,8 +427,8 @@ public void testEmbeddedResourceLimit() throws Exception {
}
}
- // TIKA-3227 - TODO: re-enable once HandlerConfig is configurable via JSON
- // Use maxEmbeddedResources=0 in handler-config to skip embedded documents
+ // TIKA-3227 - TODO: re-enable once maxEmbeddedResources is configurable via JSON
+ // Use maxEmbeddedResources=0 in config to skip embedded documents
@Test
public void testWriteLimit() throws Exception {
diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java
index 3761fb0bfe2..078a83038e8 100644
--- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java
+++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java
@@ -16,7 +16,6 @@
*/
package org.apache.tika.server.standard;
-import static org.apache.tika.pipes.api.pipesiterator.PipesIteratorBaseConfig.DEFAULT_HANDLER_CONFIG;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
@@ -56,13 +55,15 @@
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.api.FetchEmitTuple;
-import org.apache.tika.pipes.api.HandlerConfig;
+import org.apache.tika.pipes.api.ParseMode;
import org.apache.tika.pipes.api.emitter.EmitKey;
import org.apache.tika.pipes.api.fetcher.FetchKey;
import org.apache.tika.pipes.core.extractor.EmbeddedDocumentBytesConfig;
import org.apache.tika.pipes.core.fetcher.FetcherManager;
import org.apache.tika.pipes.core.serialization.JsonFetchEmitTuple;
import org.apache.tika.plugins.TikaPluginManager;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.ContentHandlerFactory;
import org.apache.tika.serialization.JsonMetadataList;
import org.apache.tika.server.core.CXFTestBase;
import org.apache.tika.server.core.FetcherStreamFactory;
@@ -181,9 +182,8 @@ public void testBasic() throws Exception {
@Test
public void testConcatenated() throws Exception {
ParseContext parseContext = new ParseContext();
- // Use addConfig with JSON for handler-config
- parseContext.setJsonConfig("handler-config",
- "{\"type\": \"TEXT\", \"parseMode\": \"CONCATENATE\", \"writeLimit\": -1, \"maxEmbeddedResources\": -1, \"throwOnWriteLimitReached\": true}");
+ // Set ParseMode directly - it's now separate from ContentHandlerFactory
+ parseContext.set(ParseMode.class, ParseMode.CONCATENATE);
FetchEmitTuple t = new FetchEmitTuple("myId", new FetchKey(FETCHER_ID, "test_recursive_embedded.docx"),
new EmitKey(EMITTER_JSON_ID, ""), new Metadata(), parseContext,
@@ -247,7 +247,10 @@ public void testBytes() throws Exception {
config.setZeroPadName(10);
config.setSuffixStrategy(EmbeddedDocumentBytesConfig.SUFFIX_STRATEGY.EXISTING);
ParseContext parseContext = new ParseContext();
- parseContext.set(HandlerConfig.class, DEFAULT_HANDLER_CONFIG);
+ // Set default content handler and parse mode
+ parseContext.set(ContentHandlerFactory.class,
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
+ parseContext.set(ParseMode.class, ParseMode.RMETA);
parseContext.set(EmbeddedDocumentBytesConfig.class, config);
FetchEmitTuple t =
new FetchEmitTuple("myId", new FetchKey(FETCHER_ID, "test_recursive_embedded.docx"),