Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ public class TikaComponentProcessor extends AbstractProcessor {
SERVICE_INTERFACES.put("org.apache.tika.renderer.Renderer", "renderers");
SERVICE_INTERFACES.put("org.apache.tika.metadata.filter.MetadataFilter", "metadata-filters");
SERVICE_INTERFACES.put("org.apache.tika.digest.DigesterFactory", "digester-factories");
SERVICE_INTERFACES.put("org.apache.tika.sax.ContentHandlerFactory",
"content-handler-factories");
}

private Messager messager;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@
* <li>Component index files (META-INF/tika/{type}.idx) for name-based lookup</li>
* </ul>
*
* <p>This annotation is only used at compile time by the annotation processor.
* It is retained in .class files for tooling but not loaded by the runtime JVM.
* <p>This annotation is processed at compile time by the annotation processor.
* The contextKey is recorded in the .idx file for runtime resolution.
*
* <p>Example usage:
* <pre>
Expand Down
26 changes: 12 additions & 14 deletions tika-app/src/test/resources/configs/config-template.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,12 @@
{
"content-handler-factory": {
"basic-content-handler-factory": {
"type": "TEXT",
"writeLimit": -1,
"maxEmbeddedResources": -1,
"throwOnWriteLimitReached": true
}
},
"fetchers": {
"fsf": {
"file-system-fetcher": {
Expand All @@ -21,23 +29,13 @@
"file-system-pipes-iterator": {
"basePath": "FETCHER_BASE_PATH",
"countTotal": true,
"baseConfig": {
"fetcherId": "fsf",
"emitterId": "fse",
"handlerConfig": {
"type": "TEXT",
"parseMode": "RMETA",
"writeLimit": -1,
"maxEmbeddedResources": -1,
"throwOnWriteLimitReached": true
},
"onParseException": "EMIT",
"maxWaitMs": 600000,
"queueSize": 10000
}
"fetcherId": "fsf",
"emitterId": "fse"
}
},
"pipes": {
"parseMode": "RMETA",
"onParseException": "EMIT",
"emitWithinMillis": 10000,
"emitMaxEstimatedBytes": 100000,
"queueSize": 10000,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ public void parse(TikaInputStream tis, ContentHandler recursiveParserWrapperHand
new EmbeddedParserDecorator(getWrappedParser(), "/", "/", parserState);
context.set(Parser.class, decorator);
ContentHandler localHandler =
parserState.recursiveParserWrapperHandler.getNewContentHandler();
parserState.recursiveParserWrapperHandler.createHandler();
long started = System.currentTimeMillis();
parserState.recursiveParserWrapperHandler.startDocument();
int writeLimit = -1;
Expand Down Expand Up @@ -241,7 +241,7 @@ public void parse(TikaInputStream tis, ContentHandler ignore, Metadata metadata,
metadata.set(TikaCoreProperties.EMBEDDED_ID, parserState.embeddedCount);
//get a fresh handler
ContentHandler localHandler =
parserState.recursiveParserWrapperHandler.getNewContentHandler();
parserState.recursiveParserWrapperHandler.createHandler();
parserState.recursiveParserWrapperHandler.startEmbeddedDocument(localHandler, metadata);

Parser preContextParser = context.get(Parser.class);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ private void parse(TikaInputStream tis, ContentHandler handler,
// If not, the user will get text from every parser
// mushed together onto the one solitary handler...
if (handlerFactory != null) {
handler = handlerFactory.getNewContentHandler();
handler = handlerFactory.createHandler();
}

// Record that we used this parser
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,7 @@
*/
package org.apache.tika.sax;

import java.io.OutputStream;
import java.io.Serializable;
import java.nio.charset.Charset;

import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
Expand Down Expand Up @@ -55,12 +53,8 @@ public AbstractRecursiveParserWrapperHandler(ContentHandlerFactory contentHandle
this.maxEmbeddedResources = maxEmbeddedResources;
}

public ContentHandler getNewContentHandler() {
return contentHandlerFactory.getNewContentHandler();
}

public ContentHandler getNewContentHandler(OutputStream os, Charset charset) {
return contentHandlerFactory.getNewContentHandler(os, charset);
public ContentHandler createHandler() {
return contentHandlerFactory.createHandler();
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,19 +26,30 @@
import org.xml.sax.ContentHandler;
import org.xml.sax.helpers.DefaultHandler;

import org.apache.tika.config.TikaComponent;
import org.apache.tika.parser.ParseContext;

/**
* Basic factory for creating common types of ContentHandlers
* Basic factory for creating common types of ContentHandlers.
* <p>
* Implements {@link StreamingContentHandlerFactory} to support both in-memory
* content extraction and streaming output to an OutputStream.
*/
public class BasicContentHandlerFactory implements ContentHandlerFactory, WriteLimiter {
@TikaComponent(contextKey = ContentHandlerFactory.class)
public class BasicContentHandlerFactory implements StreamingContentHandlerFactory, WriteLimiter {

private final HANDLER_TYPE type;
private final int writeLimit;
private HANDLER_TYPE type = HANDLER_TYPE.TEXT;
private int writeLimit = -1;
private boolean throwOnWriteLimitReached = true;
private int maxEmbeddedResources = -1;
private transient ParseContext parseContext;

private final boolean throwOnWriteLimitReached;

private final ParseContext parseContext;
/**
* No-arg constructor for bean-style configuration (e.g., Jackson deserialization).
* <p>
* Leaves every field at its declared default: handler type {@code TEXT},
* {@code writeLimit} of -1 (unlimited), {@code throwOnWriteLimitReached} of
* {@code true}, {@code maxEmbeddedResources} of -1 (unlimited), and no parse
* context. Use the setters to change these values after construction.
*/
public BasicContentHandlerFactory() {
}

/**
* Create a BasicContentHandlerFactory with {@link #throwOnWriteLimitReached} set to true
Expand Down Expand Up @@ -70,7 +81,29 @@ public BasicContentHandlerFactory(HANDLER_TYPE type, int writeLimit,
throw new IllegalArgumentException("parse context must not be null if " +
"throwOnWriteLimitReached is false");
}
}

/**
 * Creates a factory with every setting supplied explicitly, including the
 * limit on embedded resources.
 *
 * @param type                     basic type of handler
 * @param writeLimit               maximum number of characters to store; -1 for unlimited
 * @param throwOnWriteLimitReached whether to throw when the write limit is reached
 * @param maxEmbeddedResources     maximum number of embedded resources to process;
 *                                 -1 for unlimited
 * @param parseContext             where warnings are stored if
 *                                 {@code throwOnWriteLimitReached} is {@code false}
 * @throws IllegalArgumentException if {@code throwOnWriteLimitReached} is {@code false}
 *                                  and {@code parseContext} is {@code null}
 */
public BasicContentHandlerFactory(HANDLER_TYPE type, int writeLimit,
                                  boolean throwOnWriteLimitReached, int maxEmbeddedResources,
                                  ParseContext parseContext) {
    // A non-throwing factory needs somewhere to record its warnings.
    if (!throwOnWriteLimitReached && parseContext == null) {
        throw new IllegalArgumentException("parse context must not be null if " +
                "throwOnWriteLimitReached is false");
    }
    this.type = type;
    this.writeLimit = writeLimit;
    this.throwOnWriteLimitReached = throwOnWriteLimitReached;
    this.maxEmbeddedResources = maxEmbeddedResources;
    this.parseContext = parseContext;
}

/**
Expand Down Expand Up @@ -108,7 +141,7 @@ public static HANDLER_TYPE parseHandlerType(String handlerTypeName, HANDLER_TYPE
}

@Override
public ContentHandler getNewContentHandler() {
public ContentHandler createHandler() {

if (type == HANDLER_TYPE.BODY) {
return new BodyContentHandler(
Expand Down Expand Up @@ -139,7 +172,7 @@ private ContentHandler getFormatHandler() {
}

@Override
public ContentHandler getNewContentHandler(OutputStream os, Charset charset) {
public ContentHandler createHandler(OutputStream os, Charset charset) {

if (type == HANDLER_TYPE.IGNORE) {
return new DefaultHandler();
Expand Down Expand Up @@ -191,6 +224,14 @@ public HANDLER_TYPE getType() {
return type;
}

/**
* Sets the handler type that this factory consults when creating content handlers.
*
* @param type the handler type
*/
public void setType(HANDLER_TYPE type) {
this.type = type;
}

/**
* Common handler types for content.
*/
Expand All @@ -203,8 +244,72 @@ public int getWriteLimit() {
return writeLimit;
}

/**
* Sets the write limit, i.e. the maximum number of characters to store.
*
* @param writeLimit max characters to extract; -1 for unlimited
*/
public void setWriteLimit(int writeLimit) {
this.writeLimit = writeLimit;
}

/**
* Reports whether this factory is configured to throw when the write limit is reached.
*
* @return the current value of the {@code throwOnWriteLimitReached} setting
*/
@Override
public boolean isThrowOnWriteLimitReached() {
return throwOnWriteLimitReached;
}

/**
* Sets whether to throw an exception when write limit is reached.
* <p>
* NOTE(review): the constructors reject {@code false} when no parse context is
* supplied, but this setter performs no such check. Callers passing {@code false}
* presumably also need to call {@link #setParseContext(ParseContext)} -- confirm.
*
* @param throwOnWriteLimitReached true to throw, false to silently stop
*/
public void setThrowOnWriteLimitReached(boolean throwOnWriteLimitReached) {
this.throwOnWriteLimitReached = throwOnWriteLimitReached;
}

/**
* Gets the maximum number of embedded resources to process.
* The field defaults to -1 when it has not been set explicitly.
*
* @return max embedded resources; -1 for unlimited
*/
public int getMaxEmbeddedResources() {
return maxEmbeddedResources;
}

/**
* Sets the maximum number of embedded resources to process.
* The value is stored as given; no range validation is performed here.
*
* @param maxEmbeddedResources max embedded resources; -1 for unlimited
*/
public void setMaxEmbeddedResources(int maxEmbeddedResources) {
this.maxEmbeddedResources = maxEmbeddedResources;
}

/**
* Sets the parse context for storing warnings when throwOnWriteLimitReached is false.
* <p>
* The backing field is declared {@code transient}, and it is not compared by
* {@link #equals(Object)} or included in {@link #hashCode()}.
*
* @param parseContext the parse context
*/
public void setParseContext(ParseContext parseContext) {
this.parseContext = parseContext;
}

/**
 * Two factories are equal when they are of exactly the same class and share the
 * same handler type, write limit, embedded-resource limit and
 * throw-on-write-limit flag. The parse context is not compared.
 *
 * @param o the object to compare against
 * @return {@code true} if {@code o} carries the same configuration
 */
@Override
public boolean equals(Object o) {
    if (o == this) {
        return true;
    }
    if (o == null || o.getClass() != getClass()) {
        return false;
    }
    BasicContentHandlerFactory other = (BasicContentHandlerFactory) o;
    if (type != other.type) {
        return false;
    }
    if (writeLimit != other.writeLimit) {
        return false;
    }
    if (maxEmbeddedResources != other.maxEmbeddedResources) {
        return false;
    }
    return throwOnWriteLimitReached == other.throwOnWriteLimitReached;
}

/**
 * Computes a hash from the handler type, write limit, throw-on-write-limit flag
 * and embedded-resource limit, folded in that order with a multiplier of 31.
 *
 * @return the hash code for this factory's configuration
 */
@Override
public int hashCode() {
    int typeHash = (type == null) ? 0 : type.hashCode();
    int throwFlag = throwOnWriteLimitReached ? 1 : 0;
    // Nested form of the usual "result = 31 * result + field" accumulation.
    return 31 * (31 * (31 * typeHash + writeLimit) + throwFlag) + maxEmbeddedResources;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,19 +16,27 @@
*/
package org.apache.tika.sax;


import java.io.OutputStream;
import java.io.Serializable;
import java.nio.charset.Charset;

import org.xml.sax.ContentHandler;

/**
* Interface to allow easier injection of code for getting a new ContentHandler
* Factory interface for creating ContentHandler instances.
* <p>
* This is the base interface used by tika-pipes, RecursiveParserWrapper, and other
* components that need to create content handlers for in-memory content extraction.
* <p>
* For streaming output to an OutputStream, see {@link StreamingContentHandlerFactory}.
*
* @see StreamingContentHandlerFactory
* @see BasicContentHandlerFactory
*/
public interface ContentHandlerFactory extends Serializable {
ContentHandler getNewContentHandler();

ContentHandler getNewContentHandler(OutputStream os, Charset charset);

/**
* Creates a new ContentHandler for extracting content.
*
* @return a new ContentHandler instance
*/
ContentHandler createHandler();
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.sax;

import java.io.OutputStream;
import java.nio.charset.Charset;

import org.xml.sax.ContentHandler;

/**
* Extended factory interface for creating ContentHandler instances that write
* directly to an OutputStream.
* <p>
* This interface extends {@link ContentHandlerFactory}, so implementations provide
* both the inherited no-argument {@code createHandler()} and the streaming variant
* declared here. The streaming capability is primarily used by tika-server's /tika
* endpoint for streaming responses back to clients.
* <p>
* {@link BasicContentHandlerFactory} implements this interface.
*
* @see ContentHandlerFactory
* @see BasicContentHandlerFactory
*/
public interface StreamingContentHandlerFactory extends ContentHandlerFactory {

/**
* Creates a new ContentHandler that writes output directly to the given OutputStream.
* <p>
* NOTE(review): this contract does not say who closes or flushes the stream, or
* whether {@code null} arguments are accepted -- confirm against implementations.
*
* @param os the output stream to write to
* @param charset the character encoding to use
* @return a new ContentHandler instance that writes to the stream
*/
ContentHandler createHandler(OutputStream os, Charset charset);
}
Loading
Loading