notes on tika parsing - krickert/search-api GitHub Wiki
Below is an example of a complete solution: a configuration record, a parsed-document record (with the mapped fields plus a catch-all metadata map), and a service class that uses Tika's auto-detection together with configurable OCR support and text splitting. If the first parse attempt fails, the service rewinds the stream and retries once. The code uses JDK 21 records and modern constructs and is intended for use inside a Micronaut service (without any web controllers). The three public types are shown as one listing for readability, but each belongs in its own file. Adjust or extend the configuration options as needed.
Note: Ensure you add the appropriate Tika dependencies (including the OCR parser module if you want OCR) to your Gradle KTS build file. Tika's Tesseract integration also requires the tesseract binary to be installed on the host, since Tika shells out to it.
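For example, assuming Tika 2.x, something like the following is a reasonable starting point. The exact versions are not pinned by these notes, so treat the coordinates below as illustrative and check Maven Central for current releases:

```kotlin
// build.gradle.kts — illustrative coordinates and versions, not pinned by these notes
dependencies {
    implementation("org.apache.tika:tika-core:2.9.2")
    // Bundles the standard parsers, including the Tesseract OCR module:
    implementation("org.apache.tika:tika-parsers-standard-package:2.9.2")
}
```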
```java
package com.example.parser;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.sax.BodyContentHandler;

import java.io.BufferedInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Configuration for document parsing.
 *
 * @param maxCharSize the maximum number of characters allowed in one document chunk;
 *                    if the extracted content is larger, it is split across multiple
 *                    ParsedDocument objects
 * @param overlap the number of characters to overlap between consecutive chunks
 * @param enableOCR whether to enable OCR (useful for PDF/image-based documents)
 */
public record DocumentParserConfig(int maxCharSize, int overlap, boolean enableOCR) { }
/**
 * Represents a parsed document with key mapped fields and a catch-all metadata map.
 *
 * Fields:
 * - title
 * - body (a chunk of content; multiple documents may be returned if the content is split)
 * - description
 * - keywords
 * - pageNumbers
 * - lastUpdated
 * - dateCreated
 * - documentType
 * - fileName
 * - metadata (all other metadata key/values)
 */
public record ParsedDocument(
        String title,
        String body,
        String description,
        String keywords,
        String pageNumbers,
        String lastUpdated,
        String dateCreated,
        String documentType,
        String fileName,
        Map<String, Object> metadata
) { }
/**
 * Service that uses Apache Tika to parse various document types.
 *
 * <p>This service supports a wide range of file types (Word, PowerPoint, Open Office,
 * text, PDF, PDF with OCR, Excel, CSV, JSON, HTML, XML, Markdown, JavaScript, etc.)
 * by relying on Tika's AutoDetectParser. Additional configuration (OCR, maximum
 * character size for splitting, and overlap) is supported.
 *
 * <p>If the initial parse attempt fails, the service rewinds the stream and retries once.
 */
public class DocumentParserService {
    /**
     * Parses the given InputStream and returns one or more ParsedDocument objects.
     *
     * @param input the input stream (e.g. from a file upload) containing the document bytes
     * @param fileName the name of the file (used to help detect the file type and fill fields)
     * @param config the configuration options (max chunk size, overlap, OCR enable)
     * @return a list of ParsedDocument objects (one per chunk, if splitting occurs)
     * @throws Exception if both parse attempts fail
     */
    public List<ParsedDocument> parse(InputStream input, String fileName, DocumentParserConfig config) throws Exception {
        // Wrap the input stream in a BufferedInputStream to support mark/reset.
        try (InputStream bufferedInput = new BufferedInputStream(input)) {
            // Mark the stream up front so it can be rewound for a retry if the
            // first parse attempt fails.
            bufferedInput.mark(Integer.MAX_VALUE);

            // Set the resource name (file name) into Tika metadata; it helps type detection.
            Metadata metadata = new Metadata();
            // Tika 2.x property; on Tika 1.x use Metadata.RESOURCE_NAME_KEY instead.
            metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, fileName);

            // Use the auto-detect parser. Tika will detect and delegate to
            // the correct parser for Word, PowerPoint, Open Office, PDF, etc.
            Parser parser = new AutoDetectParser();

            // Prepare the parse context.
            ParseContext context = new ParseContext();
            context.set(Parser.class, parser);
            if (config.enableOCR()) {
                // If OCR is enabled, configure the Tesseract OCR parser.
                // This requires the tesseract binary to be installed on the host.
                TesseractOCRConfig tesseractConfig = new TesseractOCRConfig();
                // Optionally customize Tesseract here, e.g. tesseractConfig.setLanguage("eng");
                context.set(TesseractOCRConfig.class, tesseractConfig);
            }

            // Use a BodyContentHandler with unlimited size (-1) to capture the full text.
            BodyContentHandler handler = new BodyContentHandler(-1);

            // Attempt parsing; if it fails, rewind the stream and retry once with a
            // fresh handler so no partial output from the failed attempt leaks in.
            try {
                parser.parse(bufferedInput, handler, metadata, context);
            } catch (Exception e) {
                bufferedInput.reset();
                handler = new BodyContentHandler(-1);
                parser.parse(bufferedInput, handler, metadata, context);
            }
            // Extract the full text.
            String fullText = handler.toString();

            // Split the text into chunks if it exceeds the configured maxCharSize.
            List<String> textChunks = splitText(fullText, config.maxCharSize(), config.overlap());

            // Map Tika metadata to our target fields. (Exact key names vary by parser
            // and Tika version, so the common legacy and Dublin Core keys are checked.)
            String title = firstNonEmpty(metadata.get("title"), metadata.get("dc:title"));
            String description = firstNonEmpty(metadata.get("description"), metadata.get("dc:description"));
            String keywords = metadata.get("keywords");
            String pageNumbers = metadata.get("xmpTPg:NPages");
            String lastUpdated = firstNonEmpty(metadata.get("Last-Modified"), metadata.get("dcterms:modified"));
            String dateCreated = firstNonEmpty(metadata.get("Creation-Date"), metadata.get("dcterms:created"));
            String documentType = metadata.get(Metadata.CONTENT_TYPE);

            // Build a map with all metadata entries.
            Map<String, Object> metadataMap = new HashMap<>();
            for (String name : metadata.names()) {
                metadataMap.put(name, metadata.get(name));
            }
            // Create one ParsedDocument per text chunk.
            List<ParsedDocument> parsedDocuments = new ArrayList<>();
            for (String chunk : textChunks) {
                ParsedDocument doc = new ParsedDocument(
                        title,
                        chunk,
                        description,
                        keywords,
                        pageNumbers,
                        lastUpdated,
                        dateCreated,
                        documentType,
                        fileName,
                        metadataMap
                );
                parsedDocuments.add(doc);
            }
            return parsedDocuments;
        }
    }
    /**
     * Splits the provided text into chunks of at most maxCharSize characters.
     * Each chunk after the first starts {@code overlap} characters before the end
     * of the previous chunk, so consecutive chunks share that much text.
     *
     * <p>Example: with {@code maxCharSize = 10} and {@code overlap = 3}, a
     * 24-character text is split into chunks covering offsets [0,10), [7,17)
     * and [14,24).
     *
     * @param text the text to split
     * @param maxCharSize the maximum size of each chunk; if non-positive, no splitting occurs
     * @param overlap the number of characters to overlap between consecutive chunks;
     *                values >= maxCharSize are clamped
     * @return a list of text chunks
     */
    private List<String> splitText(String text, int maxCharSize, int overlap) {
        List<String> chunks = new ArrayList<>();
        if (text == null || text.isEmpty() || maxCharSize <= 0 || text.length() <= maxCharSize) {
            chunks.add(text);
            return chunks;
        }
        // Clamp the overlap so every iteration advances; an overlap >= maxCharSize
        // would otherwise make the loop spin forever on the final chunk.
        int safeOverlap = Math.max(0, Math.min(overlap, maxCharSize - 1));
        int length = text.length();
        int start = 0;
        while (start < length) {
            int end = Math.min(start + maxCharSize, length);
            chunks.add(text.substring(start, end));
            if (end == length) {
                break; // the final chunk reaches the end of the text
            }
            // Step back by the overlap so the next chunk repeats the tail of this one.
            start = end - safeOverlap;
        }
        return chunks;
    }
    /**
     * Returns the first non-null, non-blank value from the provided arguments.
     *
     * @param values one or more String values to check
     * @return the first non-blank string, or an empty string if none is found
     */
    private String firstNonEmpty(String... values) {
        for (String value : values) {
            if (value != null && !value.isBlank()) {
                return value;
            }
        }
        return "";
    }
}
```
You might use this service in your Micronaut application as follows:
```java
// Example usage in some Micronaut bean or service:
DocumentParserService parserService = new DocumentParserService();

DocumentParserConfig config = new DocumentParserConfig(
        10000, // maxCharSize: each chunk is up to 10,000 characters
        500,   // overlap: 500 characters shared between consecutive chunks
        true   // enableOCR: enable OCR if needed (for image-based PDFs, etc.)
);

// Obtain an InputStream for your document (here: from a file on disk).
try (InputStream inputStream = Files.newInputStream(Path.of("example.pdf"))) {
    List<ParsedDocument> docs = parserService.parse(inputStream, "example.pdf", config);
    // You now have one or more ParsedDocument instances with the mapped fields
    // and extra metadata.
}
```
This implementation leverages Tika's auto-detection to support a wide range of file types (Word, PowerPoint, Open Office, text, PDF including OCR PDFs, Excel, CSV, JSON, HTML, XML, Markdown, JavaScript, etc.) and provides a configurable parsing service with a single rewind-and-retry fallback.
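Since the service holds no state, you can also let Micronaut manage it instead of constructing it by hand. A minimal sketch, assuming Micronaut 3+ (which uses jakarta.inject annotations); the IngestionService consumer here is a hypothetical example, not part of the code above:

```java
import jakarta.inject.Singleton;

@Singleton // Micronaut creates and shares a single instance of the service.
public class DocumentParserService {
    // ... same implementation as above ...
}

@Singleton
public class IngestionService { // hypothetical consumer; lives in its own file
    private final DocumentParserService parserService;

    // With a single constructor, Micronaut injects the bean automatically.
    public IngestionService(DocumentParserService parserService) {
        this.parserService = parserService;
    }
}
```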