# notes on tika parsing

Below is a complete example. It defines a configuration record, a parsed-document record (with the mapped fields plus a catch-all metadata map), and a service class that uses Tika's auto-detection together with configurable OCR support and text splitting. (If the first parse attempt fails, the stream is reset and parsed a second time.) You can adjust or extend the configuration options as needed. The code uses JDK 21 records and modern constructs, and it is intended for use within a Micronaut service (without any web controllers). Note that Java allows only one public top-level type per file, so the two records and the service class shown below each belong in their own source file under `com.example.parser`.

Note: Ensure you add the appropriate Tika dependencies (including the Tesseract OCR module if you want OCR) to your Gradle KTS build file.
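
One possible dependency block, assuming Tika 2.x coordinates (the version shown is illustrative; the OCR parser ships inside the standard parser bundle, but the Tesseract binary itself must be installed on the host for OCR to work):

```kotlin
dependencies {
    // Tika's detection/parsing core plus the standard parser bundle,
    // which includes the Tesseract OCR module.
    implementation("org.apache.tika:tika-core:2.9.2")
    implementation("org.apache.tika:tika-parsers-standard-package:2.9.2")
}
```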


```java
package com.example.parser;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.sax.BodyContentHandler;

import java.io.BufferedInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Configuration for document parsing.
 *
 * @param maxCharSize the maximum number of characters allowed in one document chunk;
 *                    if the extracted content is larger, it will be split into multiple ParsedDocument objects.
 * @param overlap     the number of characters to overlap between consecutive chunks.
 * @param enableOCR   whether to enable OCR (useful for PDF/image-based documents).
 */
public record DocumentParserConfig(int maxCharSize, int overlap, boolean enableOCR) { }

/**
 * Represents a parsed document with key mapped fields and a catch-all metadata map.
 *
 * Fields:
 * - title
 * - body (a chunk of content; multiple documents may be returned if the content is split)
 * - description
 * - keywords
 * - pageNumbers
 * - lastUpdated
 * - dateCreated
 * - documentType
 * - fileName
 * - metadata (all other metadata key/values)
 */
public record ParsedDocument(
    String title,
    String body,
    String description,
    String keywords,
    String pageNumbers,
    String lastUpdated,
    String dateCreated,
    String documentType,
    String fileName,
    Map<String, Object> metadata
) { }

/**
 * Service that uses Apache Tika to parse various document types.
 *
 * <p>This service supports a wide range of file types (Word, PowerPoint, Open Office,
 * text, PDF, PDF with OCR, Excel, CSV, JSON, HTML, XML, Markdown, JavaScript, etc.)
 * by relying on Tika’s AutoDetectParser. Additional configuration (such as OCR, maximum
 * character size for splitting, and overlap) is supported.
 *
 * <p>If the initial parse attempt fails, the service resets the stream and retries the parse once.
 */
public class DocumentParserService {

    /**
     * Parses the given InputStream and returns one or more ParsedDocument objects.
     *
     * @param input    the input stream (e.g. from a file upload) containing the document bytes
     * @param fileName the name of the file (used to help determine file type and map to fields)
     * @param config   the configuration options (max char size, overlap, OCR enable)
     * @return a list of ParsedDocument objects (one per chunk, if splitting occurs)
     * @throws Exception if parsing fails (and no fallback is possible)
     */
    public List<ParsedDocument> parse(InputStream input, String fileName, DocumentParserConfig config) throws Exception {
        // Wrap the input stream in a BufferedInputStream to support mark/reset.
        try (InputStream bufferedInput = new BufferedInputStream(input)) {
            // Mark the stream so the retry below can reset it; the read limit must
            // be large enough to buffer the whole document.
            bufferedInput.mark(Integer.MAX_VALUE);

            // Set the resource name (file name) into Tika metadata.
            // TikaCoreProperties.RESOURCE_NAME_KEY replaces Metadata.RESOURCE_NAME_KEY,
            // which was removed in Tika 2.x.
            Metadata metadata = new Metadata();
            metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, fileName);

            // Use the auto-detect parser. Tika will detect and delegate to
            // the correct parser for Word, PowerPoint, Open Office, PDF, etc.
            Parser parser = new AutoDetectParser();

            // Prepare the parse context.
            ParseContext context = new ParseContext();
            context.set(Parser.class, parser);
            if (config.enableOCR()) {
                // If OCR is enabled, configure the Tesseract OCR parser.
                TesseractOCRConfig tesseractConfig = new TesseractOCRConfig();
                // Optionally, customize Tesseract settings here.
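                // e.g. tesseractConfig.setLanguage("eng");  // requires the matching Tesseract language pack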
                context.set(TesseractOCRConfig.class, tesseractConfig);
            }

            // Use a BodyContentHandler with unlimited size (-1) to capture the full text.
            BodyContentHandler handler = new BodyContentHandler(-1);

            // Attempt parsing; if it fails, reset the stream (marked above) and
            // retry once with a fresh handler, since the first attempt may have
            // written partial content.
            try {
                parser.parse(bufferedInput, handler, metadata, context);
            } catch (Exception e) {
                if (bufferedInput.markSupported()) {
                    bufferedInput.reset();
                    handler = new BodyContentHandler(-1);
                    parser.parse(bufferedInput, handler, metadata, context);
                } else {
                    throw new RuntimeException("Parsing failed and input stream is not resettable", e);
                }
            }

            // Extract the full text.
            String fullText = handler.toString();

            // Split the text into chunks if it exceeds the configured maxCharSize.
            List<String> textChunks = splitText(fullText, config.maxCharSize(), config.overlap());

            // Map Tika metadata to our target fields. The TikaCoreProperties
            // constants resolve to the Dublin Core names (dc:title,
            // dcterms:modified, ...); the raw string keys are kept as
            // fallbacks for older parsers.
            String title = firstNonEmpty(metadata.get(TikaCoreProperties.TITLE), metadata.get("title"));
            String description = firstNonEmpty(metadata.get(TikaCoreProperties.DESCRIPTION), metadata.get("description"));
            String keywords = firstNonEmpty(metadata.get(TikaCoreProperties.SUBJECT), metadata.get("keywords"));
            String pageNumbers = metadata.get("xmpTPg:NPages");
            String lastUpdated = firstNonEmpty(metadata.get(TikaCoreProperties.MODIFIED), metadata.get("Last-Modified"));
            String dateCreated = firstNonEmpty(metadata.get(TikaCoreProperties.CREATED), metadata.get("Creation-Date"));
            String documentType = metadata.get(Metadata.CONTENT_TYPE);

            // Build a map with all metadata entries.
            Map<String, Object> metadataMap = new HashMap<>();
            for (String name : metadata.names()) {
                metadataMap.put(name, metadata.get(name));
            }

            // Create one ParsedDocument per text chunk.
            List<ParsedDocument> parsedDocuments = new ArrayList<>();
            for (String chunk : textChunks) {
                ParsedDocument doc = new ParsedDocument(
                    title,
                    chunk,
                    description,
                    keywords,
                    pageNumbers,
                    lastUpdated,
                    dateCreated,
                    documentType,
                    fileName,
                    metadataMap
                );
                parsedDocuments.add(doc);
            }
            return parsedDocuments;
        }
    }

    /**
     * Splits the provided text into chunks that do not exceed maxCharSize.
     * Each consecutive chunk overlaps the previous one by the specified number of characters.
     *
     * @param text        the text to split
     * @param maxCharSize the maximum size of each chunk; if non-positive, no splitting occurs
     * @param overlap     the number of characters to overlap between chunks
     * @return a list of text chunks
     */
    private List<String> splitText(String text, int maxCharSize, int overlap) {
        List<String> chunks = new ArrayList<>();
        if (text == null || text.isEmpty() || maxCharSize <= 0 || text.length() <= maxCharSize) {
            chunks.add(text == null ? "" : text);
            return chunks;
        }
        int length = text.length();
        int start = 0;
        while (start < length) {
            int end = Math.min(start + maxCharSize, length);
            chunks.add(text.substring(start, end));
            if (end == length) {
                break; // final chunk emitted; stepping back would loop forever
            }
            // Step back by the overlap, but always advance by at least one
            // character so that overlap >= maxCharSize cannot stall the loop.
            start = Math.max(end - overlap, start + 1);
        }
        return chunks;
    }
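
    // Worked example for splitText: with maxCharSize = 5 and overlap = 2, the
    // 10-character input "abcdefghij" yields the chunks "abcde", "defgh" and
    // "ghij": each chunk after the first repeats the last two characters of
    // its predecessor.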

    /**
     * Returns the first non-null, non-blank value from the provided arguments.
     *
     * @param values one or more String values to check
     * @return the first non-blank string, or an empty string if none found
     */
    private String firstNonEmpty(String... values) {
        for (String value : values) {
            if (value != null && !value.isBlank()) {
                return value;
            }
        }
        return "";
    }
}
```

## Usage Example

You might use this service in your Micronaut application as follows:

```java
// Example usage in some Micronaut bean or service:
DocumentParserService parserService = new DocumentParserService();
DocumentParserConfig config = new DocumentParserConfig(
    10000,   // maxCharSize: each chunk is up to 10,000 characters
    500,     // overlap: 500 characters overlap between chunks
    true     // enable OCR if needed (for image-based PDFs, etc.)
);

// 'inputStream' is your document stream (e.g. from a file) and 'fileName' is its name.
List<ParsedDocument> docs = parserService.parse(inputStream, "example.pdf", config);

// You now have one or more ParsedDocument instances with your mapped fields and extra metadata.
```

This implementation leverages Tika's auto-detection to support a wide range of file types (Word, PowerPoint, Open Office, text, PDF including OCR'd PDFs, Excel, CSV, JSON, HTML, XML, Markdown, JavaScript, etc.) and provides a configurable parsing service with a single retry when the first parse attempt fails.
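
If you want Micronaut to manage the service rather than instantiating it by hand, a small factory is enough. This is a minimal sketch; the ParserFactory class and its default values are illustrative assumptions, not part of the code above:

```java
package com.example.parser;

import io.micronaut.context.annotation.Factory;
import jakarta.inject.Singleton;

/**
 * Hypothetical factory exposing the parser and a default configuration as
 * injectable beans. The chunk sizes are placeholders, not recommendations.
 */
@Factory
public class ParserFactory {

    @Singleton
    DocumentParserConfig documentParserConfig() {
        return new DocumentParserConfig(10_000, 500, false); // OCR off by default
    }

    @Singleton
    DocumentParserService documentParserService() {
        return new DocumentParserService();
    }
}
```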
