Notes on a Tika parser S3 crawler service

Below is the full Java code for a service that crawls an S3 bucket with pagination, processes each file using Apache Tika via a service method that accepts a byte stream, and then merges the S3 object details with the parsed Tika metadata and content.
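
The sample assumes Tika 2.x and the AWS SDK for Java v2 on the classpath; the relevant artifacts (versions left to your build) are org.apache.tika:tika-core, org.apache.tika:tika-parsers-standard-package, and software.amazon.awssdk:s3.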

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import software.amazon.awssdk.services.s3.S3Client;
import software.amazon.awssdk.services.s3.model.GetObjectRequest;
import software.amazon.awssdk.services.s3.model.ListObjectsV2Request;
import software.amazon.awssdk.services.s3.model.ListObjectsV2Response;
import software.amazon.awssdk.services.s3.model.S3Object;

import java.io.InputStream;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.stream.Collectors;

public class S3TikaCrawler {

    private final S3Client s3Client;
    // AutoDetectParser detects the content type and delegates to the matching
    // parser; unlike the Tika facade, it exposes a parse method that accepts a
    // ContentHandler, which is needed for BodyContentHandler below
    private final AutoDetectParser parser;

    public S3TikaCrawler() {
        this.s3Client = S3Client.create();
        this.parser = new AutoDetectParser();
    }

    /**
     * Crawls an S3 bucket using pagination, processes each file via Tika, and prints the merged result.
     *
     * @param bucketName the S3 bucket name
     */
    public void crawlBucket(String bucketName) {
        String continuationToken = null;
        do {
            var listReq = ListObjectsV2Request.builder()
                    .bucket(bucketName)
                    .continuationToken(continuationToken)
                    .build();

            ListObjectsV2Response listRes = s3Client.listObjectsV2(listReq);

            for (S3Object s3Object : listRes.contents()) {
                String key = s3Object.key();
                System.out.println("Processing S3 object: " + key);

                var getReq = GetObjectRequest.builder()
                        .bucket(bucketName)
                        .key(key)
                        .build();

                try (InputStream s3Stream = s3Client.getObject(getReq)) {
                    ProcessedDocument doc = processStream(s3Stream);
                    Map<String, Object> merged = mergeData(s3Object, doc);
                    System.out.println("Merged Data for " + key + ": " + merged);
                } catch (Exception e) {
                    System.err.println("Error processing " + key);
                    e.printStackTrace();
                }
            }

            continuationToken = listRes.nextContinuationToken();
        } while (continuationToken != null);
    }

    /**
     * Parses the input stream using Tika to extract metadata and body content.
     *
     * @param stream InputStream from S3
     * @return a ProcessedDocument containing Tika metadata and body text
     * @throws Exception if parsing fails
     */
    public ProcessedDocument processStream(InputStream stream) throws Exception {
        Metadata metadata = new Metadata();
        // -1 disables BodyContentHandler's default 100,000-character write limit
        BodyContentHandler handler = new BodyContentHandler(-1);
        // Parsing fills the handler with the body text and the Metadata
        // object with document properties as side effects
        parser.parse(stream, handler, metadata, new ParseContext());
        return new ProcessedDocument(metadata, handler.toString());
    }

    /**
     * Merges S3 object details with Tika-parsed metadata and content.
     *
     * @param s3Object the S3 object
     * @param doc      the document processed by Tika
     * @return a map containing both S3 details and Tika results
     */
    public Map<String, Object> mergeData(S3Object s3Object, ProcessedDocument doc) {
        Map<String, Object> merged = new HashMap<>();
        // Add S3 object details
        merged.put("s3Key", s3Object.key());
        merged.put("s3Size", s3Object.size());
        // Convert Tika metadata to a map; Metadata.get returns only the first
        // value of a multi-valued field (see the variant after the notes below)
        Map<String, String> tikaMetadata = Arrays.stream(doc.metadata().names())
                .collect(Collectors.toMap(name -> name, doc.metadata()::get));
        merged.put("tikaMetadata", tikaMetadata);
        merged.put("tikaBody", doc.body());
        return merged;
    }

    /**
     * Immutable holder for the Tika metadata and extracted body text.
     */
    public record ProcessedDocument(Metadata metadata, String body) {
    }

    public static void main(String[] args) {
        var crawler = new S3TikaCrawler();
        // Replace with your actual S3 bucket name
        crawler.crawlBucket("your-bucket-name");
    }
}
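
To try the Tika parsing path on its own, processStream can be fed a local file instead of an S3 stream. The class and file path below are hypothetical placeholders; note that the S3TikaCrawler constructor still calls S3Client.create(), so the default AWS region/credentials chain has to resolve even for a local run.

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;

public class LocalParseCheck {
    public static void main(String[] args) throws Exception {
        var crawler = new S3TikaCrawler();
        // sample.pdf is a placeholder; point this at any local document
        try (InputStream in = Files.newInputStream(Path.of("sample.pdf"))) {
            var doc = crawler.processStream(in);
            System.out.println("Metadata: " + doc.metadata());
            String body = doc.body();
            System.out.println("Body preview: "
                    + body.substring(0, Math.min(200, body.length())));
        }
    }
}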

How It Works

  • Pagination:
    The crawlBucket method uses a do-while loop with a continuation token to page through the S3 bucket, so each page is processed as it arrives and the full listing of 1.2 million documents is never held in memory. The AWS SDK also ships a built-in paginator; a variant using it is sketched after this list.

  • Processing Each File:
    For each S3 object, the service retrieves the object's stream with the AWS SDK and passes it to processStream, where Tika's AutoDetectParser identifies the document type and extracts both the metadata and the full text content.

  • Merging Data:
    The mergeData method combines the S3 object details (key and size) with the Tika metadata (flattened to a map) and the extracted text, then prints the merged result. Note that Metadata.get keeps only the first value of a multi-valued field; a variant that preserves every value is sketched after this list.

  • Modern Java Features:
    The code uses var for local variable type inference (Java 10+), a record for ProcessedDocument (Java 16+), and the Stream API (Java 8+) for the metadata conversion, so any recent JDK, including 21, compiles it.
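
As mentioned in the pagination note, the AWS SDK for Java v2 also ships a built-in paginator that hides the continuation-token bookkeeping. Below is a minimal sketch of the listing loop rewritten around it, meant to drop into S3TikaCrawler; the processObject helper is hypothetical, standing in for the per-object GetObject + processStream + mergeData logic above.

import software.amazon.awssdk.services.s3.model.ListObjectsV2Request;
import software.amazon.awssdk.services.s3.model.S3Object;
import software.amazon.awssdk.services.s3.paginators.ListObjectsV2Iterable;

public void crawlBucketWithPaginator(String bucketName) {
    var listReq = ListObjectsV2Request.builder()
            .bucket(bucketName)
            .build();
    // The iterable fetches pages lazily as it is consumed, so memory use
    // stays bounded even for millions of keys
    ListObjectsV2Iterable pages = s3Client.listObjectsV2Paginator(listReq);
    for (S3Object s3Object : pages.contents()) {
        processObject(bucketName, s3Object); // hypothetical helper
    }
}

And as mentioned in the merging note, Metadata.get(name) returns only the first value of a multi-valued field (dc:creator, for instance, can repeat). A sketch of a conversion that keeps every value:

import org.apache.tika.metadata.Metadata;

import java.util.Arrays;
import java.util.Map;
import java.util.stream.Collectors;

static Map<String, Object> metadataToMap(Metadata metadata) {
    return Arrays.stream(metadata.names())
            .collect(Collectors.toMap(
                    name -> name,
                    // Lists for multi-valued fields, plain strings otherwise
                    name -> metadata.isMultiValued(name)
                            ? Arrays.asList(metadata.getValues(name))
                            : metadata.get(name)));
}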

This full code sample should serve as a good starting point for a service that integrates an S3 crawl with Apache Tika processing while handling large buckets efficiently.
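
One practical refinement for very large buckets: filter objects before downloading them, using the key and size that ListObjectsV2 already returned. A minimal sketch, where the 50 MB cap and the extension list are arbitrary placeholder choices:

import software.amazon.awssdk.services.s3.model.S3Object;

// Cheap pre-filter using fields the listing response already contains,
// so unparseable or oversized objects are never downloaded
private static final long MAX_SIZE_BYTES = 50L * 1024 * 1024; // placeholder cap

private boolean shouldProcess(S3Object s3Object) {
    String key = s3Object.key().toLowerCase();
    boolean looksParseable = key.endsWith(".pdf") || key.endsWith(".docx")
            || key.endsWith(".html") || key.endsWith(".txt");
    return looksParseable && s3Object.size() <= MAX_SIZE_BYTES;
}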