# Notes on SPO crawler
Below is an example of a Java service that uses the Microsoft Graph SDK to crawl a SharePoint Online site with pagination (so it doesn't load millions of records into memory), retrieves each file's content and ACL information, passes the file stream to Apache Tika for parsing, and then merges the SharePoint metadata, Tika results, and ACLs into a single object.
Note: You’ll need to supply your own authentication provider (here represented by a dummy implementation) and configure your site/drive IDs appropriately.
```java
import com.microsoft.graph.authentication.IAuthenticationProvider;
import com.microsoft.graph.models.DriveItem;
import com.microsoft.graph.models.Permission;
import com.microsoft.graph.requests.DriveItemCollectionPage;
import com.microsoft.graph.requests.GraphServiceClient;
import com.microsoft.graph.requests.PermissionCollectionPage;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;

import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CompletableFuture;
import java.util.stream.Collectors;

public class SharePointTikaCrawler {

    private final GraphServiceClient<?> graphClient;
    private final AutoDetectParser parser;

    public SharePointTikaCrawler(IAuthenticationProvider authProvider) {
        // Build the Microsoft Graph client using your authentication provider.
        this.graphClient = GraphServiceClient.builder()
                .authenticationProvider(authProvider)
                .buildClient();
        this.parser = new AutoDetectParser();
    }
    /**
     * Crawls the SharePoint tree from the specified starting folder.
     *
     * @param siteId   The SharePoint site ID.
     * @param driveId  The drive ID for the site.
     * @param folderId The folder ID to start crawling (use "root" for the root folder).
     */
    public void crawlSharePointTree(String siteId, String driveId, String folderId) {
        processFolder(siteId, driveId, folderId);
    }

    /**
     * Recursively processes a folder's children using pagination.
     */
    private void processFolder(String siteId, String driveId, String folderId) {
        // Retrieve the first page of children.
        DriveItemCollectionPage childrenPage = graphClient.sites(siteId)
                .drives(driveId)
                .items(folderId)
                .children()
                .buildRequest()
                .get();
        // Loop through pages.
        while (childrenPage != null) {
            for (DriveItem item : childrenPage.getCurrentPage()) {
                if (item.folder != null) {
                    // Recursively process folders.
                    processFolder(siteId, driveId, item.id);
                } else if (item.file != null) {
                    // Process file items.
                    processFileItem(siteId, driveId, item);
                }
            }
            childrenPage = (childrenPage.getNextPage() != null)
                    ? childrenPage.getNextPage().buildRequest().get()
                    : null;
        }
    }

    /**
     * Processes a single file: gets its content stream, parses it with Tika,
     * retrieves its ACLs, and merges the data.
     */
    private void processFileItem(String siteId, String driveId, DriveItem item) {
        System.out.println("Processing file: " + item.name);
        try {
            // Retrieve the file content stream.
            InputStream contentStream = graphClient.sites(siteId)
                    .drives(driveId)
                    .items(item.id)
                    .content()
                    .buildRequest()
                    .get();
            // Parse the file stream with Tika.
            ProcessedDocument tikaDoc = processStream(contentStream);
            // Retrieve ACLs (permissions) for the file.
            List<Permission> permissions = getItemPermissions(siteId, driveId, item.id);
            // Merge SharePoint metadata, Tika output, and ACLs.
            MergedDocument merged = mergeData(item, tikaDoc, permissions);
            System.out.println("Merged Document for " + item.name + ": " + merged);
        } catch (Exception e) {
            System.err.println("Error processing file " + item.name);
            e.printStackTrace();
        }
    }
    /**
     * Parses an InputStream using Tika to extract metadata and body text.
     */
    private ProcessedDocument processStream(InputStream stream) throws Exception {
        Metadata metadata = new Metadata();
        // -1 removes the default write limit so large documents are fully extracted.
        BodyContentHandler handler = new BodyContentHandler(-1);
        // Close the stream once parsing completes.
        try (InputStream in = stream) {
            parser.parse(in, handler, metadata, new ParseContext());
        }
        return new ProcessedDocument(metadata, handler.toString());
    }
    /**
     * Retrieves ACLs (permissions) for a given file using pagination.
     */
    private List<Permission> getItemPermissions(String siteId, String driveId, String itemId) {
        List<Permission> allPermissions = new ArrayList<>();
        PermissionCollectionPage permissionsPage = graphClient.sites(siteId)
                .drives(driveId)
                .items(itemId)
                .permissions()
                .buildRequest()
                .get();
        while (permissionsPage != null) {
            allPermissions.addAll(permissionsPage.getCurrentPage());
            permissionsPage = (permissionsPage.getNextPage() != null)
                    ? permissionsPage.getNextPage().buildRequest().get()
                    : null;
        }
        return allPermissions;
    }

    /**
     * Merges the SharePoint file details, Tika-parsed metadata and content,
     * and ACL information into a single merged document.
     */
    private MergedDocument mergeData(DriveItem item, ProcessedDocument tikaDoc, List<Permission> permissions) {
        Map<String, Object> sharePointData = new HashMap<>();
        sharePointData.put("id", item.id);
        sharePointData.put("name", item.name);
        sharePointData.put("size", item.size);
        // Add additional SharePoint fields as needed.

        // Convert Tika metadata into a map.
        Map<String, String> tikaMetadata = Arrays.stream(tikaDoc.getMetadata().names())
                .collect(Collectors.toMap(name -> name, tikaDoc.getMetadata()::get));
        return new MergedDocument(sharePointData, tikaMetadata, tikaDoc.getBody(), permissions);
    }

    /**
     * A simple class to hold the results from Tika parsing.
     */
    public static class ProcessedDocument {
        private final Metadata metadata;
        private final String body;

        public ProcessedDocument(Metadata metadata, String body) {
            this.metadata = metadata;
            this.body = body;
        }

        public Metadata getMetadata() {
            return metadata;
        }

        public String getBody() {
            return body;
        }
    }

    /**
     * The merged document containing SharePoint data, Tika output, and ACLs.
     */
    public static class MergedDocument {
        private final Map<String, Object> sharePointData;
        private final Map<String, String> tikaMetadata;
        private final String tikaBody;
        private final List<Permission> permissions;

        public MergedDocument(Map<String, Object> sharePointData,
                              Map<String, String> tikaMetadata,
                              String tikaBody,
                              List<Permission> permissions) {
            this.sharePointData = sharePointData;
            this.tikaMetadata = tikaMetadata;
            this.tikaBody = tikaBody;
            this.permissions = permissions;
        }

        @Override
        public String toString() {
            return "MergedDocument{" +
                    "sharePointData=" + sharePointData +
                    ", tikaMetadata=" + tikaMetadata +
                    ", tikaBody='" + (tikaBody != null ? tikaBody.substring(0, Math.min(tikaBody.length(), 100)) + "..." : "null") + '\'' +
                    ", permissions=" + permissions +
                    '}';
        }
    }

    /**
     * Example entry point.
     */
    public static void main(String[] args) {
        // Replace with your actual authentication provider implementation.
        IAuthenticationProvider authProvider = new MyAuthenticationProvider();
        SharePointTikaCrawler crawler = new SharePointTikaCrawler(authProvider);
        // Set your SharePoint site, drive, and starting folder (e.g., "root").
        String siteId = "your-site-id";
        String driveId = "your-drive-id";
        String folderId = "root";
        crawler.crawlSharePointTree(siteId, driveId, folderId);
    }
    /**
     * Dummy authentication provider for illustration. Replace this with real
     * token acquisition (see the sketch after this code block).
     */
    public static class MyAuthenticationProvider implements IAuthenticationProvider {
        @Override
        public CompletableFuture<String> getAuthorizationTokenAsync(URL requestUrl) {
            // Return the bearer token to attach to the Graph request.
            return CompletableFuture.completedFuture("YOUR_ACCESS_TOKEN");
        }
    }
}
```
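In practice the token in `MyAuthenticationProvider` would come from Azure AD rather than a hard-coded string. A minimal sketch of one common route, assuming the `azure-identity` dependency and an app registration with application permissions (the `GraphAuth` helper name is ours; `TokenCredentialAuthProvider` ships with the Graph SDK):

```java
import com.azure.identity.ClientSecretCredential;
import com.azure.identity.ClientSecretCredentialBuilder;
import com.microsoft.graph.authentication.TokenCredentialAuthProvider;

import java.util.List;

public final class GraphAuth {

    /** Builds a Graph auth provider via the OAuth2 client-credentials flow. */
    public static TokenCredentialAuthProvider clientCredentials(
            String tenantId, String clientId, String clientSecret) {
        ClientSecretCredential credential = new ClientSecretCredentialBuilder()
                .tenantId(tenantId)
                .clientId(clientId)
                .clientSecret(clientSecret)
                .build();
        // ".default" requests the application permissions granted to the app registration.
        return new TokenCredentialAuthProvider(
                List.of("https://graph.microsoft.com/.default"), credential);
    }
}
```

The resulting provider implements `IAuthenticationProvider` and can be passed straight to `new SharePointTikaCrawler(...)` in place of the dummy implementation.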
- **Graph SDK & Pagination:** The service uses the Microsoft Graph SDK to request items from a SharePoint drive, paging through both folder children and permissions so the entire tree (potentially millions of records) is never held in memory at once.
- **Recursive Folder Traversal:** The `processFolder` method recurses through folder items; files are handed to `processFileItem`.
- **Tika Integration:** Each file's content is streamed to Tika (using a `BodyContentHandler` with no size limit), extracting metadata and full text.
- **ACL Retrieval:** Permissions (ACLs) are retrieved per file via the Graph API and merged with the document's data; a flattening sketch follows this list.
- **Merged Output:** The final merged object includes SharePoint metadata, Tika-parsed metadata and text, and the ACLs (which may include group ACLs).
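For search indexing, the raw `Permission` objects usually need to be flattened into simple principal strings. A minimal sketch against the Graph SDK v5 models (`roles` and `grantedToV2` are standard model fields; link-based permissions and `grantedToIdentitiesV2` are deliberately left out here):

```java
import com.microsoft.graph.models.Permission;
import com.microsoft.graph.models.SharePointIdentitySet;

import java.util.ArrayList;
import java.util.List;

public final class AclFlattener {

    /** Reduces Graph Permission objects to "roles:principalType:id" strings for indexing. */
    public static List<String> flatten(List<Permission> permissions) {
        List<String> acls = new ArrayList<>();
        for (Permission p : permissions) {
            String roles = (p.roles != null) ? String.join("|", p.roles) : "unknown";
            SharePointIdentitySet who = p.grantedToV2;
            if (who == null) {
                continue; // e.g., sharing-link permissions carry identities elsewhere
            }
            if (who.user != null) {
                acls.add(roles + ":user:" + who.user.id);
            }
            if (who.group != null) {
                acls.add(roles + ":group:" + who.group.id);
            }
        }
        return acls;
    }
}
```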
This complete code sample provides a starting point for building a scalable SharePoint Online crawler integrated with Tika parsing and ACL extraction.
Below is an example of how you might leverage the Microsoft Graph fetcher that ships with Tika 3.1's Pipes framework to offload much of the heavy lifting of crawling SharePoint Online. A fetcher retrieves an individual drive item's content and metadata by fetch key (and, depending on configuration, permission information); enumeration of the tree and its paging is handled by the Pipes machinery or by your own listing code, so millions of records are never held in memory at once. You'll need to supply a tika-config.xml that declares and configures the Microsoft Graph fetcher with your authentication details.
The following sample shows a minimal integration in which the fetcher retrieves a single document from a SharePoint drive; the fetched content stream and metadata are then passed through Tika's parser and merged into a custom object:
```java
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.fetcher.Fetcher;
import org.apache.tika.pipes.fetcher.FetcherManager;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;

import java.io.InputStream;
import java.nio.file.Paths;
public class TikaMSGraphCrawler {

    public static void main(String[] args) throws Exception {
        // Load the fetchers declared in tika-config.xml. The config must define
        // the Microsoft Graph fetcher under the name used here ("msgraph").
        FetcherManager fetcherManager = FetcherManager.load(Paths.get("tika-config.xml"));
        Fetcher fetcher = fetcherManager.getFetcher("msgraph");

        // A fetch key identifies a single drive item; its exact format is defined
        // by the fetcher implementation. The value below is a placeholder.
        String fetchKey = "your-drive-id,your-drive-item-id";

        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler(-1); // no write limit
        AutoDetectParser parser = new AutoDetectParser();

        // Fetch the content stream and extract text and metadata with Tika.
        try (InputStream stream = fetcher.fetch(fetchKey, metadata, new ParseContext())) {
            parser.parse(stream, handler, metadata, new ParseContext());
        }

        // If the fetcher is configured to surface ACLs, they arrive as entries in
        // the Metadata object; the key used here is hypothetical.
        Object aclData = metadata.get("acl");

        // Merge SharePoint data (from metadata), extracted content, and ACLs into one object.
        MergedDocument merged = mergeData(metadata, handler.toString(), aclData);
        System.out.println("Merged Document: " + merged);
    }
    /**
     * Merges the Tika-parsed metadata, text content, and ACL information
     * into a single domain object.
     */
    public static MergedDocument mergeData(Metadata metadata, String content, Object aclData) {
        // In a real application you would extract and map the individual fields as needed.
        return new MergedDocument(metadata, content, aclData);
    }

    /**
     * A simple POJO to hold merged document data.
     */
    public static class MergedDocument {
        private final Metadata metadata;
        private final String content;
        private final Object aclData;

        public MergedDocument(Metadata metadata, String content, Object aclData) {
            this.metadata = metadata;
            this.content = content;
            this.aclData = aclData;
        }

        @Override
        public String toString() {
            return "MergedDocument{" +
                    "metadata=" + metadata +
                    ", content='" + (content != null
                        ? content.substring(0, Math.min(content.length(), 100)) + "..."
                        : "null") + '\'' +
                    ", aclData=" + aclData +
                    '}';
        }
    }
}
```
- **FetcherManager & tika-config.xml:** Fetchers are declared in a tika-config.xml file and loaded through `FetcherManager`; the Microsoft Graph fetcher is then retrieved by the name given in that config ("msgraph" in this example). Authentication details (tenant, client ID, secret or certificate) live in the configuration rather than in code.
- **Fetch keys:** Each call to `fetch()` retrieves one drive item identified by a fetch key. Enumerating the SharePoint tree (and paging through it) is the job of a pipes iterator or your own listing code, such as the Graph SDK walk in the first example.
- **Processing results:** The fetched stream is parsed with an `AutoDetectParser`, which populates the `Metadata` object alongside whatever SharePoint metadata the fetcher wrote into it. ACL information, if your fetcher configuration exposes it, also arrives as metadata entries.
- **Memory efficiency:** Because items are fetched and parsed one at a time (and Tika Pipes can run this work asynchronously), the entire SharePoint tree never has to be held in memory.
This code shows how upgrading to Tika 3.1 can simplify your implementation by offloading authentication and content fetching to the built-in Microsoft Graph fetcher. Adjust the configuration and fetch keys to suit your SharePoint Online environment and authentication requirements. One way to connect the two examples is sketched below.
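For completeness, here is one way the two examples could be wired together: the Graph SDK walk from the first example enumerates files, and each file is handed to the Tika fetcher by fetch key. The `GraphListTikaFetchGlue` class is ours, and the fetch-key format shown (`"<driveId>,<driveItemId>"`) is an assumption that must match whatever your configured fetcher expects:

```java
import com.microsoft.graph.models.DriveItem;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.fetcher.Fetcher;
import org.apache.tika.sax.BodyContentHandler;

import java.io.InputStream;

public class GraphListTikaFetchGlue {

    private final Fetcher fetcher; // the "msgraph" fetcher obtained from FetcherManager
    private final AutoDetectParser parser = new AutoDetectParser();

    public GraphListTikaFetchGlue(Fetcher fetcher) {
        this.fetcher = fetcher;
    }

    /**
     * Called for each file found by the Graph SDK walk (e.g., from
     * SharePointTikaCrawler.processFileItem). Returns the extracted text;
     * the populated Metadata could be merged as in the examples above.
     */
    public String fetchAndExtract(String driveId, DriveItem item) throws Exception {
        // Assumed fetch-key format: "<driveId>,<driveItemId>".
        String fetchKey = driveId + "," + item.id;
        Metadata metadata = new Metadata();
        BodyContentHandler handler = new BodyContentHandler(-1);
        try (InputStream stream = fetcher.fetch(fetchKey, metadata, new ParseContext())) {
            parser.parse(stream, handler, metadata, new ParseContext());
        }
        return handler.toString();
    }
}
```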