TextClustering examples - GeorgeDittmar/TextClustering GitHub Wiki

Example usage of the NLPProcessor and StopWordsFilter classes when preprocessing text documents in the framework.

package com.examples;

import com.data.processors.INLPProcessor;
import com.data.processors.NLPProcessorFactory;
import com.data.processors.OpenNLPProcessor;
import com.data.processors.StopWordsFilter;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.List;

/**
 * Example showing how to tokenize a document with an {@link INLPProcessor}
 * and then remove stop words from the result with a {@link StopWordsFilter}.
 *
 * Created by george on 12/9/13.
 */
public class NLPProcessorExample {

    /**
     * Runs the example: builds a processor, tokenizes a short sample string,
     * filters stop words, and prints the surviving tokens one sentence per line.
     *
     * @param args command-line arguments (unused)
     * @throws FileNotFoundException declared for the stop-word file load
     * @throws RuntimeException if the factory does not return a processor
     * @throws IllegalStateException if the processor fails to initialize
     */
    public static void main(String[] args) throws FileNotFoundException {
        // Call an NLPProcessorFactory and give it the processor class you wish to create.
        INLPProcessor processor = NLPProcessorFactory.initNLPProcessor(OpenNLPProcessor.class);

        if (processor == null) {
            throw new RuntimeException("NLP processor was not initialized.");
        }

        // Initialize the NLP processor to set up its tokenization models.
        // Fail fast if this does not succeed: continuing would run the rest of
        // the example against a processor that never finished initializing.
        try {
            processor.init();
        } catch (IOException e) {
            throw new IllegalStateException("NLP processor failed to initialize.", e);
        }

        // Once the processor is initialized, give it a document to process.
        // In this example it is just a simple string that we will tokenize.
        List<String[]> processed = processor.processDocument("Bob said hi. Hi bob!");
        System.out.println("Number of sentences: " + processed.size());

        // Create a stop-word filter so that we may remove common terms from the tokenized input.
        StopWordsFilter filter = new StopWordsFilter();
        filter.loadStopWords(new File("./src/resources/stopwords.txt"));

        List<List<String>> filteredDoc = filter.filterStopWords(processed);

        // Reports the size of the first filtered sentence only; guard against an
        // empty result so the example does not crash with IndexOutOfBoundsException.
        int firstSentenceWords = filteredDoc.isEmpty() ? 0 : filteredDoc.get(0).size();
        System.out.println("Number of words left: " + firstSentenceWords);

        // Print each filtered sentence on its own line, tokens separated by spaces.
        for (List<String> sent : filteredDoc) {
            for (String word : sent) {
                System.out.print(word + " ");
            }
            System.out.println();
        }
    }
}
⚠️ **GitHub.com Fallback** ⚠️