Natural Language Processing (NLP) enables computers to understand, interpret, and manipulate human language. This guide explores implementing NLP solutions in Java using popular libraries and frameworks, with practical examples and best practices.
Key NLP capabilities covered: tokenization and part-of-speech tagging, named entity recognition, sentiment analysis, document classification, language detection, and performance patterns such as caching and batch processing.
Add the Stanford CoreNLP library and its English models to your pom.xml:

<dependency>
    <groupId>edu.stanford.nlp</groupId>
    <artifactId>stanford-corenlp</artifactId>
    <version>4.5.4</version>
</dependency>
<dependency>
    <groupId>edu.stanford.nlp</groupId>
    <artifactId>stanford-corenlp</artifactId>
    <version>4.5.4</version>
    <classifier>models</classifier>
</dependency>
// Basic pipeline setup
import java.util.Properties;

import edu.stanford.nlp.pipeline.CoreDocument;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public class NLPProcessor {

    private final StanfordCoreNLP pipeline;

    public NLPProcessor() {
        Properties props = new Properties();
        // The sentiment annotator depends on parse, which depends on the
        // annotators before it; the order matters here.
        props.setProperty("annotators",
            "tokenize,ssplit,pos,lemma,ner,parse,sentiment");
        pipeline = new StanfordCoreNLP(props);
    }

    public CoreDocument processText(String text) {
        CoreDocument document = new CoreDocument(text);
        pipeline.annotate(document);
        return document;
    }
}
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import edu.stanford.nlp.pipeline.CoreDocument;
import org.springframework.stereotype.Service;

@Service
public class TextAnalyzer {

    private final NLPProcessor processor;

    // Constructor injection; the final field must be initialized.
    public TextAnalyzer(NLPProcessor processor) {
        this.processor = processor;
    }

    public List<String> extractEntities(String text) {
        CoreDocument doc = processor.processText(text);
        return doc.entityMentions().stream()
            .map(mention -> mention.text() + " (" + mention.entityType() + ")")
            .collect(Collectors.toList());
    }

    public String analyzeSentiment(String text) {
        CoreDocument doc = processor.processText(text);
        return doc.sentences().stream()
            .map(sentence -> sentence.sentiment())
            .collect(Collectors.joining(", "));
    }

    public Map<String, String> getPOSTags(String text) {
        CoreDocument doc = processor.processText(text);
        Map<String, String> posTags = new HashMap<>();
        doc.tokens().forEach(token -> posTags.put(token.word(), token.tag()));
        return posTags;
    }
}
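A minimal usage sketch, wiring the two classes together directly rather than through Spring; the sample sentence and printed results are illustrative:

NLPProcessor processor = new NLPProcessor();
TextAnalyzer analyzer = new TextAnalyzer(processor);

String text = "Barack Obama visited Paris last spring.";
System.out.println(analyzer.extractEntities(text));   // e.g. [Barack Obama (PERSON), Paris (CITY), ...]
System.out.println(analyzer.analyzeSentiment(text));  // per-sentence labels such as "Neutral"
System.out.println(analyzer.getPOSTags(text));        // token -> Penn Treebank tag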
Add the OpenNLP tools dependency. Note that the pretrained model files referenced below (en-sent.bin, en-token.bin, and so on) are not bundled with the library; download them separately from the Apache OpenNLP site and place them on the classpath under /models:

<dependency>
    <groupId>org.apache.opennlp</groupId>
    <artifactId>opennlp-tools</artifactId>
    <version>2.1.0</version>
</dependency>
// Sentence detection, tokenization, and POS tagging
import java.io.IOException;
import java.io.InputStream;

import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;

public class OpenNLPProcessor {

    private final SentenceDetectorME sentenceDetector;
    private final TokenizerME tokenizer;
    private final POSTaggerME posTagger;

    public OpenNLPProcessor() throws IOException {
        // try-with-resources closes each model stream once it is loaded
        try (InputStream sentenceModelIn = getClass()
                .getResourceAsStream("/models/en-sent.bin")) {
            sentenceDetector = new SentenceDetectorME(new SentenceModel(sentenceModelIn));
        }
        try (InputStream tokenModelIn = getClass()
                .getResourceAsStream("/models/en-token.bin")) {
            tokenizer = new TokenizerME(new TokenizerModel(tokenModelIn));
        }
        try (InputStream posModelIn = getClass()
                .getResourceAsStream("/models/en-pos-maxent.bin")) {
            posTagger = new POSTaggerME(new POSModel(posModelIn));
        }
    }

    public String[] detectSentences(String text) {
        return sentenceDetector.sentDetect(text);
    }

    public String[] tokenize(String text) {
        return tokenizer.tokenize(text);
    }

    public String[] tagPOS(String[] tokens) {
        return posTagger.tag(tokens);
    }
}
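A quick usage sketch, assuming the model files are on the classpath as described above:

OpenNLPProcessor nlp = new OpenNLPProcessor();
String[] sentences = nlp.detectSentences("OpenNLP is handy. It is also fast.");
String[] tokens = nlp.tokenize(sentences[0]);
String[] tags = nlp.tagPOS(tokens);   // one Penn Treebank tag per token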
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.Span;

public class NamedEntityRecognizer {

    private final TokenNameFinderModel personModel;
    private final TokenNameFinderModel locationModel;
    private final TokenNameFinderModel organizationModel;
    private final TokenizerME tokenizer; // used below, so it must be declared and initialized

    public NamedEntityRecognizer() throws IOException {
        personModel = loadModel("/models/en-ner-person.bin");
        locationModel = loadModel("/models/en-ner-location.bin");
        organizationModel = loadModel("/models/en-ner-organization.bin");
        try (InputStream tokenModelIn = getClass()
                .getResourceAsStream("/models/en-token.bin")) {
            tokenizer = new TokenizerME(new TokenizerModel(tokenModelIn));
        }
    }

    private TokenNameFinderModel loadModel(String path) throws IOException {
        try (InputStream in = getClass().getResourceAsStream(path)) {
            return new TokenNameFinderModel(in);
        }
    }

    public List<NamedEntity> findEntities(String text) {
        String[] tokens = tokenizer.tokenize(text);
        List<NamedEntity> entities = new ArrayList<>();
        // Find persons
        NameFinderME personFinder = new NameFinderME(personModel);
        Span[] personSpans = personFinder.find(tokens);
        for (Span span : personSpans) {
            entities.add(new NamedEntity(
                span.getType(),
                String.join(" ", Arrays.copyOfRange(tokens, span.getStart(), span.getEnd()))));
        }
        // Find locations and organizations the same way, using
        // locationModel and organizationModel
        return entities;
    }
}
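NamedEntity is referenced above but never defined; a minimal value type like the following record (Java 16+) is assumed:

// Assumed result type; on older Java versions, use a small class with a
// constructor and getters instead.
public record NamedEntity(String type, String text) {}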
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;

import opennlp.tools.doccat.DoccatModel;
import opennlp.tools.doccat.DocumentCategorizerME;
import opennlp.tools.tokenize.WhitespaceTokenizer;

public class DocumentClassifier {

    private final DocumentCategorizerME categorizer;

    public DocumentClassifier(String modelPath) throws IOException {
        try (InputStream modelIn = new FileInputStream(modelPath)) {
            categorizer = new DocumentCategorizerME(new DoccatModel(modelIn));
        }
    }

    public String classify(String text) {
        // categorize() expects a token array, not a raw string
        double[] outcomes = categorizer.categorize(tokenize(text));
        return categorizer.getBestCategory(outcomes);
    }

    public Map<String, Double> getScores(String text) {
        double[] outcomes = categorizer.categorize(tokenize(text));
        Map<String, Double> scores = new HashMap<>();
        for (int i = 0; i < categorizer.getNumberOfCategories(); i++) {
            scores.put(categorizer.getCategory(i), outcomes[i]);
        }
        return scores;
    }

    private String[] tokenize(String text) {
        return WhitespaceTokenizer.INSTANCE.tokenize(text);
    }
}
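Usage sketch; the model path is illustrative and should point at a model trained as in the next listing:

DocumentClassifier classifier = new DocumentClassifier("document-classifier.bin");
System.out.println(classifier.classify("Java programming language"));   // e.g. "tech"
System.out.println(classifier.getScores("Tennis tournament results"));  // category -> probability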
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.Arrays;
import java.util.List;

import opennlp.tools.doccat.DoccatFactory;
import opennlp.tools.doccat.DoccatModel;
import opennlp.tools.doccat.DocumentCategorizerME;
import opennlp.tools.doccat.DocumentSample;
import opennlp.tools.tokenize.WhitespaceTokenizer;
import opennlp.tools.util.ObjectStreamUtils;
import opennlp.tools.util.TrainingParameters;

public class DocumentClassifierTrainer {

    public DoccatModel train(List<DocumentSample> samples) throws IOException {
        DoccatFactory factory = new DoccatFactory();
        TrainingParameters params = new TrainingParameters();
        params.put(TrainingParameters.ITERATIONS_PARAM, "100");
        params.put(TrainingParameters.CUTOFF_PARAM, "5");
        // Wrap the in-memory list as an ObjectStream; DocumentSampleStream
        // parses raw text lines, not DocumentSample objects.
        return DocumentCategorizerME.train(
            "en",
            ObjectStreamUtils.createObjectStream(samples.toArray(new DocumentSample[0])),
            params,
            factory);
    }

    public void saveModel(DoccatModel model, String modelPath) throws IOException {
        try (OutputStream modelOut = new FileOutputStream(modelPath)) {
            model.serialize(modelOut);
        }
    }

    // Example usage
    public static void main(String[] args) throws IOException {
        // Toy data for illustration only; a real model needs many more
        // samples, especially with a feature cutoff of 5.
        List<DocumentSample> samples = Arrays.asList(
            sample("tech", "Java programming language"),
            sample("tech", "Python data science"),
            sample("sports", "Football match highlights"),
            sample("sports", "Tennis tournament results"));
        DocumentClassifierTrainer trainer = new DocumentClassifierTrainer();
        DoccatModel model = trainer.train(samples);
        trainer.saveModel(model, "document-classifier.bin");
    }

    // DocumentSample expects pre-tokenized text
    private static DocumentSample sample(String category, String text) {
        return new DocumentSample(category, WhitespaceTokenizer.INSTANCE.tokenize(text));
    }
}
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.Set;

import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;

public class SentimentAnalyzer {

    private final Set<String> positiveWords;
    private final Set<String> negativeWords;
    private final TokenizerME tokenizer;

    public SentimentAnalyzer() throws IOException {
        // Load sentiment lexicons (plain word lists, one word per line)
        positiveWords = loadLexicon("/lexicons/positive-words.txt");
        negativeWords = loadLexicon("/lexicons/negative-words.txt");
        // Initialize tokenizer
        try (InputStream tokenModelIn = getClass()
                .getResourceAsStream("/models/en-token.bin")) {
            tokenizer = new TokenizerME(new TokenizerModel(tokenModelIn));
        }
    }

    public SentimentScore analyzeSentiment(String text) {
        String[] tokens = tokenizer.tokenize(text.toLowerCase());
        int positiveCount = 0;
        int negativeCount = 0;
        for (String token : tokens) {
            if (positiveWords.contains(token)) {
                positiveCount++;
            } else if (negativeWords.contains(token)) {
                negativeCount++;
            }
        }
        // Normalized score in (-1, 1); the +1 avoids division by zero
        double score = (double) (positiveCount - negativeCount)
            / (positiveCount + negativeCount + 1);
        return new SentimentScore(score, positiveCount, negativeCount);
    }

    private Set<String> loadLexicon(String path) throws IOException {
        Set<String> words = new HashSet<>();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(getClass().getResourceAsStream(path)))) {
            String line;
            while ((line = reader.readLine()) != null) {
                words.add(line.trim().toLowerCase());
            }
        }
        return words;
    }
}
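SentimentScore is likewise not defined in the listing; a simple record (Java 16+) is assumed:

// Assumed result type holding the normalized score and raw counts.
public record SentimentScore(double score, int positiveCount, int negativeCount) {}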
// ML-based sentiment analysis. Note: MaxentModel is an interface in
// OpenNLP and cannot be instantiated directly, so this version loads a
// document-categorizer model trained with sentiment categories instead.
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import opennlp.tools.doccat.DoccatModel;
import opennlp.tools.doccat.DocumentCategorizerME;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;

public class MLSentimentAnalyzer {

    private final DocumentCategorizerME categorizer;
    private final TokenizerME tokenizer;

    public MLSentimentAnalyzer(String modelPath) throws IOException {
        // Load model
        try (InputStream modelIn = new FileInputStream(modelPath)) {
            categorizer = new DocumentCategorizerME(new DoccatModel(modelIn));
        }
        // Initialize tokenizer
        try (InputStream tokenModelIn = getClass()
                .getResourceAsStream("/models/en-token.bin")) {
            tokenizer = new TokenizerME(new TokenizerModel(tokenModelIn));
        }
    }

    public String analyzeSentiment(String text) {
        String[] tokens = tokenizer.tokenize(text);
        // Extract features
        String[] features = extractFeatures(tokens);
        // Get category probabilities and return the most likely sentiment
        double[] probs = categorizer.categorize(features);
        return categorizer.getBestCategory(probs);
    }

    private String[] extractFeatures(String[] tokens) {
        // Feature extraction logic; this could include n-grams, POS tags,
        // etc. The raw tokens are used as-is here.
        return tokens;
    }
}
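Such a model can be produced with the DocumentClassifierTrainer shown earlier, using sentiment labels as categories; the sample texts and model filename here are illustrative:

List<DocumentSample> samples = Arrays.asList(
    new DocumentSample("positive", WhitespaceTokenizer.INSTANCE.tokenize("great product love it")),
    new DocumentSample("negative", WhitespaceTokenizer.INSTANCE.tokenize("terrible waste of money")));
DocumentClassifierTrainer trainer = new DocumentClassifierTrainer();
DoccatModel model = trainer.train(samples); // real training needs far more data
trainer.saveModel(model, "sentiment-model.bin");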
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

import opennlp.tools.langdetect.Language;
import opennlp.tools.langdetect.LanguageDetectorME;
import opennlp.tools.langdetect.LanguageDetectorModel;

public class LanguageDetector {

    // Typed as LanguageDetectorME to avoid a name clash between this class
    // and OpenNLP's own LanguageDetector interface.
    private final LanguageDetectorME detector;

    public LanguageDetector() throws IOException {
        // Load language detection model
        try (InputStream modelIn = getClass()
                .getResourceAsStream("/models/langdetect.bin")) {
            detector = new LanguageDetectorME(new LanguageDetectorModel(modelIn));
        }
    }

    public Language detectLanguage(String text) {
        Language[] languages = detector.predictLanguages(text);
        return languages[0]; // Results are sorted; the first is most probable
    }

    public List<Language> detectLanguages(String text, double threshold) {
        Language[] languages = detector.predictLanguages(text);
        return Arrays.stream(languages)
            .filter(lang -> lang.getConfidence() >= threshold)
            .collect(Collectors.toList());
    }
}
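Usage sketch, assuming a pretrained language-detection model (such as the one published by Apache OpenNLP) is available on the classpath at /models/langdetect.bin:

LanguageDetector detector = new LanguageDetector();
Language best = detector.detectLanguage("Bonjour tout le monde");
System.out.println(best.getLang() + " " + best.getConfidence()); // e.g. "fra" with a high confidence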
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import opennlp.tools.langdetect.Language;
import opennlp.tools.langdetect.LanguageDetectorFactory;
import opennlp.tools.langdetect.LanguageDetectorME;
import opennlp.tools.langdetect.LanguageDetectorModel;
import opennlp.tools.langdetect.LanguageSample;
import opennlp.tools.util.ObjectStreamUtils;
import opennlp.tools.util.TrainingParameters;

public class LanguageModelTrainer {

    public LanguageDetectorModel train(Map<String, List<String>> languageTexts)
            throws IOException {
        // Prepare training data; LanguageSample wraps a Language object,
        // not a plain language-code string
        List<LanguageSample> samples = new ArrayList<>();
        for (Map.Entry<String, List<String>> entry : languageTexts.entrySet()) {
            Language language = new Language(entry.getKey());
            for (String text : entry.getValue()) {
                samples.add(new LanguageSample(language, text));
            }
        }
        // Train model
        TrainingParameters params = new TrainingParameters();
        params.put(TrainingParameters.ITERATIONS_PARAM, "100");
        params.put(TrainingParameters.CUTOFF_PARAM, "5");
        LanguageDetectorFactory factory = new LanguageDetectorFactory();
        return LanguageDetectorME.train(
            ObjectStreamUtils.createObjectStream(samples.toArray(new LanguageSample[0])),
            params,
            factory);
    }

    public void saveModel(LanguageDetectorModel model, String modelPath)
            throws IOException {
        try (OutputStream modelOut = new FileOutputStream(modelPath)) {
            model.serialize(modelOut);
        }
    }
}
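A toy training run; real language models need substantial text per language, and the sample phrases and output filename here are illustrative:

Map<String, List<String>> data = Map.of(
    "eng", List.of("the quick brown fox", "hello world"),
    "fra", List.of("le renard brun rapide", "bonjour le monde"));
LanguageModelTrainer trainer = new LanguageModelTrainer();
LanguageDetectorModel model = trainer.train(data);
trainer.saveModel(model, "langdetect-custom.bin");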
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Properties;

import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public class OptimizedNLPPipeline {

    private final StanfordCoreNLP pipeline;
    private final Map<String, Annotation> cache;
    private final int maxCacheSize;

    public OptimizedNLPPipeline(int maxCacheSize) {
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner");
        props.setProperty("threads", "4"); // Parallelize annotation where supported
        pipeline = new StanfordCoreNLP(props);
        this.maxCacheSize = maxCacheSize;
        // Access-ordered LinkedHashMap as a simple LRU cache.
        // Note: not thread-safe; synchronize externally (or use a cache
        // library) if shared across threads.
        this.cache = new LinkedHashMap<String, Annotation>(maxCacheSize + 1, 0.75f, true) {
            @Override
            protected boolean removeEldestEntry(Map.Entry<String, Annotation> eldest) {
                return size() > maxCacheSize;
            }
        };
    }

    public Annotation process(String text) {
        // Check cache first; a single get() also refreshes LRU order
        Annotation cached = cache.get(text);
        if (cached != null) {
            return cached;
        }
        // Process text and cache the result
        Annotation annotation = new Annotation(text);
        pipeline.annotate(annotation);
        cache.put(text, annotation);
        return annotation;
    }

    public void clearCache() {
        cache.clear();
    }
}
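Usage sketch: repeated texts are served from the cache instead of being re-annotated (the sample text is illustrative):

OptimizedNLPPipeline pipeline = new OptimizedNLPPipeline(1000);
Annotation first = pipeline.process("Stanford CoreNLP annotates this once.");
Annotation second = pipeline.process("Stanford CoreNLP annotates this once."); // cache hit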
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

import edu.stanford.nlp.pipeline.CoreDocument;

public class BatchProcessor {

    private final ExecutorService executor;
    private final NLPProcessor processor;

    public BatchProcessor(int threads) {
        this.executor = Executors.newFixedThreadPool(threads);
        this.processor = new NLPProcessor();
    }

    public List<CoreDocument> processBatch(List<String> texts) {
        List<Future<CoreDocument>> futures = new ArrayList<>();
        // Submit tasks
        for (String text : texts) {
            futures.add(executor.submit(() -> processor.processText(text)));
        }
        // Collect results; fail loudly rather than swallowing exceptions
        List<CoreDocument> results = new ArrayList<>();
        for (Future<CoreDocument> future : futures) {
            try {
                results.add(future.get());
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                throw new IllegalStateException("Batch processing interrupted", e);
            } catch (ExecutionException e) {
                throw new IllegalStateException("Failed to process document", e.getCause());
            }
        }
        return results;
    }

    public void shutdown() {
        executor.shutdown();
    }
}
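Usage, shutting the thread pool down when done:

BatchProcessor batch = new BatchProcessor(4);
try {
    List<CoreDocument> docs = batch.processBatch(List.of("First text.", "Second text."));
    System.out.println(docs.size() + " documents processed");
} finally {
    batch.shutdown();
}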
Natural Language Processing in Java offers powerful tools for text analysis and understanding. While implementing NLP solutions can be complex, libraries like Stanford CoreNLP and Apache OpenNLP make it accessible for a wide range of applications: CoreNLP provides a rich, integrated annotation pipeline, while OpenNLP is lighter-weight and gives fine-grained control over individual models.
Weigh your accuracy, latency, and memory requirements when choosing between them, and apply the caching, batching, and resource-management patterns above in production. As NLP technology continues to evolve, stay current with new releases of these libraries and with developments in the wider field.