WordStatistics.java

package org.xandercat.pmdb.util;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import org.xandercat.pmdb.util.format.FormatUtil;

/**
 * Collects case-insensitive occurrence counts for words and reports them ordered by frequency.
 * 
 * @author Scott Arnold
 */
public class WordStatistics {

	public static class WordCount implements Comparable<WordCount> {
		private String wordLc;
		private int count;
		public WordCount(String word) {
			this.wordLc = word.trim().toLowerCase();
		}
		protected void incrementCount() {
			this.count++;
		}
		public int getCount() {
			return count;
		}
		public String getWord() {
			return FormatUtil.titleCase(wordLc);
		}
		@Override
		public int compareTo(WordCount o) {
			return o.count - count;
		}
		public String toString() {
			return getWord() + " (" + count + ")";
		}
	}

	private Map<String, WordCount> wordCounts = new HashMap<String, WordCount>();
	
	public WordStatistics() {
	}
	
	/**
	 * Add words contained with provided string to the collected statistics.  Individual words within the 
	 * provided string can be separated by white space or commas.
	 * 
	 * @param words  string containing words to collect statistics on
	 */
	public void addWords(String words) {
		String[] wordsArray = words.split("\\s+");
		for (int i=0; i<wordsArray.length; i++) {
			String[] commaSplitWords = wordsArray[i].split(",");
			for (String word : commaSplitWords) {
				addWord(word);
			}
		}
	}
	
	private void addWord(String word) {
		word = word.trim().toLowerCase();
		WordCount wordCount = wordCounts.get(word);
		if (wordCount == null) {
			wordCount = new WordCount(word);
			wordCounts.put(word, wordCount);
		}
		wordCount.incrementCount();
	}
	
	/**
	 * Returns list of all word counts in sorted order from most frequent to least frequent.
	 * 
	 * @return list of all word counts from most frequent to least frequent
	 */
	public List<WordCount> getWordCounts() {
		return wordCounts.values().stream().sorted().collect(Collectors.toList());
	}

	/**
	 * Returns list of top word counts in sorted order.
	 * 
	 * @param top number of results to include
	 * 
	 * @return list of top word counts
	 */
	public List<WordCount> getTopWordCounts(int top) {
		if (wordCounts.size() == 0) {
			return Collections.emptyList();
		}
		return getWordCounts().subList(0, Math.min(top, wordCounts.size()));
	}
	
	/**
	 * Returns list of bottom word counts in sorted order.
	 * 
	 * @param bottom number of results to include
	 * 
	 * @return list of bottom word counts
	 */
	public List<WordCount> getBottomWordCounts(int bottom) {
		if (wordCounts.size() == 0) {
			return Collections.emptyList();
		}
		return getWordCounts().subList(Math.max(0, wordCounts.size()-bottom), wordCounts.size());
	}
	
	/**
	 * Returns list of word counts within collected statistics for words contained within the provided string of words.
	 * Provided string of words can have words separated by white space or commas.
	 * 
	 * @param words  words to get word counts for from the collected statistics
	 * 
	 * @return list of word counts for words within the provided words string
	 */
	public List<WordCount> getWordCountsForWords(String words) {
		List<WordCount> specificWordCounts = new ArrayList<WordCount>();
		if (FormatUtil.isNotBlank(words)) {
			String[] wordsArray = words.split("\\s+");
			for (int i=0; i<wordsArray.length; i++) {
				String[] commaSplitWords = wordsArray[i].split(",");
				for (String word : commaSplitWords) {
					WordCount wordCount = wordCounts.get(word.toLowerCase());
					if (wordCount != null) {
						specificWordCounts.add(wordCount);
					}
				}
			}
			Collections.sort(specificWordCounts);
		}
		return specificWordCounts;
	}
}