// TFIDFCalculator.java
package edu.odu.cs.cs350;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Utility methods for scoring words with TF-IDF (Term Frequency–Inverse Document Frequency)
 * over a collection of documents.
 *
 * <p>The inverse document frequency of a word {@code w} is {@code IDF(w) = ln(N / df(w))}, where
 * <ul>
 *   <li>{@code N} is the total number of documents in the collection, and</li>
 *   <li>{@code df(w)} is the number of those documents that contain {@code w}.</li>
 * </ul>
 * A word's TF-IDF score within a document is its term frequency multiplied by its IDF.
 */
public class TFIDFCalculator {

    /**
     * Builds the IDF table for every word that occurs in at least one of the given documents.
     *
     * <p>A document counts as containing a word when the word is a key of the map returned by
     * {@code getWords()}; each document therefore contributes at most 1 to a word's document
     * frequency. A word present in every document gets an IDF of {@code ln(1) = 0}.
     *
     * @param documents The documents that make up the collection.
     * @return A new map from each word to its IDF value (natural logarithm); empty when
     *         {@code documents} is empty.
     */
    public static Map<String, Double> computeIDF(List<Document> documents) {
        final int corpusSize = documents.size();

        // Tally, per word, how many documents list it among their word-map keys.
        Map<String, Integer> containingDocs = new HashMap<>();
        for (Document current : documents) {
            for (String term : current.getWords().keySet()) {
                containingDocs.merge(term, 1, Integer::sum);
            }
        }

        // Turn each document count into ln(N / df); the cast forces floating-point division.
        Map<String, Double> idfByWord = new HashMap<>();
        containingDocs.forEach((term, count) ->
                idfByWord.put(term, Math.log((double) corpusSize / count)));
        return idfByWord;
    }

    /**
     * Scores every word of one document by multiplying its term frequency with its IDF.
     *
     * <p>Term frequencies are obtained from {@code TFCalculator.computeTF(document)}. A word that
     * has no entry in {@code inverseDocumentFrequencyMap} is treated as having an IDF of
     * {@code 0.0}, so its TF-IDF score is {@code 0.0} as well.
     *
     * @param document The document whose words are to be scored.
     * @param inverseDocumentFrequencyMap Precomputed IDF values, keyed by word.
     * @return A new map from each word in the term-frequency map to its TF-IDF score.
     */
    public static Map<String, Double> computeTFIDF(Document document, Map<String, Double> inverseDocumentFrequencyMap) {
        Map<String, Double> scores = new HashMap<>();
        for (Map.Entry<String, Double> tfEntry : TFCalculator.computeTF(document).entrySet()) {
            String term = tfEntry.getKey();
            double tf = tfEntry.getValue();
            // Words unknown to the IDF table fall back to 0.0 rather than failing.
            double idf = inverseDocumentFrequencyMap.getOrDefault(term, 0.0);
            scores.put(term, tf * idf);
        }
        return scores;
    }
}