// TFIDFCalculator.java

package edu.odu.cs.cs350;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Utility for TF-IDF (Term Frequency–Inverse Document Frequency) scoring over a
 * collection of documents.
 *
 * The inverse document frequency of a word w is IDF(w) = ln(N / df(w)), where:
 * - N is the number of documents in the collection
 * - df(w) is the number of those documents whose word set contains w
 *
 * The natural logarithm ({@link Math#log(double)}) is used, so a word that occurs
 * in every document receives an IDF of 0.
 */
public class TFIDFCalculator {

    /**
     * Builds the IDF table for every word that occurs in at least one of the given documents.
     *
     * @param documents The documents making up the collection.
     * @return A map from each word to {@code ln(N / df(word))}; empty when no words are found.
     */
    public static Map<String, Double> computeIDF(List<Document> documents) {
        final int corpusSize = documents.size();

        // Tally document frequency: iterating the key set means a word is counted
        // at most once per document, regardless of how often it occurs inside it.
        Map<String, Integer> containingDocs = new HashMap<>();
        for (Document current : documents) {
            for (String term : current.getWords().keySet()) {
                containingDocs.merge(term, 1, Integer::sum);
            }
        }

        // Turn each document count into its IDF value. The cast forces floating-point
        // division so the ratio is not truncated before taking the logarithm.
        Map<String, Double> idfByWord = new HashMap<>();
        containingDocs.forEach((term, count) -> {
            double ratio = (double) corpusSize / count;
            idfByWord.put(term, Math.log(ratio));
        });

        return idfByWord;
    }

    /**
     * Scores every word of a single document by multiplying its term frequency with its IDF.
     *
     * Term frequencies are obtained from {@code TFCalculator.computeTF(document)}. A word that
     * has no entry in the supplied IDF table is treated as having an IDF of 0, and therefore
     * gets a TF-IDF score of 0.
     *
     * @param document The document to compute TF-IDF for.
     * @param inverseDocumentFrequencyMap The precomputed IDF values, keyed by word.
     * @return A map from each word of the document to its TF-IDF score.
     */
    public static Map<String, Double> computeTFIDF(Document document, Map<String, Double> inverseDocumentFrequencyMap) {
        Map<String, Double> scores = new HashMap<>();

        TFCalculator.computeTF(document).forEach((term, frequency) -> {
            double tf = frequency;
            // Words missing from the IDF table fall back to 0.0.
            double idf = inverseDocumentFrequencyMap.getOrDefault(term, 0.0);
            scores.put(term, tf * idf);
        });

        return scores;
    }
}