// FileProcessor.java

package edu.odu.cs.cs350;

import java.util.List;
import java.util.Map;

/**
 * Command-line entry point for building a {@link Corpus} from a set of text
 * files and computing term frequency (TF) and TF-IDF values for analysis.
 *
 * <p>Each file path should be supplied as a command-line argument. For example:
 *
 * <pre>java FileProcessor file1.txt file2.txt file3.txt</pre>
 *
 * <p>This class uses {@link txtFileProcessor} to process each file into a
 * {@link Document}, then adds those documents to a {@link Corpus}. Once
 * all documents are processed, it computes and prints TF-IDF values for
 * the top terms in each document.
 */
public class FileProcessor {

    /** Number of top-ranked terms printed per document. */
    private static final int TOP_TERMS = 10;

    /** Utility class holding only {@code main}; not meant to be instantiated. */
    private FileProcessor() {
    }

    /**
     * Main method that runs the file processing pipeline: parse each file into
     * a {@link Document}, build the {@link Corpus}, compute IDF across it, then
     * print the top {@value #TOP_TERMS} TF-IDF terms for each document.
     *
     * @param inputArguments file paths to process; each should point to a valid .txt file
     */
    public static void main(String[] inputArguments) {
        if (inputArguments.length == 0) {
            System.err.println("Usage: java FileProcessor <file1> <file2> ...");
            return;
        }

        // Step 1: Create a new corpus
        Corpus corpus = new Corpus();

        // Step 2: Process each file path into a Document; unreadable files are
        // reported and skipped rather than aborting the whole run.
        for (String filePath : inputArguments) {
            Document doc = new txtFileProcessor(filePath).processFile();
            if (doc != null) {
                corpus.addDocument(doc);
                System.out.println("Processed: " + doc.getName());
            } else {
                System.err.println("Skipping invalid or unreadable file: " + filePath);
            }
        }

        // Guard: if every input file was skipped there is nothing to analyze,
        // and computing IDF over an empty corpus is meaningless (and may divide
        // by a zero document count inside the calculator).
        List<Document> allDocs = corpus.getDocuments();
        if (allDocs.isEmpty()) {
            System.err.println("No valid documents were processed; nothing to analyze.");
            return;
        }

        // Step 3: Compute IDF across the corpus
        Map<String, Double> inverseDocumentFrequencyMap = TFIDFCalculator.computeIDF(allDocs);

        // Step 4: Compute and display TF-IDF results for each document
        for (Document doc : allDocs) {
            System.out.println("\n=== TF-IDF for " + doc.getName() + " ===");
            Map<String, Double> termFrequencyInverseDocumentFrequency =
                TFIDFCalculator.computeTFIDF(doc, inverseDocumentFrequencyMap);

            // Sort descending by TF-IDF score, then print only the
            // highest-weighted terms for this document.
            termFrequencyInverseDocumentFrequency.entrySet().stream()
                .sorted(Map.Entry.<String, Double>comparingByValue().reversed())
                .limit(TOP_TERMS)
                .forEach(entry ->
                    System.out.printf("%s: %.5f%n", entry.getKey(), entry.getValue())
                );
        }

        System.out.println("\nCorpus processing complete. Total documents: " + corpus.getTotalDocuments());
    }
}