// FileProcessor.java
package edu.odu.cs.cs350;
import java.util.List;
import java.util.Map;
/**
* The main entry point for building a {@link Corpus} from a set of text files,
* and computing term frequency (TF) and TF-IDF values for analysis.
*
* Each file path should be supplied as a command-line argument. For example:
*
* java FileProcessor file1.txt file2.txt file3.txt
*
* This class uses {@link txtFileProcessor} to process each file into a
* {@link Document}, then adds those documents to a {@link Corpus}. Once
* all documents are processed, it computes and prints TF-IDF values for
* the top terms in each document.
*/
public class FileProcessor {

    /** Number of top-ranked terms printed per document. */
    private static final int TOP_TERMS = 10;

    /** Entry-point class; not meant to be instantiated. */
    private FileProcessor() {
    }

    /**
     * Main method that runs the file processing pipeline.
     *
     * Exits early (with a message on stderr) when no arguments are given or
     * when none of the supplied paths could be processed into a document.
     *
     * @param inputArguments file paths to process; each should point to a valid .txt file
     */
    public static void main(String[] inputArguments) {
        if (inputArguments.length == 0) {
            System.err.println("Usage: java FileProcessor <file1> <file2> ...");
            return;
        }
        // Step 1: Create a new corpus
        Corpus corpus = new Corpus();
        // Step 2: Process each file path into a Document
        for (String filePath : inputArguments) {
            Document doc = new txtFileProcessor(filePath).processFile();
            if (doc != null) {
                corpus.addDocument(doc);
                System.out.println("Processed: " + doc.getName());
            } else {
                System.err.println("Skipping invalid or unreadable file: " + filePath);
            }
        }
        // Step 3: Compute IDF across the corpus
        List<Document> allDocs = corpus.getDocuments();
        // Guard: if every file was skipped there is nothing to analyze,
        // and computing IDF over an empty corpus would be meaningless.
        if (allDocs.isEmpty()) {
            System.err.println("No valid documents were processed; nothing to analyze.");
            return;
        }
        Map<String, Double> inverseDocumentFrequencyMap = TFIDFCalculator.computeIDF(allDocs);
        // Step 4: Compute and display TF-IDF results for each document
        for (Document doc : allDocs) {
            System.out.println("\n=== TF-IDF for " + doc.getName() + " ===");
            Map<String, Double> termFrequencyInverseDocumentFrequency =
                    TFIDFCalculator.computeTFIDF(doc, inverseDocumentFrequencyMap);
            // Sort descending by TF-IDF score and print the top terms.
            termFrequencyInverseDocumentFrequency.entrySet().stream()
                .sorted(Map.Entry.<String, Double>comparingByValue().reversed())
                .limit(TOP_TERMS)
                .forEach(entry ->
                    System.out.printf("%s: %.5f%n", entry.getKey(), entry.getValue())
                );
        }
        System.out.println("\nCorpus processing complete. Total documents: " + corpus.getTotalDocuments());
    }
}