// txtFileProcessor.java

package edu.odu.cs.cs350;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.util.Locale;
import java.util.Scanner;
import java.util.Set;
import java.util.regex.Pattern;
 
/**
 * A utility class for processing plain text (.txt) files.
 *
 * <p>An instance is bound to one file path. {@link #processFile()} tokenizes
 * that file into lower-case words, drops English stop words, and hands the
 * remaining words to a {@code Document}.
 */
public class txtFileProcessor {

    /**
     * English stop words that are skipped during processing.
     * All entries are lower-case; the set is immutable.
     */
    private static final Set<String> STOP_WORDS = Set.of(
        "a","about","above","after","again","against","all","am","an","and","any","are","aren't",
        "as","at","be","because","been","before","being","below","between","both","but","by",
        "could","couldn't","did","didn't","do","does","doesn't","doing","don't","down","during",
        "each","few","for","from","further","had","hadn't","has","hasn't","have","haven't","having",
        "he","he'd","he'll","he's","her","here","here's","hers","herself","him","himself","his",
        "how","how's","i","i'd","i'll","i'm","i've","if","in","into","is","isn't","it","it's","its",
        "itself","let's","me","more","most","mustn't","my","myself","no","nor","not","of","off","on",
        "once","only","or","other","ought","our","ours","ourselves","out","over","own","same","shan't",
        "she","she'd","she'll","she's","should","shouldn't","so","some","such","than","that","that's",
        "the","their","theirs","them","themselves","then","there","there's","these","they","they'd",
        "they'll","they're","they've","this","those","through","to","too","under","until","up","very",
        "was","wasn't","we","we'd","we'll","we're","we've","were","weren't","what","what's","when",
        "when's","where","where's","which","while","who","who's","whom","why","why's","with","won't",
        "would","wouldn't","you","you'd","you'll","you're","you've","your","yours","yourself",
        "yourselves"
    );

    /**
     * Token delimiter: any run of characters that are not ASCII letters,
     * digits, apostrophes, or hyphens. Compiled once and shared.
     */
    private static final Pattern TOKEN_DELIMITER = Pattern.compile("[^A-Za-z0-9'-]+");

    /**
     * Matches apostrophes/hyphens at the very start or very end of a token.
     * Interior ones (as in "don't" or "well-known") are left alone.
     */
    private static final Pattern EDGE_PUNCTUATION = Pattern.compile("^['-]+|['-]+$");

    /**
     * The path to the text file read in to the processor.
     */
    private final String filePath;

    /**
     * Constructor for txtFileProcessor.
     *
     * @param filePath the path to the text file to be processed
     */
    public txtFileProcessor(String filePath) {
        this.filePath = filePath;
    }

    /**
     * Returns the file path associated with this text file processor.
     *
     * @return the file path
     */
    public String getFilePath() {
        return filePath;
    }

    /**
     * Reads the text file and converts it into a Document object,
     * using tokenization and removing common stopwords.
     *
     * <p>Each token is lower-cased (locale-independently) and has leading and
     * trailing apostrophes/hyphens removed before the stop-word check, so
     * quoted words such as {@code 'the'} are still filtered and stand-alone
     * punctuation such as {@code -} or {@code --} is never added as a word.
     *
     * <p>Failures are reported on {@code System.err} and signalled by a
     * {@code null} return value; no exception is thrown for a missing,
     * unreadable, or {@code null} path.
     *
     * @return a Document object representing the file's contents, or null if
     *         the path is null, does not name a regular file, cannot be
     *         opened, or an I/O error occurs while reading
     */
    public Document processFile() {
        // new File(null) would throw; treat a null path like any other invalid file.
        if (filePath == null) {
            System.err.println("Invalid file: " + filePath);
            return null;
        }

        File inFile = new File(filePath);
        if (!inFile.exists() || !inFile.isFile()) {
            System.err.println("Invalid file: " + filePath);
            return null;
        }

        Document doc = new Document(inFile.getName());

        // Read through an InputStream with an explicit charset so the result does
        // not depend on the platform default encoding, and so that bytes which are
        // not valid UTF-8 are decoded leniently instead of cutting the scan short.
        // Non-ASCII characters act as delimiters either way.
        try (Scanner scanner = new Scanner(new FileInputStream(inFile), "UTF-8")) {
            scanner.useDelimiter(TOKEN_DELIMITER);

            while (scanner.hasNext()) {
                String word = normalize(scanner.next());
                if (!word.isEmpty() && !STOP_WORDS.contains(word)) {
                    doc.addWord(word);
                }
            }

            // Scanner treats an IOException from the underlying source as end of
            // input; surface it rather than returning a silently truncated document.
            if (scanner.ioException() != null) {
                System.err.println("Error reading file: " + filePath);
                return null;
            }

        } catch (FileNotFoundException e) {
            System.err.println("Could not open file: " + filePath);
            return null;
        }

        return doc;
    }

    /**
     * Lower-cases a raw token and trims apostrophes/hyphens from both ends.
     *
     * @param token a raw token produced by the scanner
     * @return the normalized word; empty if the token was only punctuation
     */
    private static String normalize(String token) {
        // Locale.ROOT keeps casing stable regardless of the JVM's default locale.
        String lowered = token.toLowerCase(Locale.ROOT);
        return EDGE_PUNCTUATION.matcher(lowered).replaceAll("");
    }
}