// Corpus.java

package edu.odu.cs.cs350;

import java.util.ArrayList;
import java.util.List;
import java.util.Objects;

/**
 * Represents a collection (corpus) of {@link Document} objects.
 * <p>
 * The Corpus class keeps its documents in insertion order and provides
 * corpus-level statistics, such as how many documents contain a given
 * word.
 * </p>
 * <p>
 * {@code null} documents are rejected when they are added, so the
 * statistics methods never have to deal with a missing document.
 * </p>
 */
public class Corpus {
    /** The list of documents contained in this corpus, in insertion order. */
    private final List<Document> documents;

    /**
     * Constructs an empty Corpus.
     */
    public Corpus() {
        documents = new ArrayList<>();
    }

    /**
     * Adds a {@link Document} to the end of this corpus.
     *
     * @param doc the document to add; must not be {@code null}
     * @throws NullPointerException if {@code doc} is {@code null}
     */
    public void addDocument(Document doc) {
        // Fail fast: a null entry would otherwise only surface later as an
        // unexplained NullPointerException in getDocumentCountContaining.
        documents.add(Objects.requireNonNull(doc, "doc must not be null"));
    }

    /**
     * Returns the list of all {@link Document} objects in this corpus.
     * <p>
     * The returned list is the corpus's own backing list, not a copy:
     * changes made through it are visible to this corpus and vice versa.
     * Prefer {@link #addDocument(Document)} for adding documents.
     * </p>
     *
     * @return the live list of documents, in insertion order
     */
    public List<Document> getDocuments() {
        return documents;
    }

    /**
     * Returns the total number of documents in this corpus.
     *
     * @return the number of documents
     */
    public int getTotalDocuments() {
        return documents.size();
    }

    /**
     * Counts how many documents in the corpus contain a given word.
     * <p>
     * A document counts once if the map returned by its
     * {@code getWords()} method has {@code word} as a key, regardless of
     * how often the word occurs in that document. This is useful for
     * computing the inverse document frequency (IDF) part of the TF-IDF
     * calculation.
     * </p>
     *
     * @param word the word to search for
     * @return the number of documents containing the given word;
     *         {@code 0} if the corpus is empty
     */
    public int getDocumentCountContaining(String word) {
        int matchingDocuments = 0;
        for (Document doc : documents) {
            if (doc.getWords().containsKey(word)) {
                matchingDocuments++;
            }
        }
        return matchingDocuments;
    }
}