// pdfFileProcessor.java

package edu.odu.cs.cs350;

import java.io.File;
import java.io.IOException;
import java.util.Locale;
import java.util.regex.Pattern;

import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

/**
 * A service class to extract text from PDF files using Apache PDFBox. Modified
 * from https://github.com/tvalva/pdfwordscan
 */
public class pdfFileProcessor {

    /**
     * Matches every ASCII punctuation character except the apostrophe and the
     * hyphen, so contractions ("don't") and hyphenated words survive intact.
     */
    private static final Pattern STRIPPED_PUNCTUATION = Pattern.compile("[\\p{Punct}&&[^'-]]");

    /** Matches a run of one or more whitespace characters; used as the word separator. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /**
     * Extract text from the PDF file and return it as a Document object.
     *
     * <p>The extracted text is stripped of punctuation (apostrophes and hyphens
     * are kept), lower-cased, and split on whitespace. Empty tokens and any
     * word for which {@code WordFilter.contains} returns {@code true} are
     * skipped; every remaining word is added to the returned Document.
     *
     * @param inFile the PDF file to be processed
     * @return a Document object containing the extracted text and file name
     * @throws IOException if there is an error reading the PDF, or if
     *                     {@code DocumentLength.isValidLength} rejects the file
     */
    public static Document processFile(File inFile) throws IOException {
        // The limit itself is enforced by DocumentLength; the message below
        // assumes that limit is 50 pages -- keep the two in sync.
        if (!DocumentLength.isValidLength(inFile)) {
            throw new IOException("Input PDF exceeds maximum allowed length of 50 pages.");
        }

        // Only the raw text is needed from the PDF, so the document is closed
        // as soon as the text has been pulled out of it.
        String rawText;
        try (PDDocument document = Loader.loadPDF(inFile)) {
            rawText = new PDFTextStripper().getText(document);
        }

        Document outputDocument = new Document(inFile.getName());
        for (String word : tokenize(rawText)) {
            // Splitting yields an empty token when the text is empty or starts
            // with whitespace; an empty string is not a word, so drop it.
            if (!word.isEmpty() && !WordFilter.contains(word)) {
                outputDocument.addWord(word);
            }
        }
        return outputDocument;
    }

    /**
     * Normalize raw text and break it into whitespace-separated tokens.
     *
     * @param text the raw text to tokenize
     * @return the lower-cased, punctuation-stripped tokens; may contain an
     *         empty string when {@code text} is empty or begins with whitespace
     */
    private static String[] tokenize(String text) {
        String withoutPunctuation = STRIPPED_PUNCTUATION.matcher(text).replaceAll("");
        // Locale.ROOT keeps the result independent of the JVM's default locale
        // (e.g. a Turkish locale would otherwise map 'I' to a dotless 'i').
        String normalized = withoutPunctuation.toLowerCase(Locale.ROOT);
        return WHITESPACE.split(normalized);
    }
}