// pdfFileProcessor.java
package edu.odu.cs.cs350;
import java.io.File;
import java.io.IOException;
import java.util.Locale;
import java.util.regex.Pattern;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
/**
 * A service class to extract text from PDF files using Apache PDFBox. Modified
 * from https://github.com/tvalva/pdfwordscan
 */
public class pdfFileProcessor {
    /**
     * Matches every ASCII punctuation character except the apostrophe and the
     * hyphen, which are kept because they can occur inside words.
     */
    private static final Pattern REMOVED_PUNCTUATION = Pattern.compile("[\\p{Punct}&&[^'-]]");

    /** Matches a run of one or more whitespace characters (the word separator). */
    private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s+");

    /**
     * Extract text from the PDF file and return it as a Document object.
     *
     * <p>The extracted text has punctuation (other than {@code '} and {@code -})
     * removed, is lower-cased independently of the JVM's default locale, and is
     * split on whitespace. Empty tokens are discarded, and any word for which
     * {@code WordFilter.contains} returns {@code true} is left out.
     *
     * @return a Document object containing the extracted text and file name
     * @param inFile the PDF file to be processed
     * @throws IOException if there is an error reading the PDF, or if
     *         {@code DocumentLength.isValidLength} rejects the file
     */
    public static Document processFile(File inFile) throws IOException {
        if (!DocumentLength.isValidLength(inFile)) {
            throw new IOException("Input PDF exceeds maximum allowed length of 50 pages.");
        }

        String rawText;
        // try-with-resources closes the PDF as soon as the text has been read.
        try (PDDocument document = Loader.loadPDF(inFile)) {
            PDFTextStripper stripper = new PDFTextStripper();
            rawText = stripper.getText(document);
        }

        // Locale.ROOT keeps lower-casing identical on every machine (for example,
        // a Turkish default locale would otherwise map "I" to a dotless "ı").
        String normalizedText = REMOVED_PUNCTUATION.matcher(rawText)
                .replaceAll("")
                .toLowerCase(Locale.ROOT);
        String[] unfilteredWords = WHITESPACE_RUN.split(normalizedText);

        Document outputDocument = new Document(inFile.getName());
        for (String word : unfilteredWords) {
            // Splitting produces an empty first token when the text is empty or
            // begins with whitespace; an empty string is not a word.
            if (word.isEmpty()) {
                continue;
            }
            if (!WordFilter.contains(word)) {
                outputDocument.addWord(word);
            }
        }
        return outputDocument;
    }
}