Remove the need for a reference to the PDF document

Matt Charters · Matt Charters · commit cfeeaac53dde · 2016-01-11T16:59:30.000-05:00
diff --git a/src/main/java/technology/tabula/CommandLineApp.java b/src/main/java/technology/tabula/CommandLineApp.java
@@ -156,7 +156,7 @@ public void extractTables(CommandLine line) throws ParseException {
                         // guess the page areas to extract using a detection algorithm
                         // currently we only have a detector that uses spreadsheets to find table areas
                         DetectionAlgorithm detector = new NurminenDetectionAlgorithm();
-                        List<Rectangle> guesses = detector.detect(page, pdfFile);
+                        List<Rectangle> guesses = detector.detect(page);
 
                         for (Rectangle guessRect : guesses) {
                             Page guess = page.getArea(guessRect);
diff --git a/src/main/java/technology/tabula/detectors/DetectionAlgorithm.java b/src/main/java/technology/tabula/detectors/DetectionAlgorithm.java
@@ -11,5 +11,5 @@
  * Created by matt on 2015-12-14.
  */
 public interface DetectionAlgorithm {
-    List<Rectangle> detect(Page page, File referenceDocument);
+    List<Rectangle> detect(Page page);
 }
diff --git a/src/main/java/technology/tabula/detectors/NurminenDetectionAlgorithm.java b/src/main/java/technology/tabula/detectors/NurminenDetectionAlgorithm.java
@@ -101,34 +101,17 @@ public RelevantEdges(int edgeType, int edgeCount) {
         }
     }
 
-    // for debugging
-    private File currentDoc;
-    private Page currentPage;
-    private PDPage currentPDPage;
-
     @Override
-    public List<Rectangle> detect(Page page, File referenceDocument) {
-
-        // open a PDDocument to read stuff in
-        PDDocument pdfDocument;
-        try {
-            pdfDocument = PDDocument.load(referenceDocument);
-        } catch (Exception e) {
-            return new ArrayList<Rectangle>();
-        }
-
-        // get the page in question (and keep refs for debugging)
-        this.currentPDPage = (PDPage) pdfDocument.getDocumentCatalog().getAllPages().get(page.getPageNumber() - 1);
-        this.currentDoc = referenceDocument;
-        this.currentPage = page;
+    public List<Rectangle> detect(Page page) {
 
         // get horizontal & vertical lines
         // we get these from an image of the PDF and not the PDF itself because sometimes there are invisible PDF
         // instructions that are interpreted incorrectly as visible elements - we really want to capture what a
         // person sees when they look at the PDF
         BufferedImage image;
+        PDPage pdfPage = page.getPDPage();
         try {
-            image = this.currentPDPage.convertToImage(BufferedImage.TYPE_BYTE_GRAY, 144);
+            image = pdfPage.convertToImage(BufferedImage.TYPE_BYTE_GRAY, 144);
         } catch (IOException e) {
             return new ArrayList<Rectangle>();
         }
@@ -137,8 +120,8 @@ public List<Rectangle> detect(Page page, File referenceDocument) {
 
         // now check the page for vertical lines, but remove the text first to make things less confusing
         try {
-            this.removeText(pdfDocument, this.currentPDPage);
-            image = this.currentPDPage.convertToImage(BufferedImage.TYPE_BYTE_GRAY, 144);
+            this.removeText(pdfPage);
+            image = pdfPage.convertToImage(BufferedImage.TYPE_BYTE_GRAY, 144);
         } catch (Exception e) {
             return new ArrayList<Rectangle>();
         }
@@ -323,10 +306,6 @@ public int compare(Rectangle o1, Rectangle o2) {
 
         tableSet.addAll(tableAreas);
 
-        this.currentDoc = null;
-        this.currentPage = null;
-        this.currentPDPage = null;
-
         return new ArrayList<Rectangle>(tableSet);
     }
 
@@ -846,7 +825,7 @@ private List<Ruling> getVerticalRulings(BufferedImage image) {
     }
 
     // taken from http://www.docjar.com/html/api/org/apache/pdfbox/examples/util/RemoveAllText.java.html
-    private void removeText(PDDocument document, PDPage page) throws IOException {
+    private void removeText(PDPage page) throws IOException {
         PDFStreamParser parser = new PDFStreamParser(page.getContents());
         parser.parse();
 
@@ -865,40 +844,13 @@ private void removeText(PDDocument document, PDPage page) throws IOException {
             newTokens.add(token);
         }
 
+        PDDocument document = new PDDocument();
+        document.addPage(page);
+
         PDStream newContents = new PDStream(document);
         ContentStreamWriter writer = new ContentStreamWriter(newContents.createOutputStream());
         writer.writeTokens(newTokens);
         newContents.addCompression();
         page.setContents(newContents);
     }
-
-    private void debug(Collection<? extends Shape> shapes) {
-        this.debug(shapes, false);
-    }
-
-    private void debug(Collection<? extends Shape> shapes, boolean twox) {
-        Color[] COLORS = { new Color(27, 158, 119),
-                new Color(217, 95, 2), new Color(117, 112, 179),
-                new Color(231, 41, 138), new Color(102, 166, 30) };
-
-        try {
-            int res = twox ? 144 : 72;
-
-            BufferedImage image = this.currentPDPage.convertToImage(BufferedImage.TYPE_INT_RGB, res);
-            Graphics2D g = (Graphics2D) image.getGraphics();
-
-            g.setStroke(new BasicStroke(2f));
-            int i = 0;
-
-            for (Shape s : shapes) {
-                g.setColor(COLORS[(i++) % 5]);
-                g.draw(s);
-            }
-
-            String debugFileOut = this.currentDoc.getAbsolutePath().replace(".pdf", "-" + this.currentPage.getPageNumber() + ".jpg");
-
-            ImageIOUtil.writeImage(image, debugFileOut, res);
-        } catch (IOException e) {
-        }
-    }
 }
diff --git a/src/main/java/technology/tabula/detectors/SpreadsheetDetectionAlgorithm.java b/src/main/java/technology/tabula/detectors/SpreadsheetDetectionAlgorithm.java
@@ -20,7 +20,7 @@
  */
 public class SpreadsheetDetectionAlgorithm implements DetectionAlgorithm {
     @Override
-    public List<Rectangle> detect(Page page, File referenceDocument) {
+    public List<Rectangle> detect(Page page) {
         List<Cell> cells = SpreadsheetExtractionAlgorithm.findCells(page.getHorizontalRulings(), page.getVerticalRulings());
 
         SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
diff --git a/src/test/java/technology/tabula/TestTableDetection.java b/src/test/java/technology/tabula/TestTableDetection.java
@@ -181,7 +181,7 @@ public void testDetectionOfTables() throws Exception {
         PageIterator pages = extractor.extract();
         while (pages.hasNext()) {
             Page page = pages.next();
-            List<Rectangle> tablesOnPage = detectionAlgorithm.detect(page, this.pdf);
+            List<Rectangle> tablesOnPage = detectionAlgorithm.detect(page);
             if (tablesOnPage.size() > 0) {
                 detectedTables.put(new Integer(page.getPageNumber()), tablesOnPage);
             }

Original file line number	Diff line number	Diff line change
`@@ -11,5 +11,5 @@`
`11`	`11`	`* Created by matt on 2015-12-14.`
`12`	`12`	`*/`
`13`	`13`	`public interface DetectionAlgorithm {`
`14`		`- List<Rectangle> detect(Page page, File referenceDocument);`
	`14`	`+ List<Rectangle> detect(Page page);`
`15`	`15`	`}`
Original file line number	Diff line number	Diff line change
`@@ -181,7 +181,7 @@ public void testDetectionOfTables() throws Exception {`
`181`	`181`	`PageIterator pages = extractor.extract();`
`182`	`182`	`while (pages.hasNext()) {`
`183`	`183`	`Page page = pages.next();`
`184`		`- List<Rectangle> tablesOnPage = detectionAlgorithm.detect(page, this.pdf);`
	`184`	`+ List<Rectangle> tablesOnPage = detectionAlgorithm.detect(page);`
`185`	`185`	`if (tablesOnPage.size() > 0) {`
`186`	`186`	`detectedTables.put(new Integer(page.getPageNumber()), tablesOnPage);`
`187`	`187`	`}`