Skip to content

Commit cfeeaac

Browse files
author
Matt Charters
committed
Remove the need for a reference to the pdf document
1 parent 1f948c3 commit cfeeaac

5 files changed

Lines changed: 13 additions & 61 deletions

File tree

src/main/java/technology/tabula/CommandLineApp.java

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -156,7 +156,7 @@ public void extractTables(CommandLine line) throws ParseException {
156156
// guess the page areas to extract using a detection algorithm
157157
// currently we only have a detector that uses spreadsheets to find table areas
158158
DetectionAlgorithm detector = new NurminenDetectionAlgorithm();
159-
List<Rectangle> guesses = detector.detect(page, pdfFile);
159+
List<Rectangle> guesses = detector.detect(page);
160160

161161
for (Rectangle guessRect : guesses) {
162162
Page guess = page.getArea(guessRect);

src/main/java/technology/tabula/detectors/DetectionAlgorithm.java

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,5 +11,5 @@
1111
* Created by matt on 2015-12-14.
1212
*/
1313
public interface DetectionAlgorithm {
14-
List<Rectangle> detect(Page page, File referenceDocument);
14+
List<Rectangle> detect(Page page);
1515
}

src/main/java/technology/tabula/detectors/NurminenDetectionAlgorithm.java

Lines changed: 9 additions & 57 deletions
Original file line number | Diff line number | Diff line change
@@ -101,34 +101,17 @@ public RelevantEdges(int edgeType, int edgeCount) {
101101
}
102102
}
103103

104-
// for debugging
105-
private File currentDoc;
106-
private Page currentPage;
107-
private PDPage currentPDPage;
108-
109104
@Override
110-
public List<Rectangle> detect(Page page, File referenceDocument) {
111-
112-
// open a PDDocument to read stuff in
113-
PDDocument pdfDocument;
114-
try {
115-
pdfDocument = PDDocument.load(referenceDocument);
116-
} catch (Exception e) {
117-
return new ArrayList<Rectangle>();
118-
}
119-
120-
// get the page in question (and keep refs for debugging)
121-
this.currentPDPage = (PDPage) pdfDocument.getDocumentCatalog().getAllPages().get(page.getPageNumber() - 1);
122-
this.currentDoc = referenceDocument;
123-
this.currentPage = page;
105+
public List<Rectangle> detect(Page page) {
124106

125107
// get horizontal & vertical lines
126108
// we get these from an image of the PDF and not the PDF itself because sometimes there are invisible PDF
127109
// instructions that are interpreted incorrectly as visible elements - we really want to capture what a
128110
// person sees when they look at the PDF
129111
BufferedImage image;
112+
PDPage pdfPage = page.getPDPage();
130113
try {
131-
image = this.currentPDPage.convertToImage(BufferedImage.TYPE_BYTE_GRAY, 144);
114+
image = pdfPage.convertToImage(BufferedImage.TYPE_BYTE_GRAY, 144);
132115
} catch (IOException e) {
133116
return new ArrayList<Rectangle>();
134117
}
@@ -137,8 +120,8 @@ public List<Rectangle> detect(Page page, File referenceDocument) {
137120

138121
// now check the page for vertical lines, but remove the text first to make things less confusing
139122
try {
140-
this.removeText(pdfDocument, this.currentPDPage);
141-
image = this.currentPDPage.convertToImage(BufferedImage.TYPE_BYTE_GRAY, 144);
123+
this.removeText(pdfPage);
124+
image = pdfPage.convertToImage(BufferedImage.TYPE_BYTE_GRAY, 144);
142125
} catch (Exception e) {
143126
return new ArrayList<Rectangle>();
144127
}
@@ -323,10 +306,6 @@ public int compare(Rectangle o1, Rectangle o2) {
323306

324307
tableSet.addAll(tableAreas);
325308

326-
this.currentDoc = null;
327-
this.currentPage = null;
328-
this.currentPDPage = null;
329-
330309
return new ArrayList<Rectangle>(tableSet);
331310
}
332311

@@ -846,7 +825,7 @@ private List<Ruling> getVerticalRulings(BufferedImage image) {
846825
}
847826

848827
// taken from http://www.docjar.com/html/api/org/apache/pdfbox/examples/util/RemoveAllText.java.html
849-
private void removeText(PDDocument document, PDPage page) throws IOException {
828+
private void removeText(PDPage page) throws IOException {
850829
PDFStreamParser parser = new PDFStreamParser(page.getContents());
851830
parser.parse();
852831

@@ -865,40 +844,13 @@ private void removeText(PDDocument document, PDPage page) throws IOException {
865844
newTokens.add(token);
866845
}
867846

847+
PDDocument document = new PDDocument();
848+
document.addPage(page);
849+
868850
PDStream newContents = new PDStream(document);
869851
ContentStreamWriter writer = new ContentStreamWriter(newContents.createOutputStream());
870852
writer.writeTokens(newTokens);
871853
newContents.addCompression();
872854
page.setContents(newContents);
873855
}
874-
875-
private void debug(Collection<? extends Shape> shapes) {
876-
this.debug(shapes, false);
877-
}
878-
879-
private void debug(Collection<? extends Shape> shapes, boolean twox) {
880-
Color[] COLORS = { new Color(27, 158, 119),
881-
new Color(217, 95, 2), new Color(117, 112, 179),
882-
new Color(231, 41, 138), new Color(102, 166, 30) };
883-
884-
try {
885-
int res = twox ? 144 : 72;
886-
887-
BufferedImage image = this.currentPDPage.convertToImage(BufferedImage.TYPE_INT_RGB, res);
888-
Graphics2D g = (Graphics2D) image.getGraphics();
889-
890-
g.setStroke(new BasicStroke(2f));
891-
int i = 0;
892-
893-
for (Shape s : shapes) {
894-
g.setColor(COLORS[(i++) % 5]);
895-
g.draw(s);
896-
}
897-
898-
String debugFileOut = this.currentDoc.getAbsolutePath().replace(".pdf", "-" + this.currentPage.getPageNumber() + ".jpg");
899-
900-
ImageIOUtil.writeImage(image, debugFileOut, res);
901-
} catch (IOException e) {
902-
}
903-
}
904856
}

src/main/java/technology/tabula/detectors/SpreadsheetDetectionAlgorithm.java

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@
2020
*/
2121
public class SpreadsheetDetectionAlgorithm implements DetectionAlgorithm {
2222
@Override
23-
public List<Rectangle> detect(Page page, File referenceDocument) {
23+
public List<Rectangle> detect(Page page) {
2424
List<Cell> cells = SpreadsheetExtractionAlgorithm.findCells(page.getHorizontalRulings(), page.getVerticalRulings());
2525

2626
SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();

src/test/java/technology/tabula/TestTableDetection.java

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -181,7 +181,7 @@ public void testDetectionOfTables() throws Exception {
181181
PageIterator pages = extractor.extract();
182182
while (pages.hasNext()) {
183183
Page page = pages.next();
184-
List<Rectangle> tablesOnPage = detectionAlgorithm.detect(page, this.pdf);
184+
List<Rectangle> tablesOnPage = detectionAlgorithm.detect(page);
185185
if (tablesOnPage.size() > 0) {
186186
detectedTables.put(new Integer(page.getPageNumber()), tablesOnPage);
187187
}

0 commit comments

Comments
 (0)