@@ -101,34 +101,17 @@ public RelevantEdges(int edgeType, int edgeCount) {
101101 }
102102 }
103103
104- // for debugging
105- private File currentDoc ;
106- private Page currentPage ;
107- private PDPage currentPDPage ;
108-
109104 @ Override
110- public List <Rectangle > detect (Page page , File referenceDocument ) {
111-
112- // open a PDDocument to read stuff in
113- PDDocument pdfDocument ;
114- try {
115- pdfDocument = PDDocument .load (referenceDocument );
116- } catch (Exception e ) {
117- return new ArrayList <Rectangle >();
118- }
119-
120- // get the page in question (and keep refs for debugging)
121- this .currentPDPage = (PDPage ) pdfDocument .getDocumentCatalog ().getAllPages ().get (page .getPageNumber () - 1 );
122- this .currentDoc = referenceDocument ;
123- this .currentPage = page ;
105+ public List <Rectangle > detect (Page page ) {
124106
125107 // get horizontal & vertical lines
126108 // we get these from an image of the PDF and not the PDF itself because sometimes there are invisible PDF
127109 // instructions that are interpreted incorrectly as visible elements - we really want to capture what a
128110 // person sees when they look at the PDF
129111 BufferedImage image ;
112+ PDPage pdfPage = page .getPDPage ();
130113 try {
131- image = this . currentPDPage .convertToImage (BufferedImage .TYPE_BYTE_GRAY , 144 );
114+ image = pdfPage .convertToImage (BufferedImage .TYPE_BYTE_GRAY , 144 );
132115 } catch (IOException e ) {
133116 return new ArrayList <Rectangle >();
134117 }
@@ -137,8 +120,8 @@ public List<Rectangle> detect(Page page, File referenceDocument) {
137120
138121 // now check the page for vertical lines, but remove the text first to make things less confusing
139122 try {
140- this .removeText (pdfDocument , this . currentPDPage );
141- image = this . currentPDPage .convertToImage (BufferedImage .TYPE_BYTE_GRAY , 144 );
123+ this .removeText (pdfPage );
124+ image = pdfPage .convertToImage (BufferedImage .TYPE_BYTE_GRAY , 144 );
142125 } catch (Exception e ) {
143126 return new ArrayList <Rectangle >();
144127 }
@@ -323,10 +306,6 @@ public int compare(Rectangle o1, Rectangle o2) {
323306
324307 tableSet .addAll (tableAreas );
325308
326- this .currentDoc = null ;
327- this .currentPage = null ;
328- this .currentPDPage = null ;
329-
330309 return new ArrayList <Rectangle >(tableSet );
331310 }
332311
@@ -846,7 +825,7 @@ private List<Ruling> getVerticalRulings(BufferedImage image) {
846825 }
847826
848827 // taken from http://www.docjar.com/html/api/org/apache/pdfbox/examples/util/RemoveAllText.java.html
849- private void removeText (PDDocument document , PDPage page ) throws IOException {
828+ private void removeText (PDPage page ) throws IOException {
850829 PDFStreamParser parser = new PDFStreamParser (page .getContents ());
851830 parser .parse ();
852831
@@ -865,40 +844,13 @@ private void removeText(PDDocument document, PDPage page) throws IOException {
865844 newTokens .add (token );
866845 }
867846
847+ PDDocument document = new PDDocument ();
848+ document .addPage (page );
849+
868850 PDStream newContents = new PDStream (document );
869851 ContentStreamWriter writer = new ContentStreamWriter (newContents .createOutputStream ());
870852 writer .writeTokens (newTokens );
871853 newContents .addCompression ();
872854 page .setContents (newContents );
873855 }
874-
875- private void debug (Collection <? extends Shape > shapes ) {
876- this .debug (shapes , false );
877- }
878-
879- private void debug (Collection <? extends Shape > shapes , boolean twox ) {
880- Color [] COLORS = { new Color (27 , 158 , 119 ),
881- new Color (217 , 95 , 2 ), new Color (117 , 112 , 179 ),
882- new Color (231 , 41 , 138 ), new Color (102 , 166 , 30 ) };
883-
884- try {
885- int res = twox ? 144 : 72 ;
886-
887- BufferedImage image = this .currentPDPage .convertToImage (BufferedImage .TYPE_INT_RGB , res );
888- Graphics2D g = (Graphics2D ) image .getGraphics ();
889-
890- g .setStroke (new BasicStroke (2f ));
891- int i = 0 ;
892-
893- for (Shape s : shapes ) {
894- g .setColor (COLORS [(i ++) % 5 ]);
895- g .draw (s );
896- }
897-
898- String debugFileOut = this .currentDoc .getAbsolutePath ().replace (".pdf" , "-" + this .currentPage .getPageNumber () + ".jpg" );
899-
900- ImageIOUtil .writeImage (image , debugFileOut , res );
901- } catch (IOException e ) {
902- }
903- }
904856}
0 commit comments