Skip to content

Commit 0987d7c

Browse files
Asheesh Rana authored and jazzido committed
1942-update-tabula-java-for-multi-column-pdf (PolicyReporter/requests#1942)
- Allow multiple occurrences of -a parameter
- Allow -a parameter to accept % values as well as absolute values
- Add test cases
- Add test files
1 parent 1c121b6 commit 0987d7c

6 files changed

Lines changed: 154 additions & 13 deletions

File tree

src/main/java/technology/tabula/CommandLineApp.java

Lines changed: 33 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,10 @@
66
import java.io.FileWriter;
77
import java.io.IOException;
88
import java.util.ArrayList;
9+
import java.util.HashMap;
10+
import java.util.Iterator;
911
import java.util.List;
12+
import java.util.Map;
1013

1114
import org.apache.commons.cli.CommandLine;
1215
import org.apache.commons.cli.CommandLineParser;
@@ -34,15 +37,16 @@ public class CommandLineApp {
3437
private static String BANNER = "\nTabula helps you extract tables from PDFs\n\n";
3538

3639
private Appendable defaultOutput;
37-
private Rectangle pageArea;
40+
41+
private List<Pair<Integer, Rectangle>> pageAreas;
3842
private List<Integer> pages;
3943
private OutputFormat outputFormat;
4044
private String password;
4145
private TableExtractor tableExtractor;
4246

4347
public CommandLineApp(Appendable defaultOutput, CommandLine line) throws ParseException {
4448
this.defaultOutput = defaultOutput;
45-
this.pageArea = CommandLineApp.whichArea(line);
49+
this.pageAreas = CommandLineApp.whichAreas(line);
4650
this.pages = CommandLineApp.whichPages(line);
4751
this.outputFormat = CommandLineApp.whichOutputFormat(line);
4852
this.tableExtractor = CommandLineApp.createExtractor(line);
@@ -156,11 +160,13 @@ private void extractFile(File pdfFile, Appendable outFile) throws ParseException
156160
while (pageIterator.hasNext()) {
157161
Page page = pageIterator.next();
158162

159-
if (pageArea != null) {
160-
page = page.getArea(pageArea);
163+
if (pageAreas != null) {
164+
for (Pair<Integer, Rectangle> areaPair : pageAreas) {
165+
tables.addAll(tableExtractor.extractTables(page.getArea(areaPair.getRight(), areaPair.getLeft())));
166+
}
167+
} else {
168+
tables.addAll(tableExtractor.extractTables(page));
161169
}
162-
163-
tables.addAll(tableExtractor.extractTables(page));
164170
}
165171
writeTables(tables, outFile);
166172
} catch (IOException e) {
@@ -200,16 +206,28 @@ private static OutputFormat whichOutputFormat(CommandLine line) throws ParseExce
200206
}
201207
}
202208

203-
private static Rectangle whichArea(CommandLine line) throws ParseException {
209+
private static List<Pair<Integer, Rectangle>> whichAreas(CommandLine line) throws ParseException {
204210
if (!line.hasOption('a')) {
205211
return null;
206212
}
207-
208-
List<Float> f = parseFloatList(line.getOptionValue('a'));
209-
if (f.size() != 4) {
210-
throw new ParseException("area parameters must be top,left,bottom,right");
213+
214+
String[] optionValues = line.getOptionValues('a');
215+
216+
List<Pair<Integer, Rectangle>> areaList = new ArrayList<Pair<Integer, Rectangle>>();
217+
for (String optionValue: optionValues) {
218+
int areaCalculationMode = Page.ABSOLUTE_AREA_CALCULATION_MODE;
219+
int startIndex = 0;
220+
if (optionValue.startsWith("%")) {
221+
startIndex = 1;
222+
areaCalculationMode = Page.RELATIVE_AREA_CALCULATION_MODE;
223+
}
224+
List<Float> f = parseFloatList(optionValue.substring(startIndex));
225+
if (f.size() != 4) {
226+
throw new ParseException("area parameters must be top,left,bottom,right optionally preceded by %");
227+
}
228+
areaList.add(new Pair<Integer, Rectangle>(areaCalculationMode, new Rectangle(f.get(0), f.get(1), f.get(3) - f.get(1), f.get(2) - f.get(0))));
211229
}
212-
return new Rectangle(f.get(0), f.get(1), f.get(3) - f.get(1), f.get(2) - f.get(0));
230+
return areaList;
213231
}
214232

215233
private static List<Integer> whichPages(CommandLine line) throws ParseException {
@@ -307,7 +325,9 @@ public static Options buildOptions() {
307325
.build());
308326
o.addOption(Option.builder("a")
309327
.longOpt("area")
310-
.desc("Portion of the page to analyze (top,left,bottom,right). Example: --area 269.875,12.75,790.5,561. Default is entire page")
328+
.desc("-a/--area = Portion of the page to analyze. Accepts top,left,bottom,right . Example: --area 269.875,12.75,790.5,561. "
329+
+ "If all values are between 0-100 (inclusive) and preceded by '%', input will be taken as % of actual height or width of the page. "
330+
+ "Example: --area %0,0,100,50. To specify multiple areas, -a option should be repeated. Default is entire page")
311331
.hasArg()
312332
.argName("AREA")
313333
.build());

src/main/java/technology/tabula/Page.java

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ public class Page extends Rectangle {
2121
private RectangleSpatialIndex<TextElement> spatial_index;
2222
private PDPage pdPage;
2323

24+
public static final int RELATIVE_AREA_CALCULATION_MODE = 0;
25+
public static final int ABSOLUTE_AREA_CALCULATION_MODE = 1;
26+
2427
public Page(float top, float left, float width, float height, int rotation, int page_number, PDPage pdPage) {
2528
super(top, left, width, height);
2629
this.rotation = rotation;
@@ -48,6 +51,21 @@ public Page(float top, float left, float width, float height, int rotation, int
4851
}
4952

5053

54+
public Page getArea(float top, float left, float bottom, float right, int mode) {
55+
Rectangle area = new Rectangle(top, left, right - left, bottom - top);
56+
return getArea(area, mode);
57+
}
58+
59+
public Page getArea(Rectangle area, int mode) {
60+
Rectangle newArea = area;
61+
if (mode == RELATIVE_AREA_CALCULATION_MODE) {
62+
newArea = new Rectangle((float) (area.getTop() / 100 * getHeight()),
63+
(float) (area.getLeft() / 100 * getWidth()), (float) (area.getWidth() / 100 * getWidth()),
64+
(float) (area.getHeight() / 100 * getHeight()));
65+
}
66+
return getArea(newArea);
67+
}
68+
5169
public Page getArea(Rectangle area) {
5270
List<TextElement> t = getText(area);
5371
float min_char_width = 7;

src/main/java/technology/tabula/Pair.java

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
package technology.tabula;
2+
3+
public class Pair<L,R> {
4+
private final L left;
5+
private final R right;
6+
7+
public Pair(L left, R right) {
8+
this.left = left;
9+
this.right = right;
10+
}
11+
12+
public L getLeft() {
13+
return this.left;
14+
}
15+
16+
public R getRight() {
17+
return this.right;
18+
}
19+
}

src/test/java/technology/tabula/TestCommandLineApp.java

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,46 @@ public void testEncryptedWrongPassword() throws ParseException {
143143
});
144144
}
145145

146+
@Test
147+
public void testExtractWithMultiplePercentArea() throws ParseException, IOException {
148+
149+
String expectedCsv = UtilsForTesting.loadCsv("src/test/resources/technology/tabula/csv/MultiColumn.csv");
146150

151+
assertEquals(expectedCsv, this.csvFromCommandLineArgs(new String[]{
152+
"src/test/resources/technology/tabula/MultiColumn.pdf",
153+
"-p", "1", "-a",
154+
"%0,0,100,50", "-a",
155+
"%0,50,100,100", "-f",
156+
"CSV"
157+
}));
158+
}
159+
160+
@Test
161+
public void testExtractWithMultipleAbsoluteArea() throws ParseException, IOException {
162+
163+
String expectedCsv = UtilsForTesting.loadCsv("src/test/resources/technology/tabula/csv/MultiColumn.csv");
164+
165+
assertEquals(expectedCsv, this.csvFromCommandLineArgs(new String[]{
166+
"src/test/resources/technology/tabula/MultiColumn.pdf",
167+
"-p", "1", "-a",
168+
"0,0,451,212", "-a",
169+
"0,212,451,425", "-f",
170+
"CSV"
171+
}));
172+
}
173+
174+
@Test
175+
public void testExtractWithPercentAndAbsoluteArea() throws ParseException, IOException {
176+
177+
String expectedCsv = UtilsForTesting.loadCsv("src/test/resources/technology/tabula/csv/MultiColumn.csv");
178+
179+
assertEquals(expectedCsv, this.csvFromCommandLineArgs(new String[]{
180+
"src/test/resources/technology/tabula/MultiColumn.pdf",
181+
"-p", "1", "-a",
182+
"%0,0,100,50", "-a",
183+
"0,212,451,425", "-f",
184+
"CSV"
185+
}));
186+
}
147187

148188
}

src/test/resources/technology/tabula/MultiColumn.pdf

8.14 KB
Binary file not shown.

src/test/resources/technology/tabula/csv/MultiColumn.csv

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
1,100,200
2+
2,101,201
3+
3,102,202
4+
4,103,203
5+
5,104,204
6+
6,105,205
7+
7,106,206
8+
8,107,207
9+
9,108,208
10+
10,109,209
11+
11,110,210
12+
12,111,211
13+
13,112,212
14+
14,113,213
15+
15,114,214
16+
16,115,215
17+
17,116,216
18+
18,117,217
19+
19,118,218
20+
20,119,219
21+
21,120,220
22+
22,121,221
23+
23,122,222
24+
24,123,223
25+
25,124,224
26+
26,125,225
27+
27,126,226
28+
28,127,227
29+
29,128,228
30+
30,129,229
31+
31,130,230
32+
32,131,231
33+
33,132,232
34+
34,133,233
35+
35,134,234
36+
36,135,235
37+
37,136,236
38+
38,137,237
39+
39,138,238
40+
40,139,239
41+
41,140,240
42+
42,141,241
43+
43,142,242
44+
44,143,243

0 commit comments

Comments
 (0)