Add an optional line color filter for lattice ruling detection.

A new --line-color-filter COLOR option (hexadecimal, 000000..FFFFFF) is parsed by
CommandLineApp and passed through ObjectExtractor to ObjectExtractorStreamEngine,
which discards stroked or filled paths whose RGB color differs from the filter.
Without the option every color is accepted, as before.

diff --git a/src/main/java/technology/tabula/CommandLineApp.java b/src/main/java/technology/tabula/CommandLineApp.java
index 0228df4b..c153b3a5 100644
--- a/src/main/java/technology/tabula/CommandLineApp.java
+++ b/src/main/java/technology/tabula/CommandLineApp.java
@@ -44,6 +44,7 @@ public class CommandLineApp {
     private OutputFormat outputFormat;
     private String password;
     private TableExtractor tableExtractor;
+    private Integer lineColorFilter;
 
     public CommandLineApp(Appendable defaultOutput, CommandLine line) throws ParseException {
         this.defaultOutput = defaultOutput;
@@ -51,6 +52,7 @@ public CommandLineApp(Appendable defaultOutput, CommandLine line) throws ParseEx
         this.pages = CommandLineApp.whichPages(line);
         this.outputFormat = CommandLineApp.whichOutputFormat(line);
         this.tableExtractor = CommandLineApp.createExtractor(line);
+        this.lineColorFilter = CommandLineApp.whichLineColorFilter(line);
 
         if (line.hasOption('s')) {
             this.password = line.getOptionValue('s');
@@ -195,7 +197,7 @@ private void extractFile(File pdfFile, Appendable outFile) throws ParseException
     }
 
     private PageIterator getPageIterator(PDDocument pdfDocument) throws IOException {
-        ObjectExtractor extractor = new ObjectExtractor(pdfDocument);
+        ObjectExtractor extractor = new ObjectExtractor(pdfDocument, lineColorFilter);
         return (pages == null) ?
                 extractor.extract() :
                 extractor.extract(pages);
@@ -260,6 +262,30 @@ private static ExtractionMethod whichExtractionMethod(CommandLine line) {
         return ExtractionMethod.DECIDE;
     }
 
+    /**
+     * Reads the optional {@code --line-color-filter} argument.
+     *
+     * @param line parsed command line
+     * @return the color as a packed RGB value (0x000000-0xFFFFFF), or {@code null} if the option is absent
+     * @throws ParseException if the value is not hexadecimal or falls outside the 24-bit range
+     */
+    private static Integer whichLineColorFilter(CommandLine line) throws ParseException {
+        if (!line.hasOption("line-color-filter")) {
+            return null;
+        }
+
+        int result;
+        try {
+            result = Integer.parseInt(line.getOptionValue("line-color-filter"), 16);
+        } catch (NumberFormatException e) {
+            throw new ParseException("line-color-filter parameter must be a hexadecimal number");
+        }
+        if (result < 0 || result > 0xFFFFFF) {
+            throw new ParseException("line-color-filter parameter must be between 000000 and FFFFFF");
+        }
+        return result;
+    }
+
     private static TableExtractor createExtractor(CommandLine line) throws ParseException {
         TableExtractor extractor = new TableExtractor();
         extractor.setGuess(line.hasOption('g'));
@@ -358,6 +384,12 @@ public static Options buildOptions() {
                 .hasArg()
                 .argName("PAGES")
                 .build());
+        o.addOption(Option.builder(null)
+                .longOpt("line-color-filter")
+                .desc("Only consider lines of this color to be lattice lines. Example: --line-color-filter DEADBE .")
+                .hasArg()
+                .argName("COLOR")
+                .build());
 
         return o;
     }
diff --git a/src/main/java/technology/tabula/ObjectExtractor.java b/src/main/java/technology/tabula/ObjectExtractor.java
index 9f3f6a03..358d24d0 100644
--- a/src/main/java/technology/tabula/ObjectExtractor.java
+++ b/src/main/java/technology/tabula/ObjectExtractor.java
@@ -8,9 +8,20 @@ public class ObjectExtractor implements java.io.Closeable {
 
 
     private final PDDocument pdfDocument;
+    private final Integer lineColorFilter;
 
     public ObjectExtractor(PDDocument pdfDocument) {
+        this(pdfDocument, null);
+    }
+
+    /**
+     * @param pdfDocument     document to extract pages from
+     * @param lineColorFilter packed RGB color a path must have to be kept as a ruling,
+     *                        or {@code null} to accept every color
+     */
+    public ObjectExtractor(PDDocument pdfDocument, Integer lineColorFilter) {
         this.pdfDocument = pdfDocument;
+        this.lineColorFilter = lineColorFilter;
     }
 
     // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
@@ -20,7 +31,7 @@ protected Page extractPage(Integer pageNumber) throws IOException {
         }
 
         PDPage page = pdfDocument.getPage(pageNumber - 1);
-        ObjectExtractorStreamEngine streamEngine = new ObjectExtractorStreamEngine(page);
+        ObjectExtractorStreamEngine streamEngine = new ObjectExtractorStreamEngine(page, lineColorFilter);
         streamEngine.processPage(page);
 
         TextStripper textStripper = new TextStripper(pdfDocument, pageNumber);
diff --git a/src/main/java/technology/tabula/ObjectExtractorStreamEngine.java b/src/main/java/technology/tabula/ObjectExtractorStreamEngine.java
index 9907eca1..d1fb45bb 100644
--- a/src/main/java/technology/tabula/ObjectExtractorStreamEngine.java
+++ b/src/main/java/technology/tabula/ObjectExtractorStreamEngine.java
@@ -7,6 +7,7 @@
 import java.awt.geom.PathIterator;
 import java.awt.geom.Point2D;
 import java.awt.geom.Rectangle2D;
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Comparator;
 import java.util.List;
@@ -15,7 +16,9 @@
 import org.apache.pdfbox.cos.COSName;
 import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.common.PDRectangle;
+import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
 import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
+import org.apache.pdfbox.pdmodel.graphics.state.PDGraphicsState;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -23,6 +26,8 @@
 
 class ObjectExtractorStreamEngine extends PDFGraphicsStreamEngine {
 
+    // Packed RGB color a path must have to be kept as a ruling; null disables the filter.
+    private final Integer lineColorFilter;
     protected List rulings;
     private AffineTransform pageTransform;
     private boolean extractRulingLines = true;
@@ -32,8 +37,9 @@ class ObjectExtractorStreamEngine extends PDFGraphicsStreamEngine {
 
     private static final float RULING_MINIMUM_LENGTH = 0.01f;
 
-    protected ObjectExtractorStreamEngine(PDPage page) {
+    protected ObjectExtractorStreamEngine(PDPage page, Integer lineColorFilter) {
         super(page);
+        this.lineColorFilter = lineColorFilter;
         logger = LoggerFactory.getLogger(ObjectExtractorStreamEngine.class);
         rulings = new ArrayList<>();
 
@@ -130,16 +136,11 @@ public void strokePath() {
     }
 
     private void strokeOrFillPath(boolean isFill) {
-        if (!extractRulingLines) {
+        if (!extractRulingLines || filterPathByColor(isFill) || filterPathBySegmentType()) {
             currentPath.reset();
             return;
         }
 
-        boolean didNotPassedTheFilter = filterPathBySegmentType();
-        if (didNotPassedTheFilter) return;
-
-        // TODO: how to implement color filter?
-
         // Skip the first path operation and save it as the starting point.
         PathIterator pathIterator = currentPath.getPathIterator(getPageTransform());
 
@@ -191,6 +192,34 @@ private void strokeOrFillPath(boolean isFill) {
         currentPath.reset();
     }
 
+    /**
+     * Tells whether the current path must be dropped because of its color.
+     *
+     * @param isFill {@code true} to test the non-stroking (fill) color,
+     *               {@code false} to test the stroking color
+     * @return {@code true} if a filter is set and the path's RGB value differs from it;
+     *         {@code false} if no filter is set or the color cannot be converted to RGB
+     */
+    private boolean filterPathByColor(boolean isFill) {
+        if (lineColorFilter == null) {
+            return false;
+        }
+
+        try {
+            PDGraphicsState state = getGraphicsState();
+            PDColor currentColor = isFill ? state.getNonStrokingColor() : state.getStrokingColor();
+            return currentColor.toRGB() != lineColorFilter.intValue();
+        } catch (IOException e) {
+            // TODO: decide whether a color that cannot be converted should pass the filter;
+            // for now the path is kept.
+            logger.warn("Color conversion failed", e);
+            return false;
+        } catch (IllegalStateException e) {
+            logger.warn("Cannot convert pattern color", e);
+            return false;
+        }
+    }
+
     private boolean filterPathBySegmentType() {
         PathIterator pathIterator = currentPath.getPathIterator(pageTransform);
         float[] coordinates = new float[6];
diff --git a/src/test/java/technology/tabula/TestObjectExtractor.java b/src/test/java/technology/tabula/TestObjectExtractor.java
index 9db7ad18..d2aa870a 100644
--- a/src/test/java/technology/tabula/TestObjectExtractor.java
+++ b/src/test/java/technology/tabula/TestObjectExtractor.java
@@ -83,6 +83,19 @@ public void testShouldDetectRulings() throws IOException {
         }
     }
 
+    @Test
+    public void testShouldFilterRulingsByColor() throws IOException {
+        PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/should_filter_rulings_by_color.pdf"));
+        try (ObjectExtractor oe = new ObjectExtractor(pdf_document, 0x000000)) {
+            PageIterator pi = oe.extract();
+
+            Page page = pi.next();
+            List rulings = page.getRulings();
+
+            assertEquals(7, rulings.size());
+        }
+    }
+
     @Test
     public void testDontThrowNPEInShfill() throws IOException {
         PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/labor.pdf"));
diff --git a/src/test/resources/technology/tabula/should_filter_rulings_by_color.pdf b/src/test/resources/technology/tabula/should_filter_rulings_by_color.pdf
new file mode 100644
index 00000000..639a1aa8
Binary files /dev/null and b/src/test/resources/technology/tabula/should_filter_rulings_by_color.pdf differ