null
a new instance of {@link PdfOcrFontProvider} is
+ * null
a new instance of {@link PdfOcrFontProvider} is
* returned.
* @return {@link com.itextpdf.layout.font.FontProvider} object
*/
diff --git a/pdfocr-tesseract4/pom.xml b/pdfocr-tesseract4/pom.xml
index c09cdfe..4afa904 100644
--- a/pdfocr-tesseract4/pom.xml
+++ b/pdfocr-tesseract4/pom.xml
@@ -5,7 +5,7 @@
com.itextpdf
pdfocr-root
- 1.0.0
+ 1.0.1
pdfocr-tesseract4
diff --git a/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/ReflectionUtils.java b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/ReflectionUtils.java
index 07c1c53..133436e 100644
--- a/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/ReflectionUtils.java
+++ b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/ReflectionUtils.java
@@ -22,58 +22,25 @@ This file is part of the iText (R) project.
*/
package com.itextpdf.pdfocr.tesseract4;
-import com.itextpdf.io.util.MessageFormatUtil;
import com.itextpdf.kernel.Version;
-import com.itextpdf.kernel.counter.ContextManager;
-import java.lang.reflect.AccessibleObject;
import java.lang.reflect.Array;
import java.lang.reflect.Constructor;
import java.lang.reflect.Method;
import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.Map;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
final class ReflectionUtils {
- private static final Logger logger = LoggerFactory.getLogger(ReflectionUtils.class);
-
- private static final String KERNEL_PACKAGE = "com.itextpdf.kernel.";
private static final String LICENSEKEY_PACKAGE = "com.itextpdf.licensekey.";
- private static final String CONTEXT_MANAGER = "counter.ContextManager";
private static final String LICENSEKEY = "LicenseKey";
private static final String LICENSEKEY_PRODUCT = "LicenseKeyProduct";
private static final String LICENSEKEY_FEATURE = "LicenseKeyProductFeature";
- private static final String REGISTER_GENERIC_CONTEXT = "registerGenericContext";
private static final String SCHEDULED_CHECK = "scheduledCheck";
private static final String NO_PDFOCR_TESSERACT4 = "No license loaded for product pdfOcr-Tesseract4. Please use LicenseKey.loadLicense(...) to load one.";
- private static Map> cachedClasses = new HashMap<>();
- private static Map cachedMethods = new HashMap<>();
-
- static {
- try {
- ContextManager contextManager = ContextManager.getInstance();
- callMethod(KERNEL_PACKAGE + CONTEXT_MANAGER, REGISTER_GENERIC_CONTEXT, contextManager,
- new Class[] {Collection.class, Collection.class},
- Collections.singletonList("com.itextpdf.pdfocr"),
- Collections.singletonList("com.itextpdf.pdfocr.tesseract4"));
- callMethod(KERNEL_PACKAGE + CONTEXT_MANAGER, REGISTER_GENERIC_CONTEXT, contextManager,
- new Class[] {Collection.class, Collection.class},
- Collections.singletonList("com.itextpdf.pdfocr.tesseract4"),
- Collections.singletonList("com.itextpdf.pdfocr.tesseract4"));
- } catch (Exception e) {
- logger.error(e.getMessage());
- }
- }
-
private ReflectionUtils() {
}
@@ -116,52 +83,6 @@ public static void scheduledCheck() {
}
}
- private static Object callMethod(String className, String methodName, Object target, Class[] parameterTypes,
- Object... args) {
- try {
- Method method = findMethod(className, methodName, parameterTypes);
- return method.invoke(target, args);
- } catch (NoSuchMethodException e) {
- logger.warn(MessageFormatUtil.format("Cannot find method {0} for class {1}", methodName, className));
- } catch (ClassNotFoundException e) {
- logger.warn(MessageFormatUtil.format("Cannot find class {0}", className));
- } catch (IllegalArgumentException e) {
- logger.warn(MessageFormatUtil
- .format("Illegal arguments passed to {0}#{1} method call: {2}", className, methodName,
- e.getMessage()));
- } catch (Exception e) {
- // Converting checked exceptions to unchecked RuntimeException (java-specific comment).
- //
- // If kernel utils throws an exception at this point, we consider it as unrecoverable situation for
- // its callers (pdfOcr methods).
- // It's might be more suitable to wrap checked exceptions at a bit higher level, but we do it here for
- // the sake of convenience.
- throw new RuntimeException(e.toString(), e);
- }
- return null;
- }
-
- private static Method findMethod(String className, String methodName, Class[] parameterTypes)
- throws NoSuchMethodException, ClassNotFoundException {
- MethodSignature tm = new MethodSignature(className, parameterTypes, methodName);
- Method m = (Method) cachedMethods.get(tm);
- if (m == null) {
- m = findClass(className).getDeclaredMethod(methodName, parameterTypes);
- m.setAccessible(true);
- cachedMethods.put(tm, m);
- }
- return m;
- }
-
- private static Class> findClass(String className) throws ClassNotFoundException {
- Class> c = cachedClasses.get(className);
- if (c == null) {
- c = getClass(className);
- cachedClasses.put(className, c);
- }
- return c;
- }
-
private static Class> getClass(String className) throws ClassNotFoundException {
return Class.forName(className);
}
diff --git a/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4LogMessageConstant.java b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4LogMessageConstant.java
index 90bf76b..7f028c5 100644
--- a/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4LogMessageConstant.java
+++ b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4LogMessageConstant.java
@@ -59,7 +59,10 @@ public class Tesseract4LogMessageConstant {
+ "temporary directory: {0}";
public static final String CANNOT_CONVERT_IMAGE_TO_PIX =
"Cannot convert image to pix: {0}";
+ public static final String CANNOT_PARSE_NODE_BBOX =
+ "Cannot parse node BBox, defaults to 0, 0, 0, 0. Node: {0}";
+
private Tesseract4LogMessageConstant() {
}
-}
+}
\ No newline at end of file
diff --git a/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TesseractHelper.java b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TesseractHelper.java
index c700f25..d397625 100644
--- a/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TesseractHelper.java
+++ b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TesseractHelper.java
@@ -28,6 +28,7 @@ This file is part of the iText (R) project.
import com.itextpdf.styledxmlparser.jsoup.Jsoup;
import com.itextpdf.styledxmlparser.jsoup.nodes.Document;
import com.itextpdf.styledxmlparser.jsoup.nodes.Element;
+import com.itextpdf.styledxmlparser.jsoup.nodes.Node;
import com.itextpdf.styledxmlparser.jsoup.select.Elements;
import java.io.File;
@@ -60,6 +61,27 @@ public class TesseractHelper {
private static final Logger LOGGER = LoggerFactory
.getLogger(TesseractHelper.class);
+ /**
+ * Patterns for matching hOCR element bboxes.
+ */
+ private static final Pattern BBOX_PATTERN = Pattern.compile(".*bbox(\\s+\\d+){4}.*");
+ private static final Pattern BBOX_COORDINATE_PATTERN = Pattern
+ .compile(
+ ".*\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+).*");
+
+ /**
+ * Indices in array representing bbox.
+ */
+ private static final int LEFT_IDX = 0;
+ private static final int BOTTOM_IDX = 1;
+ private static final int RIGHT_IDX = 2;
+ private static final int TOP_IDX = 3;
+
+ /**
+ * Size of the array containing bbox.
+ */
+ private static final int BBOX_ARRAY_SIZE = 4;
+
/**
* Creates a new {@link TesseractHelper} instance.
*/
@@ -86,12 +108,13 @@ public static Map> parseHocrFile(
throws IOException {
Map> imageData =
new LinkedHashMap>();
+ Map unparsedBBoxes = new LinkedHashMap<>();
for (File inputFile : inputFiles) {
if (inputFile != null
&& Files.exists(
- java.nio.file.Paths
- .get(inputFile.getAbsolutePath()))) {
+ java.nio.file.Paths
+ .get(inputFile.getAbsolutePath()))) {
FileInputStream fileInputStream =
new FileInputStream(inputFile.getAbsolutePath());
Document doc = Jsoup.parse(fileInputStream,
@@ -99,10 +122,6 @@ public static Map> parseHocrFile(
inputFile.getAbsolutePath());
Elements pages = doc.getElementsByClass("ocr_page");
- Pattern bboxPattern = Pattern.compile(".*bbox(\\s+\\d+){4}.*");
- Pattern bboxCoordinatePattern = Pattern
- .compile(
- ".*\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+).*");
List searchedClasses = TextPositioning.BY_LINES
.equals(textPositioning)
? Arrays.asList("ocr_line", "ocr_caption")
@@ -124,26 +143,11 @@ public static Map> parseHocrFile(
}
}
for (Element obj : objects) {
- String value = obj.attr("title");
- Matcher bboxMatcher = bboxPattern.matcher(value);
- if (bboxMatcher.matches()) {
- Matcher bboxCoordinateMatcher =
- bboxCoordinatePattern
- .matcher(bboxMatcher.group());
- if (bboxCoordinateMatcher.matches()) {
- List coordinates =
- new ArrayList();
- for (int i = 0; i < 4; i++) {
- String coord = bboxCoordinateMatcher
- .group(i + 1);
- coordinates
- .add(Float.parseFloat(coord));
- }
-
- textData.add(new TextInfo(obj.text(),
- coordinates));
- }
- }
+ List coordinates = getAlignedBBox(obj,
+ textPositioning,
+ unparsedBBoxes);
+ textData.add(new TextInfo(obj.text(),
+ coordinates));
}
}
if (textData.size() > 0) {
@@ -157,9 +161,97 @@ public static Map> parseHocrFile(
fileInputStream.close();
}
}
+ for (Node node : unparsedBBoxes.values()) {
+ LOGGER.warn(MessageFormatUtil.format(
+ Tesseract4LogMessageConstant.CANNOT_PARSE_NODE_BBOX,
+ node.toString()
+ ));
+ }
return imageData;
}
+ /**
+ * Gets the bbox of the element and, if needed, aligns it with the bbox of its parent line.
+ */
+ static List getAlignedBBox(Element object,
+ TextPositioning textPositioning,
+ Map unparsedBBoxes) {
+ final List coordinates = parseBBox(object, unparsedBBoxes);
+ if (TextPositioning.BY_WORDS_AND_LINES == textPositioning
+ || TextPositioning.BY_WORDS == textPositioning) {
+ Node line = object.parent();
+ final List lineCoordinates = parseBBox(line, unparsedBBoxes);
+ if (TextPositioning.BY_WORDS_AND_LINES == textPositioning) {
+ coordinates.set(BOTTOM_IDX, lineCoordinates.get(BOTTOM_IDX));
+ coordinates.set(TOP_IDX, lineCoordinates.get(TOP_IDX));
+ }
+ detectAndFixBrokenBBoxes(object, coordinates,
+ lineCoordinates, unparsedBBoxes);
+ }
+ return coordinates;
+ }
+
+ /**
+ * Parses element bbox.
+ *
+ * @param node element containing bbox
+ * @param unparsedBBoxes map of element ids to nodes whose bboxes could not be parsed
+ * @return parsed bbox
+ */
+ static List parseBBox(Node node, Map unparsedBBoxes) {
+ List bbox = new ArrayList<>();
+ Matcher bboxMatcher = BBOX_PATTERN.matcher(node.attr("title"));
+ if (bboxMatcher.matches()) {
+ Matcher bboxCoordinateMatcher =
+ BBOX_COORDINATE_PATTERN
+ .matcher(bboxMatcher.group());
+ if (bboxCoordinateMatcher.matches()) {
+ for (int i = 0; i < BBOX_ARRAY_SIZE; i++) {
+ String coord = bboxCoordinateMatcher
+ .group(i + 1);
+ bbox.add(Float.parseFloat(coord));
+ }
+ }
+ }
+ if (bbox.size() == 0) {
+ bbox = Arrays.asList(0f, 0f, 0f, 0f);
+ String id = node.attr("id");
+ if (id != null && !unparsedBBoxes.containsKey(id)) {
+ unparsedBBoxes.put(id, node);
+ }
+ }
+ return bbox;
+ }
+
+ /**
+ * Sometimes an hOCR file contains broken character bboxes which are equal to the page bbox.
+ * This method attempts to detect and fix them.
+ */
+ static void detectAndFixBrokenBBoxes(Element object, List coordinates,
+ List lineCoordinates,
+ Map unparsedBBoxes) {
+ if (coordinates.get(LEFT_IDX) < lineCoordinates.get(LEFT_IDX)
+ || coordinates.get(LEFT_IDX) > lineCoordinates.get(RIGHT_IDX)) {
+ if (object.previousElementSibling() == null) {
+ coordinates.set(LEFT_IDX, lineCoordinates.get(LEFT_IDX));
+ } else {
+ Element sibling = object.previousElementSibling();
+ List siblingBBox = parseBBox(sibling, unparsedBBoxes);
+ coordinates.set(LEFT_IDX, siblingBBox.get(RIGHT_IDX));
+ }
+ }
+ if (coordinates.get(RIGHT_IDX) > lineCoordinates.get(RIGHT_IDX)
+ || coordinates.get(RIGHT_IDX) < lineCoordinates.get(LEFT_IDX)) {
+ if (object.nextElementSibling() == null) {
+ coordinates.set(RIGHT_IDX, lineCoordinates.get(RIGHT_IDX));
+ } else {
+ Element sibling = object.nextElementSibling();
+ List siblingBBox = parseBBox(sibling, unparsedBBoxes);
+ coordinates.set(RIGHT_IDX, siblingBBox.get(LEFT_IDX));
+ }
+ }
+ }
+
/**
* Deletes file using provided path.
*
@@ -208,7 +300,7 @@ static String readTxtFile(final File txtFile) {
* @param data text data in required format as {@link java.lang.String}
*/
static void writeToTextFile(final String path,
- final String data) {
+ final String data) {
try (Writer writer = new OutputStreamWriter(new FileOutputStream(path),
StandardCharsets.UTF_8)) {
writer.write(data);
@@ -228,7 +320,7 @@ static void writeToTextFile(final String path,
* @throws Tesseract4OcrException if provided command failed
*/
static void runCommand(final String execPath,
- final List paramsList) throws Tesseract4OcrException {
+ final List paramsList) throws Tesseract4OcrException {
try {
String params = String.join(" ", paramsList);
boolean cmdSucceeded = SystemUtil
@@ -251,4 +343,4 @@ static void runCommand(final String execPath,
.TESSERACT_FAILED);
}
}
-}
+}
\ No newline at end of file
diff --git a/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TextPositioning.java b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TextPositioning.java
index c8edb07..f660f7e 100644
--- a/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TextPositioning.java
+++ b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TextPositioning.java
@@ -39,5 +39,9 @@ public enum TextPositioning {
/**
* Text will be located by words retrieved from hocr file.
*/
- BY_WORDS
-}
+ BY_WORDS,
+ /**
+ * Similar to BY_WORDS mode, but top and bottom of word BBox are inherited from line.
+ */
+ BY_WORDS_AND_LINES,
+}
\ No newline at end of file
diff --git a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/IntegrationTestHelper.java b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/IntegrationTestHelper.java
index 394e3d6..0261e09 100644
--- a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/IntegrationTestHelper.java
+++ b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/IntegrationTestHelper.java
@@ -87,6 +87,8 @@ public class IntegrationTestHelper extends ExtendedITextTest {
// path to font for hindi
protected static final String NOTO_SANS_FONT_PATH = TEST_FONTS_DIRECTORY + "NotoSans-Regular.ttf";
+ // path to font for thai
+ protected static final String NOTO_SANS_THAI_FONT_PATH = TEST_FONTS_DIRECTORY + "NotoSansThai-Regular.ttf";
// path to font for japanese
protected static final String KOSUGI_FONT_PATH = TEST_FONTS_DIRECTORY + "Kosugi-Regular.ttf";
// path to font for chinese
@@ -101,13 +103,14 @@ public class IntegrationTestHelper extends ExtendedITextTest {
static {
Map fontPathToNameMap = new HashMap<>();
fontPathToNameMap.put(NOTO_SANS_FONT_PATH, "NotoSans");
+ fontPathToNameMap.put(NOTO_SANS_THAI_FONT_PATH, "NotoSansThai");
fontPathToNameMap.put(KOSUGI_FONT_PATH, "Kosugi");
fontPathToNameMap.put(NOTO_SANS_SC_FONT_PATH, "NotoSansSC");
fontPathToNameMap.put(CAIRO_FONT_PATH, "Cairo");
fontPathToNameMap.put(FREE_SANS_FONT_PATH, "FreeSans");
FONT_PATH_TO_FONT_NAME_MAP = Collections.unmodifiableMap(fontPathToNameMap);
}
-
+
public enum ReaderType {
LIB,
EXECUTABLE
@@ -164,7 +167,7 @@ protected static File getTessDataDirectory() {
* Retrieve text from specified page from given PDF document.
*/
protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader,
- File file, int page, List languages, List fonts) {
+ File file, int page, List languages, List fonts) {
String result = null;
String pdfPath = null;
try {
@@ -183,7 +186,7 @@ protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader,
* Retrieve text from specified page from given PDF document.
*/
protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader,
- File file, int page, List languages, String fontPath) {
+ File file, int page, List languages, String fontPath) {
return getTextFromPdf(tesseractReader, file, page, languages,
Collections.singletonList(fontPath));
}
@@ -192,7 +195,7 @@ protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader,
* Retrieve text from the first page of given PDF document setting font.
*/
protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader, File file,
- List languages, String fontPath) {
+ List languages, String fontPath) {
return getTextFromPdf(tesseractReader, file, 1, languages, fontPath);
}
@@ -200,7 +203,7 @@ protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader, Fil
* Retrieve text from the first page of given PDF document.
*/
protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader, File file,
- List languages) {
+ List languages) {
return getTextFromPdf(tesseractReader, file, 1, languages,
new ArrayList());
}
@@ -209,7 +212,7 @@ protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader, Fil
* Retrieve text from the required page of given PDF document.
*/
protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader, File file, int page,
- List languages) {
+ List languages) {
return getTextFromPdf(tesseractReader, file, page, languages, new ArrayList());
}
@@ -224,7 +227,7 @@ protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader, Fil
* Get text from layer specified by name from page.
*/
protected String getTextFromPdfLayer(String pdfPath, String layerName,
- int page, boolean useActualText) throws IOException {
+ int page, boolean useActualText) throws IOException {
PdfDocument pdfDocument = new PdfDocument(new PdfReader(pdfPath),
new DocumentProperties().setEventCountingMetaInfo(new PdfOcrMetaInfo()));
@@ -243,7 +246,7 @@ protected String getTextFromPdfLayer(String pdfPath, String layerName,
* Get text from layer specified by name from page.
*/
protected String getTextFromPdfLayer(String pdfPath, String layerName,
- int page) throws IOException {
+ int page) throws IOException {
return getTextFromPdfLayer(pdfPath, layerName, page, false);
}
@@ -253,7 +256,7 @@ protected String getTextFromPdfLayer(String pdfPath, String layerName,
* {@link LocationTextExtractionStrategy#getResultantText()}.
*/
protected String getTextFromPdfLayerUsingActualText(String pdfPath,
- String layerName, int page) throws IOException {
+ String layerName, int page) throws IOException {
return getTextFromPdfLayer(pdfPath, layerName, page, true)
.replace(" ", "");
}
@@ -378,7 +381,7 @@ protected void doOcrAndSavePdfToPath(
* (Text will be invisible)
*/
protected void doOcrAndSavePdfToPath(AbstractTesseract4OcrEngine tesseractReader, String imgPath,
- String pdfPath, List languages, List fonts) {
+ String pdfPath, List languages, List fonts) {
doOcrAndSavePdfToPath(tesseractReader, imgPath, pdfPath,
languages, fonts, null);
}
@@ -469,7 +472,7 @@ public void setImageBBoxRectangle(com.itextpdf.kernel.geom.Rectangle imageBBoxRe
@Override
protected boolean isChunkAtWordBoundary(TextChunk chunk,
- TextChunk previousChunk) {
+ TextChunk previousChunk) {
ITextChunkLocation curLoc = chunk.getLocation();
ITextChunkLocation prevLoc = previousChunk.getLocation();
@@ -522,4 +525,4 @@ else if (type.equals(EventType.RENDER_IMAGE)) {
: tagHierarchy.get(0).getProperties().get(PdfName.Name).toString();
}
}
-}
+}
\ No newline at end of file
diff --git a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/events/multithreading/MultiThreadingTest.java b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/events/multithreading/MultiThreadingTest.java
index 3a36dee..dc62ce7 100644
--- a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/events/multithreading/MultiThreadingTest.java
+++ b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/events/multithreading/MultiThreadingTest.java
@@ -69,7 +69,7 @@ public static void beforeClass() {
public void initTesseractProperties() {
Tesseract4OcrEngineProperties ocrEngineProperties =
new Tesseract4OcrEngineProperties();
- ocrEngineProperties.setPathToTessData(new File(sourceFolder + "../../tessdata/"));
+ ocrEngineProperties.setPathToTessData(new File(sourceFolder + "../../tessdata"));
tesseractReader.setTesseract4OcrEngineProperties(ocrEngineProperties);
}
@@ -94,9 +94,6 @@ public void testEventCountingPdfEvent() throws InterruptedException {
}
for (int i = 0; i < n; i++) {
threads[i].start();
-
- // The test will pass in sequential mode, i.e. if the following line is uncommented
- //threads[i].join();
}
for (int i = 0; i < n; i++) {
threads[i].join();
@@ -127,8 +124,8 @@ private static Thread getThread(DoImageOcrRunnable runnable) {
}
public static class TestEventCounter extends EventCounter {
- private List events = new ArrayList<>();
- private List metaInfos = new ArrayList<>();
+ private List events = new ArrayList();
+ private List metaInfos = new ArrayList();
public List getEvents() {
return events;
@@ -139,10 +136,9 @@ public List getMetaInfos() {
}
@Override
- protected void onEvent(IEvent event, IMetaInfo metaInfo) {
+ synchronized protected void onEvent(IEvent event, IMetaInfo metaInfo) {
this.events.add(event);
this.metaInfos.add(metaInfo);
}
}
-
}
diff --git a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tessdata/TessDataIntegrationLibTest.java b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tessdata/TessDataIntegrationLibTest.java
index 689a058..297036d 100644
--- a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tessdata/TessDataIntegrationLibTest.java
+++ b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tessdata/TessDataIntegrationLibTest.java
@@ -22,10 +22,16 @@ This file is part of the iText (R) project.
*/
package com.itextpdf.pdfocr.tessdata;
+import com.itextpdf.kernel.colors.DeviceRgb;
+import com.itextpdf.kernel.utils.CompareTool;
+import com.itextpdf.pdfocr.PdfOcrLogMessageConstant;
import com.itextpdf.pdfocr.TextInfo;
import com.itextpdf.pdfocr.tesseract4.OutputFormat;
+import com.itextpdf.pdfocr.tesseract4.Tesseract4OcrEngineProperties;
import com.itextpdf.pdfocr.tesseract4.TesseractHelper;
import com.itextpdf.pdfocr.tesseract4.TextPositioning;
+import com.itextpdf.test.annotations.LogMessage;
+import com.itextpdf.test.annotations.LogMessages;
import com.itextpdf.test.annotations.type.IntegrationTest;
import org.junit.Assert;
@@ -33,6 +39,7 @@ This file is part of the iText (R) project.
import org.junit.experimental.categories.Category;
import java.io.File;
+import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
@@ -43,7 +50,7 @@ public TessDataIntegrationLibTest() {
super(ReaderType.LIB);
}
- @Test(timeout = 50000)
+ @Test(timeout = 60000)
public void textOutputFromHalftoneFile() {
String imgPath = TEST_IMAGES_DIRECTORY + "halftone.jpg";
String expected01 = "Silliness Enablers";
@@ -59,7 +66,7 @@ public void textOutputFromHalftoneFile() {
Assert.assertTrue(result.contains(expected03));
}
- @Test(timeout = 50000)
+ @Test(timeout = 60000)
public void hocrOutputFromHalftoneFile() throws java.io.IOException {
String path = TEST_IMAGES_DIRECTORY + "halftone.jpg";
String expected01 = "Silliness";
@@ -97,6 +104,69 @@ public void hocrOutputFromHalftoneFile() throws java.io.IOException {
Assert.assertTrue(findTextInPageData(pageData, 1, expected09));
}
+ @Test
+ public void compareInvoiceFrontThaiImage() throws InterruptedException, java.io.IOException {
+ String testName = "compareInvoiceFrontThaiImage";
+ String filename = "invoice_front_thai";
+
+ //Tesseract for Java and Tesseract for .NET give different output
+ //So we cannot use one reference pdf file for them
+ String expectedPdfPathJava = TEST_DOCUMENTS_DIRECTORY + filename + "_" + testFileTypeName + "_java.pdf";
+ String expectedPdfPathDotNet = TEST_DOCUMENTS_DIRECTORY + filename + "_" + testFileTypeName + "_dotnet.pdf";
+
+ String resultPdfPath = getTargetDirectory() + filename + "_" + testName + "_" + testFileTypeName + ".pdf";
+
+ Tesseract4OcrEngineProperties properties =
+ tesseractReader.getTesseract4OcrEngineProperties();
+ properties.setTextPositioning(TextPositioning.BY_WORDS_AND_LINES);
+ properties.setPathToTessData(getTessDataDirectory());
+ properties.setLanguages(Arrays.asList("tha", "eng"));
+ tesseractReader.setTesseract4OcrEngineProperties(properties);
+
+ doOcrAndSavePdfToPath(tesseractReader,
+ TEST_IMAGES_DIRECTORY + filename + ".jpg", resultPdfPath,
+ Arrays.asList("tha", "eng"), Arrays.asList(NOTO_SANS_THAI_FONT_PATH, NOTO_SANS_FONT_PATH), DeviceRgb.RED);
+ boolean javaTest = new CompareTool().compareByContent(resultPdfPath, expectedPdfPathJava,
+ TEST_DOCUMENTS_DIRECTORY, "diff_") == null;
+ boolean dotNetTest = new CompareTool().compareByContent(resultPdfPath, expectedPdfPathDotNet,
+ TEST_DOCUMENTS_DIRECTORY, "diff_") == null;
+
+ Assert.assertTrue(javaTest || dotNetTest);
+ }
+
+ @LogMessages(messages = {
+ @LogMessage(messageTemplate = PdfOcrLogMessageConstant.COULD_NOT_FIND_CORRESPONDING_GLYPH_TO_UNICODE_CHARACTER, count = 2)
+ })
+ @Test
+ public void compareThaiTextImage() throws InterruptedException, java.io.IOException {
+ String testName = "compareThaiTextImage";
+ String filename = "thai_01";
+
+ //Tesseract for Java and Tesseract for .NET give different output
+ //So we cannot use one reference pdf file for them
+ String expectedPdfPathJava = TEST_DOCUMENTS_DIRECTORY + filename + "_" + testFileTypeName + "_java.pdf";
+ String expectedPdfPathDotNet = TEST_DOCUMENTS_DIRECTORY + filename + "_" + testFileTypeName + "_dotnet.pdf";
+
+ String resultPdfPath = getTargetDirectory() + filename + "_" + testName + "_" + testFileTypeName + ".pdf";
+
+ Tesseract4OcrEngineProperties properties =
+ tesseractReader.getTesseract4OcrEngineProperties();
+ properties.setTextPositioning(TextPositioning.BY_WORDS_AND_LINES);
+ properties.setPathToTessData(getTessDataDirectory());
+ properties.setLanguages(Arrays.asList("tha"));
+ tesseractReader.setTesseract4OcrEngineProperties(properties);
+
+ doOcrAndSavePdfToPath(tesseractReader,
+ TEST_IMAGES_DIRECTORY + filename + ".jpg", resultPdfPath,
+ Arrays.asList("tha"), Arrays.asList(NOTO_SANS_THAI_FONT_PATH), DeviceRgb.RED);
+ boolean javaTest = new CompareTool().compareByContent(resultPdfPath, expectedPdfPathJava,
+ TEST_DOCUMENTS_DIRECTORY, "diff_") == null;
+ boolean dotNetTest = new CompareTool().compareByContent(resultPdfPath, expectedPdfPathDotNet,
+ TEST_DOCUMENTS_DIRECTORY, "diff_") == null;
+
+ Assert.assertTrue(javaTest || dotNetTest);
+ }
+
/**
* Searches for certain text in page data.
*/
@@ -109,4 +179,4 @@ private boolean findTextInPageData(Map> pageData, int pa
return false;
}
-}
+}
\ No newline at end of file
diff --git a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tesseract4/ApiTest.java b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tesseract4/ApiTest.java
index 1f43e3e..f5eece1 100644
--- a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tesseract4/ApiTest.java
+++ b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tesseract4/ApiTest.java
@@ -24,6 +24,7 @@ This file is part of the iText (R) project.
import com.itextpdf.io.util.MessageFormatUtil;
import com.itextpdf.pdfocr.IntegrationTestHelper;
+import com.itextpdf.pdfocr.TextInfo;
import com.itextpdf.test.annotations.LogMessage;
import com.itextpdf.test.annotations.LogMessages;
@@ -32,6 +33,10 @@ This file is part of the iText (R) project.
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+
import net.sourceforge.lept4j.Pix;
import net.sourceforge.tess4j.TesseractException;
import org.junit.Assert;
@@ -45,7 +50,7 @@ public class ApiTest extends IntegrationTestHelper {
public ExpectedException junitExpectedException = ExpectedException.none();
@LogMessages(messages = {
- @LogMessage(messageTemplate = Tesseract4OcrException.PATH_TO_TESS_DATA_IS_NOT_SET)
+ @LogMessage(messageTemplate = Tesseract4OcrException.PATH_TO_TESS_DATA_IS_NOT_SET)
})
@Test
public void testDefaultTessDataPathValidationForLib() {
@@ -60,7 +65,7 @@ public void testDefaultTessDataPathValidationForLib() {
}
@LogMessages(messages = {
- @LogMessage(messageTemplate = Tesseract4OcrException.PATH_TO_TESS_DATA_IS_NOT_SET)
+ @LogMessage(messageTemplate = Tesseract4OcrException.PATH_TO_TESS_DATA_IS_NOT_SET)
})
@Test
public void testDefaultTessDataPathValidationForExecutable() {
@@ -76,7 +81,7 @@ public void testDefaultTessDataPathValidationForExecutable() {
}
@LogMessages(messages = {
- @LogMessage(messageTemplate = Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE)
+ @LogMessage(messageTemplate = Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE)
})
@Test
public void testDoTesseractOcrForIncorrectImageForExecutable() {
@@ -96,8 +101,8 @@ public void testDoTesseractOcrForIncorrectImageForExecutable() {
}
@LogMessages(messages = {
- @LogMessage(messageTemplate = Tesseract4OcrException.TESSERACT_FAILED),
- @LogMessage(messageTemplate = Tesseract4LogMessageConstant.TESSERACT_FAILED)
+ @LogMessage(messageTemplate = Tesseract4OcrException.TESSERACT_FAILED),
+ @LogMessage(messageTemplate = Tesseract4LogMessageConstant.TESSERACT_FAILED)
})
@Test
public void testOcrResultForSinglePageForNullImage() {
@@ -131,10 +136,10 @@ public void testDoTesseractOcrForNonAsciiPathForExecutable() {
}
@LogMessages(messages = {
- @LogMessage(messageTemplate = Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE),
- @LogMessage(messageTemplate = Tesseract4OcrException.TESSERACT_FAILED),
- @LogMessage(messageTemplate = Tesseract4OcrException.TESSERACT_NOT_FOUND),
- @LogMessage(messageTemplate = Tesseract4LogMessageConstant.COMMAND_FAILED)
+ @LogMessage(messageTemplate = Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE),
+ @LogMessage(messageTemplate = Tesseract4OcrException.TESSERACT_FAILED),
+ @LogMessage(messageTemplate = Tesseract4OcrException.TESSERACT_NOT_FOUND),
+ @LogMessage(messageTemplate = Tesseract4LogMessageConstant.COMMAND_FAILED)
}, ignore = true)
@Test
public void testDoTesseractOcrForExecutableForWin() {
@@ -143,10 +148,10 @@ public void testDoTesseractOcrForExecutableForWin() {
}
@LogMessages(messages = {
- @LogMessage(messageTemplate = Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE),
- @LogMessage(messageTemplate = Tesseract4OcrException.TESSERACT_FAILED),
- @LogMessage(messageTemplate = Tesseract4OcrException.TESSERACT_NOT_FOUND),
- @LogMessage(messageTemplate = Tesseract4LogMessageConstant.COMMAND_FAILED)
+ @LogMessage(messageTemplate = Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE),
+ @LogMessage(messageTemplate = Tesseract4OcrException.TESSERACT_FAILED),
+ @LogMessage(messageTemplate = Tesseract4OcrException.TESSERACT_NOT_FOUND),
+ @LogMessage(messageTemplate = Tesseract4LogMessageConstant.COMMAND_FAILED)
}, ignore = true)
@Test
public void testDoTesseractOcrForExecutableForLinux() {
@@ -154,6 +159,22 @@ public void testDoTesseractOcrForExecutableForLinux() {
testSettingOsName("linux");
}
+ @LogMessages(messages = {
+ @LogMessage(messageTemplate = Tesseract4LogMessageConstant.CANNOT_PARSE_NODE_BBOX, count = 4)
+ })
+ @Test
+ public void testDetectAndFixBrokenBBoxes() throws IOException {
+ File hocrFile = new File(TEST_DOCUMENTS_DIRECTORY + "broken_bboxes.hocr");
+ Map> parsedHocr = TesseractHelper.parseHocrFile(Collections.singletonList(hocrFile),
+ TextPositioning.BY_WORDS_AND_LINES);
+ TextInfo textInfo = parsedHocr.get(1).get(1);
+
+ Assert.assertEquals(383.0f, (float)textInfo.getBbox().get(0), 0.1);
+ Assert.assertEquals(101.0f, (float)textInfo.getBbox().get(1), 0.1);
+ Assert.assertEquals(514.0f, (float)textInfo.getBbox().get(2), 0.1);
+ Assert.assertEquals(136.0f, (float)textInfo.getBbox().get(3), 0.1);
+ }
+
private void testSettingOsName(String osName) {
String path = TEST_IMAGES_DIRECTORY + "numbers_01.jpg";
File imgFile = new File(path);
@@ -175,4 +196,4 @@ private void testSettingOsName(String osName) {
System.setProperty(osPropertyName, os);
}
}
-}
+}
\ No newline at end of file
diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/broken_bboxes.hocr b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/broken_bboxes.hocr
new file mode 100644
index 0000000..4b432e0
--- /dev/null
+++ b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/broken_bboxes.hocr
@@ -0,0 +1,66 @@
+
+
+
+
+
+
+
+
+
+
+
+
+ บ
+ 1
+ ซ่
+ 1
+ ฒ
+
+
+ ป
+ ร
+ ะ
+ เท
+ ศ
+ ไ
+ ท
+ ย
+ ม
+ ี
+ ช็
+ อ
+ อ
+ ย
+ ่
+ า
+ ง
+ เป
+ ็
+ น
+ ท
+ า
+ ง
+ ก
+ า
+ ร
+ ว
+ ่
+ า
+ ร
+ า
+ ช
+ อ
+ า
+ ณา
+ จ
+ ั
+ ก
+ ร
+ ไท
+ ย
+
+
+
+
+
+
\ No newline at end of file
diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/invoice_front_thai_lib_dotnet.pdf b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/invoice_front_thai_lib_dotnet.pdf
new file mode 100644
index 0000000..011d403
Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/invoice_front_thai_lib_dotnet.pdf differ
diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/invoice_front_thai_lib_java.pdf b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/invoice_front_thai_lib_java.pdf
new file mode 100644
index 0000000..616b588
Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/invoice_front_thai_lib_java.pdf differ
diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/thai_01_lib_dotnet.pdf b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/thai_01_lib_dotnet.pdf
new file mode 100644
index 0000000..68f5d7e
Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/thai_01_lib_dotnet.pdf differ
diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/thai_01_lib_java.pdf b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/thai_01_lib_java.pdf
new file mode 100644
index 0000000..b65d60c
Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/thai_01_lib_java.pdf differ
diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/fonts/NotoSansThai-Regular.ttf b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/fonts/NotoSansThai-Regular.ttf
new file mode 100644
index 0000000..da12e41
Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/fonts/NotoSansThai-Regular.ttf differ
diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/invoice_front_thai.jpg b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/invoice_front_thai.jpg
new file mode 100644
index 0000000..e390fb7
Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/invoice_front_thai.jpg differ
diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/thai_01.jpg b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/thai_01.jpg
new file mode 100644
index 0000000..9aef2a5
Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/thai_01.jpg differ
diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/script/Thai.traineddata b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/script/Thai.traineddata
new file mode 100644
index 0000000..62acc3d
Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/script/Thai.traineddata differ
diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/tha.traineddata b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/tha.traineddata
new file mode 100644
index 0000000..fa80ee4
Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/tha.traineddata differ
diff --git a/pom.xml b/pom.xml
index e8ab2c5..30ffeef 100644
--- a/pom.xml
+++ b/pom.xml
@@ -5,12 +5,12 @@
com.itextpdf
root
- 7.1.11
+ 7.1.12
pdfocr-root
- 1.0.0
+ 1.0.1
pom
pdfOCR
@@ -22,7 +22,7 @@
- 7.1.11
+ 7.1.12
1.8
${java.version}
${java.version}