From 37e3e30112e3312086ff8d259d5e773cd0cdfddd Mon Sep 17 00:00:00 2001 From: Aatman Vaidya Date: Tue, 2 Jan 2024 15:00:39 +0530 Subject: [PATCH] docs: documentation for tesseract ocr operator --- docs/package-lock.json | 200 ++++++++++++++---- .../detect-text-in-image-tesseract.mdx | 18 +- .../detect_text_in_image_tesseract.py | 6 +- 3 files changed, 174 insertions(+), 50 deletions(-) diff --git a/docs/package-lock.json b/docs/package-lock.json index c1bbf077..6f9103a4 100644 --- a/docs/package-lock.json +++ b/docs/package-lock.json @@ -5,6 +5,7 @@ "requires": true, "packages": { "": { + "name": "docs", "version": "1.0.0", "dependencies": { "@deckdeckgo/highlight-code": "^3.4.1", @@ -1923,6 +1924,17 @@ "node": ">= 4" } }, + "node_modules/@eslint/eslintrc/node_modules/type-fest": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.20.2.tgz", + "integrity": "sha512-Ne+eE4r0/iWnpAxD852z3A+N0Bt5RN//NjJwRd2VFHEmrywxf5vsZlh4R6lixl6B+wz/8d+maTSAkN1FIkI3LQ==", + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/@gatsbyjs/reach-router": { "version": "1.3.6", "resolved": "https://registry.npmjs.org/@gatsbyjs/reach-router/-/reach-router-1.3.6.tgz", @@ -4063,9 +4075,9 @@ } }, "node_modules/acorn": { - "version": "7.4.1", - "resolved": "https://registry.npmjs.org/acorn/-/acorn-7.4.1.tgz", - "integrity": "sha512-nQyp0o1/mNdbTO1PO6kHkwSrmgZ0MT/jCCpNiwbUjGoRN4dlBhqJtoQuCnEOKzgTVwg0ZWiCoQy6SxMebQVh8A==", + "version": "8.11.3", + "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.11.3.tgz", + "integrity": "sha512-Y9rRfJG5jcKOE0CLisYbojUjIrIEE7AGMzA/Sm4BslANhbS+cDMpgBdcPT91oJ7OuJ9hYJBx59RjbhxVnrF8Xg==", "bin": { "acorn": "bin/acorn" }, @@ -4711,6 +4723,36 @@ "resolved": "https://registry.npmjs.org/axobject-query/-/axobject-query-2.2.0.tgz", "integrity": "sha512-Td525n+iPOOyUQIeBfcASuG6uJsDOITl7Mds5gFyerkWiX7qhUTdYUBlSgNMyVqtSJqwpt1kXGLdUt6SykLMRA==" }, + "node_modules/babel-eslint": { + "version": "10.1.0", + "resolved": "https://registry.npmjs.org/babel-eslint/-/babel-eslint-10.1.0.tgz", + "integrity": "sha512-ifWaTHQ0ce+448CYop8AdrQiBsGrnC+bMgfyKFdi6EsPLTAWG+QfyDeM6OH+FmWnKvEq5NnBMLvlBUPKQZoDSg==", + "deprecated": "babel-eslint is now @babel/eslint-parser. This package will no longer receive updates.", + "peer": true, + "dependencies": { + "@babel/code-frame": "^7.0.0", + "@babel/parser": "^7.7.0", + "@babel/traverse": "^7.7.0", + "@babel/types": "^7.7.0", + "eslint-visitor-keys": "^1.0.0", + "resolve": "^1.12.0" + }, + "engines": { + "node": ">=6" + }, + "peerDependencies": { + "eslint": ">= 4.12.1" + } + }, + "node_modules/babel-eslint/node_modules/eslint-visitor-keys": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-1.3.0.tgz", + "integrity": "sha512-6J72N8UNa462wa/KFODt/PJ3IU60SDpC3QXC1Hjc1BXXpfL2C9R5+AU7jhe0F6GREqVMh4Juu+NY7xn+6dipUQ==", + "peer": true, + "engines": { + "node": ">=4" + } + }, "node_modules/babel-jsx-utils": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/babel-jsx-utils/-/babel-jsx-utils-1.1.0.tgz", @@ -5185,6 +5227,17 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/boxen/node_modules/type-fest": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.20.2.tgz", + "integrity": "sha512-Ne+eE4r0/iWnpAxD852z3A+N0Bt5RN//NjJwRd2VFHEmrywxf5vsZlh4R6lixl6B+wz/8d+maTSAkN1FIkI3LQ==", + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/brace-expansion": { "version": "1.1.11", "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz", @@ -8499,6 +8552,17 @@ "node": ">=8" } }, + "node_modules/eslint/node_modules/type-fest": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.20.2.tgz", + "integrity": "sha512-Ne+eE4r0/iWnpAxD852z3A+N0Bt5RN//NjJwRd2VFHEmrywxf5vsZlh4R6lixl6B+wz/8d+maTSAkN1FIkI3LQ==", + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/eslint/node_modules/which": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz", @@ -8526,6 +8590,17 @@ "node": "^10.12.0 || >=12.0.0" } }, + "node_modules/espree/node_modules/acorn": { + "version": "7.4.1", + "resolved": "https://registry.npmjs.org/acorn/-/acorn-7.4.1.tgz", + "integrity": "sha512-nQyp0o1/mNdbTO1PO6kHkwSrmgZ0MT/jCCpNiwbUjGoRN4dlBhqJtoQuCnEOKzgTVwg0ZWiCoQy6SxMebQVh8A==", + "bin": { + "acorn": "bin/acorn" + }, + "engines": { + "node": ">=0.4.0" + } + }, "node_modules/espree/node_modules/eslint-visitor-keys": { "version": "1.3.0", "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-1.3.0.tgz", @@ -14825,17 +14900,6 @@ "url": "https://opencollective.com/unified" } }, - "node_modules/micromark-extension-mdxjs/node_modules/acorn": { - "version": "8.5.0", - "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.5.0.tgz", - "integrity": "sha512-yXbYeFy+jUuYd3/CDcg2NkIYE991XYX/bje7LmjJigUciaeO1JR4XxXgCIV1/Zc/dRuFEyw1L0pbA+qynJkW5Q==", - "bin": { - "acorn": "bin/acorn" - }, - "engines": { - "node": ">=0.4.0" - } - }, "node_modules/micromark-factory-destination": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/micromark-factory-destination/-/micromark-factory-destination-1.0.0.tgz", @@ -21902,9 +21966,11 @@ } }, "node_modules/type-fest": { - "version": "0.20.2", - "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.20.2.tgz", - "integrity": "sha512-Ne+eE4r0/iWnpAxD852z3A+N0Bt5RN//NjJwRd2VFHEmrywxf5vsZlh4R6lixl6B+wz/8d+maTSAkN1FIkI3LQ==", + "version": "0.13.1", + "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.13.1.tgz", + "integrity": "sha512-34R7HTnG0XIJcBSn5XhDd7nNFPRcXYRZrBB2O2jdKqYODldSzBAqzsWoZYYvduky73toYS/ESqxPvkDf/F0XMg==", + "optional": true, + "peer": true, "engines": { "node": ">=10" }, @@ -21942,6 +22008,19 @@ "is-typedarray": "^1.0.0" } }, + "node_modules/typescript": { + "version": "5.3.3", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.3.3.tgz", + "integrity": "sha512-pXWcraxM0uxAS+tN0AG/BF2TyqmHO014Z070UsJ+pFvYuRSq8KH8DmWpnbXe0pEPDHXZV3FcAbJkijJ5oNEnWw==", + "peer": true, + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=14.17" + } + }, "node_modules/unbox-primitive": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/unbox-primitive/-/unbox-primitive-1.0.1.tgz", @@ -22869,17 +22948,6 @@ "debug": "^3.0.0" } }, - "node_modules/webpack/node_modules/acorn": { - "version": "8.5.0", - "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.5.0.tgz", - "integrity": "sha512-yXbYeFy+jUuYd3/CDcg2NkIYE991XYX/bje7LmjJigUciaeO1JR4XxXgCIV1/Zc/dRuFEyw1L0pbA+qynJkW5Q==", - "bin": { - "acorn": "bin/acorn" - }, - "engines": { - "node": ">=0.4.0" - } - }, "node_modules/webpack/node_modules/schema-utils": { "version": "3.1.1", "resolved": "https://registry.npmjs.org/schema-utils/-/schema-utils-3.1.1.tgz", @@ -24629,6 +24697,11 @@ "version": "4.0.6", "resolved": "https://registry.npmjs.org/ignore/-/ignore-4.0.6.tgz", "integrity": "sha512-cyFDKrqc/YdcWFniJhzI42+AzS+gNwmUzOSFcRCQYwySuBBBy/KjuxWLZ/FHEH6Moq1NizMOBWyTcv8O4OZIMg==" + }, + "type-fest": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.20.2.tgz", + "integrity": "sha512-Ne+eE4r0/iWnpAxD852z3A+N0Bt5RN//NjJwRd2VFHEmrywxf5vsZlh4R6lixl6B+wz/8d+maTSAkN1FIkI3LQ==" } } }, @@ -26339,9 +26412,9 @@ } }, "acorn": { - "version": "7.4.1", - "resolved": "https://registry.npmjs.org/acorn/-/acorn-7.4.1.tgz", - "integrity": "sha512-nQyp0o1/mNdbTO1PO6kHkwSrmgZ0MT/jCCpNiwbUjGoRN4dlBhqJtoQuCnEOKzgTVwg0ZWiCoQy6SxMebQVh8A==" + "version": "8.11.3", + "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.11.3.tgz", + "integrity": "sha512-Y9rRfJG5jcKOE0CLisYbojUjIrIEE7AGMzA/Sm4BslANhbS+cDMpgBdcPT91oJ7OuJ9hYJBx59RjbhxVnrF8Xg==" }, "acorn-import-assertions": { "version": "1.8.0", @@ -26835,6 +26908,28 @@ "resolved": "https://registry.npmjs.org/axobject-query/-/axobject-query-2.2.0.tgz", "integrity": "sha512-Td525n+iPOOyUQIeBfcASuG6uJsDOITl7Mds5gFyerkWiX7qhUTdYUBlSgNMyVqtSJqwpt1kXGLdUt6SykLMRA==" }, + "babel-eslint": { + "version": "10.1.0", + "resolved": "https://registry.npmjs.org/babel-eslint/-/babel-eslint-10.1.0.tgz", + "integrity": "sha512-ifWaTHQ0ce+448CYop8AdrQiBsGrnC+bMgfyKFdi6EsPLTAWG+QfyDeM6OH+FmWnKvEq5NnBMLvlBUPKQZoDSg==", + "peer": true, + "requires": { + "@babel/code-frame": "^7.0.0", + "@babel/parser": "^7.7.0", + "@babel/traverse": "^7.7.0", + "@babel/types": "^7.7.0", + "eslint-visitor-keys": "^1.0.0", + "resolve": "^1.12.0" + }, + "dependencies": { + "eslint-visitor-keys": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-1.3.0.tgz", + "integrity": "sha512-6J72N8UNa462wa/KFODt/PJ3IU60SDpC3QXC1Hjc1BXXpfL2C9R5+AU7jhe0F6GREqVMh4Juu+NY7xn+6dipUQ==", + "peer": true + } + } + }, "babel-jsx-utils": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/babel-jsx-utils/-/babel-jsx-utils-1.1.0.tgz", @@ -27207,6 +27302,13 @@ "type-fest": "^0.20.2", "widest-line": "^3.1.0", "wrap-ansi": "^7.0.0" + }, + "dependencies": { + "type-fest": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.20.2.tgz", + "integrity": "sha512-Ne+eE4r0/iWnpAxD852z3A+N0Bt5RN//NjJwRd2VFHEmrywxf5vsZlh4R6lixl6B+wz/8d+maTSAkN1FIkI3LQ==" + } } }, "brace-expansion": { @@ -29490,6 +29592,11 @@ "ansi-regex": "^5.0.1" } }, + "type-fest": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.20.2.tgz", + "integrity": "sha512-Ne+eE4r0/iWnpAxD852z3A+N0Bt5RN//NjJwRd2VFHEmrywxf5vsZlh4R6lixl6B+wz/8d+maTSAkN1FIkI3LQ==" + }, "which": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz", @@ -29781,6 +29888,11 @@ "eslint-visitor-keys": "^1.3.0" }, "dependencies": { + "acorn": { + "version": "7.4.1", + "resolved": "https://registry.npmjs.org/acorn/-/acorn-7.4.1.tgz", + "integrity": "sha512-nQyp0o1/mNdbTO1PO6kHkwSrmgZ0MT/jCCpNiwbUjGoRN4dlBhqJtoQuCnEOKzgTVwg0ZWiCoQy6SxMebQVh8A==" + }, "eslint-visitor-keys": { "version": "1.3.0", "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-1.3.0.tgz", @@ -34686,13 +34798,6 @@ "micromark-extension-mdxjs-esm": "^1.0.0", "micromark-util-combine-extensions": "^1.0.0", "micromark-util-types": "^1.0.0" - }, - "dependencies": { - "acorn": { - "version": "8.5.0", - "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.5.0.tgz", - "integrity": "sha512-yXbYeFy+jUuYd3/CDcg2NkIYE991XYX/bje7LmjJigUciaeO1JR4XxXgCIV1/Zc/dRuFEyw1L0pbA+qynJkW5Q==" - } } }, "micromark-extension-mdxjs-esm": { @@ -39925,9 +40030,11 @@ } }, "type-fest": { - "version": "0.20.2", - "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.20.2.tgz", - "integrity": "sha512-Ne+eE4r0/iWnpAxD852z3A+N0Bt5RN//NjJwRd2VFHEmrywxf5vsZlh4R6lixl6B+wz/8d+maTSAkN1FIkI3LQ==" + "version": "0.13.1", + "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.13.1.tgz", + "integrity": "sha512-34R7HTnG0XIJcBSn5XhDd7nNFPRcXYRZrBB2O2jdKqYODldSzBAqzsWoZYYvduky73toYS/ESqxPvkDf/F0XMg==", + "optional": true, + "peer": true }, "type-is": { "version": "1.6.18", @@ -39956,6 +40063,12 @@ "is-typedarray": "^1.0.0" } }, + "typescript": { + "version": "5.3.3", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.3.3.tgz", + "integrity": "sha512-pXWcraxM0uxAS+tN0AG/BF2TyqmHO014Z070UsJ+pFvYuRSq8KH8DmWpnbXe0pEPDHXZV3FcAbJkijJ5oNEnWw==", + "peer": true + }, "unbox-primitive": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/unbox-primitive/-/unbox-primitive-1.0.1.tgz", @@ -40584,11 +40697,6 @@ "webpack-sources": "^3.2.0" }, "dependencies": { - "acorn": { - "version": "8.5.0", - "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.5.0.tgz", - "integrity": "sha512-yXbYeFy+jUuYd3/CDcg2NkIYE991XYX/bje7LmjJigUciaeO1JR4XxXgCIV1/Zc/dRuFEyw1L0pbA+qynJkW5Q==" - }, "schema-utils": { "version": "3.1.1", "resolved": "https://registry.npmjs.org/schema-utils/-/schema-utils-3.1.1.tgz", diff --git a/docs/src/pages/operators/detect-text-in-image-tesseract.mdx b/docs/src/pages/operators/detect-text-in-image-tesseract.mdx index 5d6f206e..d2b6066d 100644 --- a/docs/src/pages/operators/detect-text-in-image-tesseract.mdx +++ b/docs/src/pages/operators/detect-text-in-image-tesseract.mdx @@ -1 +1,17 @@ -## documentation for tesseract OCR \ No newline at end of file +## Documentation for Tesseract OCR Operator + +For each language support, we need to install separate tesseract operators for each language. Right now the current operator only supports English and Hindi languages. + +For Linux, you can follow these links to understand how and what modules to install for each language. +- https://tesseract-ocr.github.io/tessdoc/Installation.html +- https://www.loc.gov/standards/iso639-2/php/code_list.php + +To extract text from an image, we pass the image through a tesseract function like this + +``` +data = pytesseract.image_to_string(image, lang='eng+hin', config='--psm 6 --oem 1') +``` + +Here the config settings help us define some more insight into the image and LSTM blocks for the image extraction engines. + +You can take a look at the operator and the test of the operator for the entire code. \ No newline at end of file diff --git a/src/api/core/operators/detect_text_in_image_tesseract.py b/src/api/core/operators/detect_text_in_image_tesseract.py index 64c1e739..65c7e150 100644 --- a/src/api/core/operators/detect_text_in_image_tesseract.py +++ b/src/api/core/operators/detect_text_in_image_tesseract.py @@ -7,12 +7,12 @@ def initialize(param): global config_psm global config_oem + config_psm = 6 + config_oem = 1 global Image global pytesseract global requests global BytesIO - config_psm = 6 - config_oem = 1 import pytesseract from PIL import Image from io import BytesIO @@ -20,7 +20,7 @@ def initialize(param): def run(image_path): with Image.open(image_path) as load_image: - data = pytesseract.image_to_string(load_image, lang='eng+hin') + data = pytesseract.image_to_string(load_image, lang='eng+hin', config='--psm 6 --oem 1') return data def cleanup(param):