feat: operator to extract text in images using tesseract (#40)

* feat: operator to detect text in images using tesseract * chore: adding test images and making test multilingual
tattle-made · Dec 27, 2023 · edec4a9 · edec4a9
1 parent ac254d1
commit edec4a9
Show file tree

Hide file tree

Showing 8 changed files with 74 additions and 4 deletions.
diff --git a/.gitignore b/.gitignore
@@ -121,7 +121,9 @@ credentials.json
 
 # Docker logs
 .docker/*
-src/api/sample_data/
+# src/api/core/operators/sample_data/
+# sample_data/**
 # a temporary folder made by denny.
 # it is safe to delete this line if you no longer see this folder in the root folder
-_docs
+_docs
+
diff --git a/docs/src/pages/operators/detect-text-in-image-tesseract.mdx b/docs/src/pages/operators/detect-text-in-image-tesseract.mdx
@@ -0,0 +1 @@
+## documentation for tesseract OCR
diff --git a/src/api/Dockerfile b/src/api/Dockerfile
@@ -1,7 +1,7 @@
-FROM jrottenberg/ffmpeg:4.0-scratch AS ffmpeg
+# FROM jrottenberg/ffmpeg:4.0-scratch AS ffmpeg
 
 FROM python:3.7-slim AS base
-COPY --from=ffmpeg / /
+# COPY --from=ffmpeg / /
 RUN apt-get update \
     && apt-get -y upgrade \
     && apt-get install -y \
@@ -15,6 +15,10 @@ ENV PATH=/root/.local/bin:$PATH
 RUN pip install --upgrade pip
 RUN pip install pip-tools
 RUN apt-get update && apt-get -y upgrade && apt-get install -y vim curl
+RUN apt-get install -y ffmpeg
+RUN apt-get update && \
+    apt-get -y upgrade && \
+    apt-get install -y tesseract-ocr tesseract-ocr-hin
 WORKDIR /app
 COPY requirements.txt /app/requirements.txt
 RUN pip install --user -r requirements.txt

diff --git a/src/api/core/operators/detect_text_in_image_tesseract.py b/src/api/core/operators/detect_text_in_image_tesseract.py
@@ -0,0 +1,41 @@
+from installer import install_packages
+
+requirement_list = ["pytesseract==0.3.10"]  # Pinned requirement specs handed to install_packages() by initialize().
+
+def initialize(param):  # Operator setup: installs requirements, then binds config values and modules as module globals; `param` is not read.
+    install_packages(requirement_list)  # NOTE(review): presumably makes pytesseract importable for the imports below - confirm in installer.
+
+    global config_psm
+    global config_oem
+    global Image
+    global pytesseract
+    global requests
+    global BytesIO
+    config_psm = 6  # NOTE(review): presumably Tesseract's --psm (page segmentation mode); never read by run() in this file.
+    config_oem = 1  # NOTE(review): presumably Tesseract's --oem (OCR engine mode); never read by run() in this file.
+    import pytesseract
+    from PIL import Image
+    from io import BytesIO  # Only referenced by the commented-out URL example at the end of this file.
+    import requests  # Only referenced by the commented-out URL example at the end of this file.
+
+def run(image_path):  # Return the string pytesseract extracts from the image opened from `image_path`.
+    with Image.open(image_path) as load_image:  # `Image` and `pytesseract` are globals bound by initialize(); call it first.
+        data = pytesseract.image_to_string(load_image, lang='eng+hin')  # 'eng+hin': English plus Hindi Tesseract language data.
+    return data
+
+def cleanup(param):  # No-op; `param` is ignored and None is returned.
+    pass
+
+def state():  # No-op placeholder; always returns None.
+    pass
+
+# if __name__ == "__main__":
+#     initialize(param={})
+#     image_path = 'sample_data/hindi-text.png'
+#     text_data = run(image_path)
+#     print(text_data)
+    # image_url = "https://tattle-media.s3.amazonaws.com/test-data/tattle-search/text-in-image-test-hindi.png"
+    # response = requests.get(image_url)
+    # response.raise_for_status()
+    # text_data = run(BytesIO(response.content))
+    # print(text_data)
diff --git a/src/api/core/operators/sample_data/hindi-text-2.png b/src/api/core/operators/sample_data/hindi-text-2.png
diff --git a/src/api/core/operators/sample_data/hindi-text.png b/src/api/core/operators/sample_data/hindi-text.png
diff --git a/src/api/core/operators/sample_data/text.png b/src/api/core/operators/sample_data/text.png
diff --git a/src/api/core/operators/test_detect_text_in_image_tesseract.py b/src/api/core/operators/test_detect_text_in_image_tesseract.py
@@ -0,0 +1,22 @@
+import unittest
+import detect_text_in_image_tesseract
+
+class Test(unittest.TestCase):  # Runs the tesseract text-detection operator against a bundled sample image.
+    @classmethod
+    def setUpClass(cls):
+        # initialize the operator once for the whole test class
+        detect_text_in_image_tesseract.initialize(param={})
+
+    @classmethod
+    def tearDownClass(cls):
+        # placeholder: nothing is deleted or cleaned up here yet
+        pass
+
+    def test_sample_image_from_disk(self):
+        image_path = "sample_data/hindi-text-2.png"  # Relative path: resolved against the current working directory.
+        image_text = detect_text_in_image_tesseract.run(image_path)
+        expected_text = "( मेरे पीछे कौन आ रहा है)"
+        self.assertEqual(image_text.strip(), expected_text.strip())  # Exact match after stripping surrounding whitespace.
+
+# if __name__ == "__main__":
+#     unittest.main()