Skip to content

Commit

Permalink
feat: operator to extract text in images using tesseract (#40)
Browse files Browse the repository at this point in the history
* feat: operator to detect text in images using tesseract
* chore: adding test images and making test multilingual
  • Loading branch information
aatmanvaidya committed Dec 27, 2023
1 parent ac254d1 commit edec4a9
Show file tree
Hide file tree
Showing 8 changed files with 74 additions and 4 deletions.
6 changes: 4 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,9 @@ credentials.json

# Docker logs
.docker/*
src/api/sample_data/
# src/api/core/operators/sample_data/
# sample_data/**
# a temporary folder made by denny.
# it is safe to delete this line if you no longer see this folder in the root folder
_docs
_docs

Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
## documentation for tesseract OCR
8 changes: 6 additions & 2 deletions src/api/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
FROM jrottenberg/ffmpeg:4.0-scratch AS ffmpeg
# FROM jrottenberg/ffmpeg:4.0-scratch AS ffmpeg

FROM python:3.7-slim AS base
COPY --from=ffmpeg / /
# COPY --from=ffmpeg / /
RUN apt-get update \
&& apt-get -y upgrade \
&& apt-get install -y \
Expand All @@ -15,6 +15,10 @@ ENV PATH=/root/.local/bin:$PATH
RUN pip install --upgrade pip
RUN pip install pip-tools
RUN apt-get update && apt-get -y upgrade && apt-get install -y vim curl
# Install ffmpeg from the distribution's apt repository.
RUN apt-get install -y ffmpeg
# Install the Tesseract OCR engine and its Hindi language package
# (tesseract-ocr-hin); the tesseract operator requests lang 'eng+hin'.
RUN apt-get update && \
apt-get -y upgrade && \
apt-get install -y tesseract-ocr tesseract-ocr-hin
WORKDIR /app
COPY requirements.txt /app/requirements.txt
RUN pip install --user -r requirements.txt
Expand Down
41 changes: 41 additions & 0 deletions src/api/core/operators/detect_text_in_image_tesseract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from installer import install_packages

requirement_list = ["pytesseract==0.3.10"]

def initialize(param):
    """Install the operator's requirements and load them into module globals.

    The third-party imports are deferred until after ``install_packages`` has
    run, and are declared ``global`` so the other functions in this module
    (e.g. ``run``) can use them.

    Args:
        param: Operator parameters; not read by this function.
    """
    install_packages(requirement_list)

    global config_psm
    global config_oem
    global Image
    global pytesseract
    global requests
    global BytesIO

    # NOTE(review): presumably Tesseract page-segmentation / OCR-engine modes;
    # the visible code never passes them to pytesseract -- confirm intent.
    config_psm = 6
    config_oem = 1

    import pytesseract
    from PIL import Image
    from io import BytesIO
    import requests

def run(image_path):
    """Extract text from an image with Tesseract using English + Hindi data.

    ``initialize`` must be called first: it is what binds ``Image`` and
    ``pytesseract`` as module globals.

    Args:
        image_path: Passed straight to ``Image.open``.

    Returns:
        The value returned by ``pytesseract.image_to_string`` for the image.
    """
    # The context manager closes the image file once OCR has finished.
    with Image.open(image_path) as load_image:
        data = pytesseract.image_to_string(load_image, lang='eng+hin')
    return data

def cleanup(param):
    """Operator cleanup hook; currently does nothing.

    Args:
        param: Unused.
    """
    pass

def state():
    """Operator state hook; currently does nothing and returns ``None``."""
    pass

# if __name__ == "__main__":
# initialize(param={})
# image_path = 'sample_data/hindi-text.png'
# text_data = run(image_path)
# print(text_data)
# image_url = "https://tattle-media.s3.amazonaws.com/test-data/tattle-search/text-in-image-test-hindi.png"
# response = requests.get(image_url)
# response.raise_for_status()
# text_data = run(BytesIO(response.content))
# print(text_data)
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added src/api/core/operators/sample_data/hindi-text.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added src/api/core/operators/sample_data/text.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
22 changes: 22 additions & 0 deletions src/api/core/operators/test_detect_text_in_image_tesseract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import unittest
import detect_text_in_image_tesseract

class Test(unittest.TestCase):
    """Tests for the ``detect_text_in_image_tesseract`` operator."""

    @classmethod
    def setUpClass(cls):
        # initialize operator
        detect_text_in_image_tesseract.initialize(param={})

    @classmethod
    def tearDownClass(cls):
        # delete config files (nothing is removed at present)
        pass

    def test_sample_image_from_disk(self):
        """OCR a Hindi sample image and compare against the known text."""
        # Relative path: resolved against the current working directory.
        image_path = "sample_data/hindi-text-2.png"
        image_text = detect_text_in_image_tesseract.run(image_path)
        expected_text = "( मेरे पीछे कौन आ रहा है)"
        # Surrounding whitespace is stripped from both sides before comparing.
        self.assertEqual(image_text.strip(), expected_text.strip())

# if __name__ == "__main__":
# unittest.main()

0 comments on commit edec4a9

Please sign in to comment.