diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index de590f5..a61983a 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -41,7 +41,7 @@ jobs:
         run: echo "PY=$(python -VV | sha256sum | cut -d' ' -f1)" >> $GITHUB_ENV
 
       - name: Install tesseract
-        run: sudo apt-get -y update && sudo apt-get install -y tesseract-ocr tesseract-ocr-fra
+        run: sudo apt-get -y update && sudo apt-get install -y libcurl4-openssl-dev tesseract-ocr tesseract-ocr-fra
 
       - name: Print tesseract version
         run: echo $(tesseract --version)
diff --git a/pytesseract/pytesseract.py b/pytesseract/pytesseract.py
index ad6beda..3d99a60 100644
--- a/pytesseract/pytesseract.py
+++ b/pytesseract/pytesseract.py
@@ -81,6 +81,8 @@
 
 TESSERACT_MIN_VERSION = Version('3.05')
 TESSERACT_ALTO_VERSION = Version('4.1.0')
+# Oldest tesseract release for which URL input is accepted (see save())
+TESSERACT_URL_VERSION = Version('4.1.1')
 
 
 class Output:
@@ -123,6 +125,17 @@ def __init__(self):
             'ALTO output not supported. Tesseract >= 4.1.0 required',
         )
 
+
+class URLNotSupported(EnvironmentError):
+    """
+    Raised for URL input when tesseract is older than 4.1.1 or lacks libcurl
+    """
+
+    def __init__(self):
+        super().__init__(
+            'URL input not supported. Tesseract >= 4.1.1 and libcurl required',
+        )
+
 
 def kill(process, code):
     process.terminate()
@@ -210,6 +223,13 @@ def save(image):
         with NamedTemporaryFile(prefix='tess_', delete=False) as f:
             if isinstance(image, str):
                 if image.startswith('http:') or image.startswith('https:'):
+                    # URL is handed over as-is: tesseract must fetch it
+                    tess_version = get_tesseract_version(cached=True)
+                    if (
+                        tess_version < TESSERACT_URL_VERSION
+                        or not has_libcurl(cached=True)
+                    ):
+                        raise URLNotSupported()
                     yield f.name, image
                 else:
                     yield f.name, realpath(normpath(normcase(image)))
@@ -473,6 +493,27 @@ def get_tesseract_version():
     return version
 
 
+@run_once
+def has_libcurl():
+    """
+    Returns True if tesseract-ocr was installed with libcurl or False otherwise
+
+    The answer is read from the output of `tesseract --version`.
+    Raises TesseractNotFoundError if launching tesseract fails with OSError.
+    """
+    try:
+        output = subprocess.check_output(
+            [tesseract_cmd, '--version'],
+            stderr=subprocess.STDOUT,
+            env=environ,
+            stdin=subprocess.DEVNULL,
+        )
+    except OSError:
+        raise TesseractNotFoundError()
+
+    return 'libcurl' in output.decode(DEFAULT_ENCODING)
+
+
 def image_to_string(
     image,
     lang=None,