From 493558cb2aeac1879fd3e9289a5fec8bb00d9218 Mon Sep 17 00:00:00 2001 From: Midnight Date: Wed, 3 Jul 2019 14:31:45 -0400 Subject: [PATCH 1/3] Fixed one test and used an env var for the google cloud bucket The bucket name will be given as an environment variable as the method to_text() is only given one argument when used in extract_data --- src/invoice2data/input/gvision.py | 10 +++++++--- tests/test_cli.py | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/invoice2data/input/gvision.py b/src/invoice2data/input/gvision.py index cfb48b49..93d6b5e0 100644 --- a/src/invoice2data/input/gvision.py +++ b/src/invoice2data/input/gvision.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -def to_text(path, bucket_name='cloud-vision-84893', language='fr'): +def to_text(path, language='fr'): """Sends PDF files to Google Cloud Vision for OCR. Before using invoice2data, make sure you have the auth json path set as @@ -9,8 +9,6 @@ def to_text(path, bucket_name='cloud-vision-84893', language='fr'): ---------- path : str path of electronic invoice in JPG or PNG format - bucket_name : str - name of bucket to use for file storage and results cache. 
Returns ------- @@ -27,6 +25,12 @@ def to_text(path, bucket_name='cloud-vision-84893', language='fr'): # Supported mime_types are: 'application/pdf' and 'image/tiff' mime_type = 'application/pdf' + bucket_name = os.getenv('GOOGLE_CLOUD_BUCKET_NAME', None) + + if bucket_name is None: + raise EnvironmentError( + 'GOOGLE_CLOUD_BUCKET_NAME environment variable not set' + ) path_dir, filename = os.path.split(path) result_blob_basename = filename.replace('.pdf', '').replace('.PDF', '') diff --git a/tests/test_cli.py b/tests/test_cli.py index 40f908f7..0741185f 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -133,7 +133,7 @@ def test_copy(self): i += 1 shutil.rmtree('tests/copy_test/', ignore_errors=True) - self.assertEqual(i, len(get_sample_files('.json'))) + self.assertEqual(i, len(get_sample_files('.pdf'))) ''' if i != len(self._get_test_file_json_path()): print(i) From 97ac5891a8e703efa22659ff6a61304cafb95414 Mon Sep 17 00:00:00 2001 From: Midnight Date: Wed, 3 Jul 2019 14:46:37 -0400 Subject: [PATCH 2/3] Updated README regarding changes with google cloud bucket name --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 078aa1e1..a159ebdb 100644 --- a/README.rst +++ b/README.rst @@ -67,7 +67,7 @@ Choose any of the following input readers: - tesseract ``invoice2data --input-reader tesseract invoice.pdf`` - pdf miner ``invoice2data --input-reader pdfminer invoice.pdf`` - tesseract4 ``invoice2data --input-reader tesseract4 invoice.pdf`` - - gvision ``invoice2data --input-reader gvision invoice.pdf`` (needs ``GOOGLE_APPLICATION_CREDENTIALS`` env var) + - gvision ``invoice2data --input-reader gvision invoice.pdf`` (needs ``GOOGLE_APPLICATION_CREDENTIALS`` and ``GOOGLE_CLOUD_BUCKET_NAME`` env var) Choose any of the following output formats: From 27c27c8e8025847e105f67d48c5efc622aa35009 Mon Sep 17 00:00:00 2001 From: Midnight Date: Mon, 15 Jul 2019 10:55:51 -0400 Subject: [PATCH 3/3] Added precision to README 
and Re-added the bucket name as argument --- README.rst | 2 +- src/invoice2data/input/gvision.py | 14 +++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/README.rst b/README.rst index a159ebdb..3cbd8585 100644 --- a/README.rst +++ b/README.rst @@ -67,7 +67,7 @@ Choose any of the following input readers: - tesseract ``invoice2data --input-reader tesseract invoice.pdf`` - pdf miner ``invoice2data --input-reader pdfminer invoice.pdf`` - tesseract4 ``invoice2data --input-reader tesseract4 invoice.pdf`` - - gvision ``invoice2data --input-reader gvision invoice.pdf`` (needs ``GOOGLE_APPLICATION_CREDENTIALS`` and ``GOOGLE_CLOUD_BUCKET_NAME`` env var) + - gvision ``invoice2data --input-reader gvision invoice.pdf`` (needs the ``GOOGLE_APPLICATION_CREDENTIALS`` env var and a Google Cloud bucket name; pass the bucket name as the ``bucket_name`` argument of ``to_text`` or set the ``GOOGLE_CLOUD_BUCKET_NAME`` environment variable) Choose any of the following output formats: diff --git a/src/invoice2data/input/gvision.py b/src/invoice2data/input/gvision.py index 93d6b5e0..cb75d5c7 100644 --- a/src/invoice2data/input/gvision.py +++ b/src/invoice2data/input/gvision.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -def to_text(path, language='fr'): +def to_text(path, bucket_name=None, language='fr'): """Sends PDF files to Google Cloud Vision for OCR. Before using invoice2data, make sure you have the auth json path set as @@ -9,6 +9,8 @@ def to_text(path, language='fr'): ---------- path : str path of electronic invoice in JPG or PNG format + bucket_name : str + name of bucket to use for file storage and results cache. 
Returns ------- @@ -25,12 +27,14 @@ def to_text(path, language='fr'): # Supported mime_types are: 'application/pdf' and 'image/tiff' mime_type = 'application/pdf' - bucket_name = os.getenv('GOOGLE_CLOUD_BUCKET_NAME', None) if bucket_name is None: - raise EnvironmentError( - 'GOOGLE_CLOUD_BUCKET_NAME environment variable not set' - ) + bucket_name = os.getenv('GOOGLE_CLOUD_BUCKET_NAME', None) + + if bucket_name is None: + raise EnvironmentError( + 'No Google Cloud Bucket name set.\n Set it as an input variable or as an environment variable named GOOGLE_CLOUD_BUCKET_NAME' + ) path_dir, filename = os.path.split(path) result_blob_basename = filename.replace('.pdf', '').replace('.PDF', '')