Skip to content

Commit

Permalink
Merge branch 'devel'
Browse files Browse the repository at this point in the history
  • Loading branch information
cdbethune committed Jan 17, 2025
2 parents 0faae67 + eaa7a29 commit e2052fe
Show file tree
Hide file tree
Showing 18 changed files with 275 additions and 30 deletions.
71 changes: 67 additions & 4 deletions deploy/docker-compose.j2
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,10 @@ services:
"--rabbit_host", "rabbitmq",
"--model", "pipelines/segmentation_weights",
"--result_queue", "lara_result_queue",
"--llm_provider", {{ llm_provider }}
"--llm_provider", {{ llm_provider }},
{%- if not google_application_credentials_dir %}
"--ocr_cloud_auth",
{%- endif %}
]
environment:
{%- if azure_openai_api_key %}
Expand All @@ -112,14 +115,26 @@ services:
{%- else %}
- OPENAI_API_KEY={{ openai_api_key }}
{%- endif %}
{%- if google_application_credentials_dir %}
- GOOGLE_APPLICATION_CREDENTIALS=/credentials/google_application_credentials.json
{%- else %}
- AZURE_CLIENT_ID={{ azure_client_id }}
- AZURE_CLIENT_SECRET={{ azure_client_secret }}
- AZURE_TENANT_ID={{ azure_tenant_id }}
- GOOGLE_AUDIENCE={{ google_audience }}
- AZURE_AUDIENCE={{ azure_audience }}
{%- endif %}
{%- if not local %}
- AWS_ACCESS_KEY_ID={{ aws_access_key_id }}
- AWS_SECRET_ACCESS_KEY={{ aws_secret_access_key }}
- AWS_REGION={{ aws_region }}
{%- endif %}
{%- if google_application_credentials_dir or local%}
volumes:
{%- endif %}
{%- if google_application_credentials_dir %}
- {{ google_application_credentials_dir }}:/credentials
{%- endif %}
{%- if local %}
- {{ work_dir }}:/workdir
- {{ image_dir }}:/imagedir
Expand Down Expand Up @@ -148,16 +163,32 @@ services:
"--rabbit_host", "rabbitmq",
"--model_point_extractor", "pipelines/point_extraction_weights/points.pt",
"--model_segmenter", "pipelines/segmentation_weights",
"--result_queue", "lara_result_queue"]
"--result_queue", "lara_result_queue",
{%- if not google_application_credentials_dir %}
"--ocr_cloud_auth",
{%- endif %}
]
environment:
{%- if google_application_credentials_dir %}
- GOOGLE_APPLICATION_CREDENTIALS=/credentials/google_application_credentials.json
{%- else %}
- AZURE_CLIENT_ID={{ azure_client_id }}
- AZURE_CLIENT_SECRET={{ azure_client_secret }}
- AZURE_TENANT_ID={{ azure_tenant_id }}
- GOOGLE_AUDIENCE={{ google_audience }}
- AZURE_AUDIENCE={{ azure_audience }}
{%- endif %}
{%- if not local %}
- AWS_ACCESS_KEY_ID={{ aws_access_key_id }}
- AWS_SECRET_ACCESS_KEY={{ aws_secret_access_key }}
- AWS_REGION={{ aws_region }}
{%- endif %}
{%- if google_application_credentials_dir or local%}
volumes:
{%- endif %}
{%- if google_application_credentials_dir %}
- {{ google_application_credentials_dir }}:/credentials
{%- endif %}
{%- if local %}
- {{ work_dir }}:/workdir
- {{ image_dir }}:/imagedir
Expand Down Expand Up @@ -193,16 +224,32 @@ services:
{%- endif %}
"--rabbit_host", "rabbitmq",
"--model", "pipelines/segmentation_weights",
"--result_queue", "lara_result_queue"]
"--result_queue", "lara_result_queue",
{%- if not google_application_credentials_dir %}
"--ocr_cloud_auth",
{%- endif %}
]
environment:
{%- if google_application_credentials_dir %}
- GOOGLE_APPLICATION_CREDENTIALS=/credentials/google_application_credentials.json
{%- else %}
- AZURE_CLIENT_ID={{ azure_client_id }}
- AZURE_CLIENT_SECRET={{ azure_client_secret }}
- AZURE_TENANT_ID={{ azure_tenant_id }}
- GOOGLE_AUDIENCE={{ google_audience }}
- AZURE_AUDIENCE={{ azure_audience }}
{%- endif %}
{%- if not local %}
- AWS_ACCESS_KEY_ID={{ aws_access_key_id }}
- AWS_SECRET_ACCESS_KEY={{ aws_secret_access_key }}
- AWS_REGION={{ aws_region }}
{%- endif %}
{%- if google_application_credentials_dir or local%}
volumes:
{%- endif %}
{%- if google_application_credentials_dir %}
- {{ google_application_credentials_dir }}:/credentials
{%- endif %}
{%- if local %}
- {{ work_dir }}:/workdir
- {{ image_dir }}:/imagedir
Expand Down Expand Up @@ -231,22 +278,38 @@ services:
"--rabbit_host", "rabbitmq",
"--model", "pipelines/segmentation_weights",
"--result_queue", "lara_result_queue",
"--llm_provider", {{ llm_provider }}]
"--llm_provider", {{ llm_provider }},
{%- if not google_application_credentials_dir %}
"--ocr_cloud_auth",
{%- endif %}
]
environment:
{%- if azure_openai_api_key %}
- AZURE_OPENAI_API_KEY={{ azure_openai_api_key }}
- AZURE_OPENAI_ENDPOINT={{ azure_openai_endpoint }}
{%- else %}
- OPENAI_API_KEY={{ openai_api_key }}
{%- endif %}
{%- if google_application_credentials_dir %}
- GOOGLE_APPLICATION_CREDENTIALS=/credentials/google_application_credentials.json
{%- else %}
- AZURE_CLIENT_ID={{ azure_client_id }}
- AZURE_CLIENT_SECRET={{ azure_client_secret }}
- AZURE_TENANT_ID={{ azure_tenant_id }}
- GOOGLE_AUDIENCE={{ google_audience }}
- AZURE_AUDIENCE={{ azure_audience }}
{%- endif %}
{%- if not local %}
- AWS_ACCESS_KEY_ID={{ aws_access_key_id }}
- AWS_SECRET_ACCESS_KEY={{ aws_secret_access_key }}
- AWS_REGION={{ aws_region }}
{%- endif %}
{%- if google_application_credentials_dir or local%}
volumes:
{%- endif %}
{%- if google_application_credentials_dir %}
- {{ google_application_credentials_dir }}:/credentials
{%- endif %}
{%- if local %}
- {{ work_dir }}:/workdir
- {{ image_dir }}:/imagedir
Expand Down
21 changes: 21 additions & 0 deletions deploy/vars_cloud_auth_example.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
"tag": "latest",
"cdr": true,
"work_dir": "s3://example_bucket/workdir",
"image_dir": "s3://example_bucket/images",
"aws_access_key_id": "AKIAIOSFODNN7EXAMPLE",
"aws_secret_access_key": "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY",
"aws_region": "us-west-2",
"cdr_api_token": "1234567890abcdef1234567890abcdef",
"cdr_host": "https://example.com/cdr",
"cdr_callback_url": "https://lara_cdr:5000",
"cog_host": "https://s3.amazonaws.com/example/cogs",
"azure_openai_api_key": "1234567890abcdef1234567890abcdef",
"azure_openai_endpoint": "https://azure/endpoint",
"llm_provider": "azure",
"azure_client_id": "1234567890abcdef1234567890abcdef",
"azure_client_secret": "1234567890abcdef1234567890abcdef",
"azure_tenant_id": "1234567890abcdef1234567890abcdef",
"google_audience": "https://google_audience_example.com",
"azure_audience": "https://azure_audience_example.com"
}
1 change: 0 additions & 1 deletion deploy/vars_example.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
"aws_region": "us-west-2",
"cdr_api_token": "1234567890abcdef1234567890abcdef",
"cdr_host": "https://example.com/cdr",
"cdr_callback_url": "https://lara_cdr:5000",
"cog_host": "https://s3.amazonaws.com/example/cogs",
"azure_openai_api_key": "1234567890abcdef1234567890abcdef",
"azure_openai_endpoint": "https://azure/endpoint",
Expand Down
2 changes: 2 additions & 0 deletions pipelines/geo_referencing/georeferencing_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def __init__(
diagnostics: bool,
gpu_enabled: bool,
metrics_url: str = "",
ocr_cloud_auth=False,
):
geocoding_cache_bounds = append_to_cache_location(
working_dir, "geocoding_cache_bounds.json"
Expand Down Expand Up @@ -131,6 +132,7 @@ def __init__(
6000,
gamma_correction=ocr_gamma_correction,
metrics_url=metrics_url,
ocr_cloud_auth=ocr_cloud_auth,
),
# Defines an allowed region for coordinates to occupy by buffering
# the extracted map area polyline by a fixed amount
Expand Down
2 changes: 2 additions & 0 deletions pipelines/geo_referencing/run_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@ def start_server():
parser.add_argument("--no_gpu", action="store_true")
parser.add_argument("--project", action="store_true")
parser.add_argument("--diagnostics", action="store_true")
parser.add_argument("--ocr_cloud_auth", action="store_true")
p = parser.parse_args()

# validate any s3 path args up front
Expand Down Expand Up @@ -216,6 +217,7 @@ def start_server():
p.diagnostics,
not p.no_gpu,
p.metrics_url,
p.ocr_cloud_auth,
)

#### start flask server or startup up the message queue
Expand Down
2 changes: 2 additions & 0 deletions pipelines/metadata_extraction/metadata_extraction_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def __init__(
provider=LLM_PROVIDER.OPENAI,
gpu=True,
metrics_url: str = "",
ocr_cloud_auth=False,
):
# extract text from image, filter out the legend and map areas, and then extract metadata using an LLM
tasks = [
Expand All @@ -75,6 +76,7 @@ def __init__(
6000,
0.5,
metrics_url=metrics_url,
ocr_cloud_auth=ocr_cloud_auth,
),
# filter out the text that is not part of the map or supplemental information
TextFilter(
Expand Down
2 changes: 2 additions & 0 deletions pipelines/metadata_extraction/run_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ def health():
parser.add_argument("--request_queue", type=str, default=METADATA_REQUEST_QUEUE)
parser.add_argument("--result_queue", type=str, default=METADATA_RESULT_QUEUE)
parser.add_argument("--no_gpu", action="store_true")
parser.add_argument("--ocr_cloud_auth", action="store_true")
p = parser.parse_args()

# validate any s3 path args up front
Expand All @@ -143,6 +144,7 @@ def health():
provider=p.llm_provider,
gpu=not p.no_gpu,
metrics_url=p.metrics_url,
ocr_cloud_auth=p.ocr_cloud_auth,
)

metadata_result_key = (
Expand Down
2 changes: 2 additions & 0 deletions pipelines/point_extraction/point_extraction_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ def __init__(
batch_size=20,
metrics_url="",
debug_images=False,
ocr_cloud_auth=False,
):
# extract text from image, segmentation to only keep the map area,
# tile, extract points, untile, predict direction
Expand Down Expand Up @@ -113,6 +114,7 @@ def __init__(
append_to_cache_location(cache_location, "text"),
gamma_correction=0.5,
metrics_url=metrics_url,
ocr_cloud_auth=ocr_cloud_auth,
),
LegendPreprocessor("legend_preprocessor", "", fetch_legend_items),
Tiler("tiling"),
Expand Down
2 changes: 2 additions & 0 deletions pipelines/point_extraction/run_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ def health():
parser.add_argument("--result_queue", type=str, default=POINTS_RESULT_QUEUE)
parser.add_argument("--no_gpu", action="store_true")
parser.add_argument("--batch_size", type=int, default=20)
parser.add_argument("--ocr_cloud_auth", action="store_true")
p = parser.parse_args()

# validate any s3 path args up front
Expand All @@ -115,6 +116,7 @@ def health():
gpu=not p.no_gpu,
batch_size=p.batch_size,
metrics_url=p.metrics_url,
ocr_cloud_auth=p.ocr_cloud_auth,
)

result_key = (
Expand Down
2 changes: 2 additions & 0 deletions pipelines/segmentation/run_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ def health():
parser.add_argument("--request_queue", type=str, default=SEGMENTATION_REQUEST_QUEUE)
parser.add_argument("--result_queue", type=str, default=SEGMENTATION_RESULT_QUEUE)
parser.add_argument("--no_gpu", action="store_true")
parser.add_argument("--ocr_cloud_auth", action="store_true")
p = parser.parse_args()

# validate any s3 path args up front
Expand All @@ -115,6 +116,7 @@ def health():
cdr_schema=p.cdr_schema,
gpu=not p.no_gpu,
metrics_url=p.metrics_url,
ocr_cloud_auth=p.ocr_cloud_auth,
)

# get ta1 schema output or internal output format
Expand Down
2 changes: 2 additions & 0 deletions pipelines/segmentation/segmentation_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def __init__(
confidence_thres=0.25,
gpu=True,
metrics_url: str = "",
ocr_cloud_auth=False,
):
"""
Initializes the pipeline.
Expand All @@ -59,6 +60,7 @@ def __init__(
6000,
0.5,
metrics_url=metrics_url,
ocr_cloud_auth=ocr_cloud_auth,
),
DetectronSegmenter(
"segmenter",
Expand Down
9 changes: 8 additions & 1 deletion pipelines/text_extraction/run_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,13 +97,20 @@ def health():
parser.add_argument("--metrics_url", type=str, default="")
parser.add_argument("--request_queue", type=str, default=TEXT_REQUEST_QUEUE)
parser.add_argument("--result_queue", type=str, default=TEXT_RESULT_QUEUE)
parser.add_argument("--ocr_cloud_auth", action="store_true")
p = parser.parse_args()

# validate s3 path args up front
validate_s3_config("", p.workdir, p.imagedir, "")

pipeline = TextExtractionPipeline(
p.workdir, p.tile, p.pixel_limit, p.gamma_corr, p.debug, p.metrics_url
p.workdir,
p.tile,
p.pixel_limit,
p.gamma_corr,
p.debug,
p.metrics_url,
p.ocr_cloud_auth,
)

result_key = (
Expand Down
3 changes: 3 additions & 0 deletions pipelines/text_extraction/text_extraction_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def __init__(
gamma_corr=1.0,
debug_images=False,
metrics_url: str = "",
ocr_cloud_auth=False,
):
if tile:
tasks = [
Expand All @@ -54,6 +55,7 @@ def __init__(
pixel_limit,
gamma_corr,
metrics_url=metrics_url,
ocr_cloud_auth=ocr_cloud_auth,
)
]
else:
Expand All @@ -66,6 +68,7 @@ def __init__(
pixel_limit,
gamma_corr,
metrics_url=metrics_url,
ocr_cloud_auth=ocr_cloud_auth,
)
]

Expand Down
4 changes: 2 additions & 2 deletions tasks/common/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,7 @@ def _read_from_s3(
try:
client.head_object(Bucket=bucket, Key=key)
except client.exceptions.ClientError as e:
error_code = int(e.response["Error"]["Code"])
error_code = int(e.response["Error"]["Code"]) # type: ignore
if error_code == 404:
return None

Expand Down Expand Up @@ -446,7 +446,7 @@ def bucket_exists(uri: str) -> bool:
except client.exceptions.NoSuchBucket:
return False
except client.exceptions.ClientError as e:
error_code = int(e.response["Error"]["Code"])
error_code = int(e.response["Error"]["Code"]) # type: ignore
if error_code == 404 or error_code == 403:
return False
raise
Expand Down
4 changes: 3 additions & 1 deletion tasks/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,11 @@ dependencies = [
"pydantic",
"tiktoken",
"google-cloud-vision",
"google-auth",
"msal",
"grpcio",
"boto3",
"boto3-stubs",
"boto3-stubs[s3]",
"pillow",
"geopy",
"matplotlib",
Expand Down
Loading

0 comments on commit e2052fe

Please sign in to comment.