Skip to content

Commit

Permalink
Merge branch 'devel'
Browse files Browse the repository at this point in the history
  • Loading branch information
cdbethune committed Jan 17, 2025
2 parents 0faae67 + eaa7a29 commit e2052fe
Show file tree
Hide file tree
Showing 18 changed files with 275 additions and 30 deletions.
71 changes: 67 additions & 4 deletions deploy/docker-compose.j2
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,10 @@ services:
"--rabbit_host", "rabbitmq",
"--model", "pipelines/segmentation_weights",
"--result_queue", "lara_result_queue",
"--llm_provider", {{ llm_provider }}
"--llm_provider", {{ llm_provider }},
{%- if not google_application_credentials_dir %}
"--ocr_cloud_auth",
{%- endif %}
]
environment:
{%- if azure_openai_api_key %}
Expand All @@ -112,14 +115,26 @@ services:
{%- else %}
- OPENAI_API_KEY={{ openai_api_key }}
{%- endif %}
{%- if google_application_credentials_dir %}
- GOOGLE_APPLICATION_CREDENTIALS=/credentials/google_application_credentials.json
{%- else %}
- AZURE_CLIENT_ID={{ azure_client_id }}
- AZURE_CLIENT_SECRET={{ azure_client_secret }}
- AZURE_TENANT_ID={{ azure_tenant_id }}
- GOOGLE_AUDIENCE={{ google_audience }}
- AZURE_AUDIENCE={{ azure_audience }}
{%- endif %}
{%- if not local %}
- AWS_ACCESS_KEY_ID={{ aws_access_key_id }}
- AWS_SECRET_ACCESS_KEY={{ aws_secret_access_key }}
- AWS_REGION={{ aws_region }}
{%- endif %}
{%- if google_application_credentials_dir or local%}
volumes:
{%- endif %}
{%- if google_application_credentials_dir %}
- {{ google_application_credentials_dir }}:/credentials
{%- endif %}
{%- if local %}
- {{ work_dir }}:/workdir
- {{ image_dir }}:/imagedir
Expand Down Expand Up @@ -148,16 +163,32 @@ services:
"--rabbit_host", "rabbitmq",
"--model_point_extractor", "pipelines/point_extraction_weights/points.pt",
"--model_segmenter", "pipelines/segmentation_weights",
"--result_queue", "lara_result_queue"]
"--result_queue", "lara_result_queue",
{%- if not google_application_credentials_dir %}
"--ocr_cloud_auth",
{%- endif %}
]
environment:
{%- if google_application_credentials_dir %}
- GOOGLE_APPLICATION_CREDENTIALS=/credentials/google_application_credentials.json
{%- else %}
- AZURE_CLIENT_ID={{ azure_client_id }}
- AZURE_CLIENT_SECRET={{ azure_client_secret }}
- AZURE_TENANT_ID={{ azure_tenant_id }}
- GOOGLE_AUDIENCE={{ google_audience }}
- AZURE_AUDIENCE={{ azure_audience }}
{%- endif %}
{%- if not local %}
- AWS_ACCESS_KEY_ID={{ aws_access_key_id }}
- AWS_SECRET_ACCESS_KEY={{ aws_secret_access_key }}
- AWS_REGION={{ aws_region }}
{%- endif %}
{%- if google_application_credentials_dir or local%}
volumes:
{%- endif %}
{%- if google_application_credentials_dir %}
- {{ google_application_credentials_dir }}:/credentials
{%- endif %}
{%- if local %}
- {{ work_dir }}:/workdir
- {{ image_dir }}:/imagedir
Expand Down Expand Up @@ -193,16 +224,32 @@ services:
{%- endif %}
"--rabbit_host", "rabbitmq",
"--model", "pipelines/segmentation_weights",
"--result_queue", "lara_result_queue"]
"--result_queue", "lara_result_queue",
{%- if not google_application_credentials_dir %}
"--ocr_cloud_auth",
{%- endif %}
]
environment:
{%- if google_application_credentials_dir %}
- GOOGLE_APPLICATION_CREDENTIALS=/credentials/google_application_credentials.json
{%- else %}
- AZURE_CLIENT_ID={{ azure_client_id }}
- AZURE_CLIENT_SECRET={{ azure_client_secret }}
- AZURE_TENANT_ID={{ azure_tenant_id }}
- GOOGLE_AUDIENCE={{ google_audience }}
- AZURE_AUDIENCE={{ azure_audience }}
{%- endif %}
{%- if not local %}
- AWS_ACCESS_KEY_ID={{ aws_access_key_id }}
- AWS_SECRET_ACCESS_KEY={{ aws_secret_access_key }}
- AWS_REGION={{ aws_region }}
{%- endif %}
{%- if google_application_credentials_dir or local%}
volumes:
{%- endif %}
{%- if google_application_credentials_dir %}
- {{ google_application_credentials_dir }}:/credentials
{%- endif %}
{%- if local %}
- {{ work_dir }}:/workdir
- {{ image_dir }}:/imagedir
Expand Down Expand Up @@ -231,22 +278,38 @@ services:
"--rabbit_host", "rabbitmq",
"--model", "pipelines/segmentation_weights",
"--result_queue", "lara_result_queue",
"--llm_provider", {{ llm_provider }}]
"--llm_provider", {{ llm_provider }},
{%- if not google_application_credentials_dir %}
"--ocr_cloud_auth",
{%- endif %}
]
environment:
{%- if azure_openai_api_key %}
- AZURE_OPENAI_API_KEY={{ azure_openai_api_key }}
- AZURE_OPENAI_ENDPOINT={{ azure_openai_endpoint }}
{%- else %}
- OPENAI_API_KEY={{ openai_api_key }}
{%- endif %}
{%- if google_application_credentials_dir %}
- GOOGLE_APPLICATION_CREDENTIALS=/credentials/google_application_credentials.json
{%- else %}
- AZURE_CLIENT_ID={{ azure_client_id }}
- AZURE_CLIENT_SECRET={{ azure_client_secret }}
- AZURE_TENANT_ID={{ azure_tenant_id }}
- GOOGLE_AUDIENCE={{ google_audience }}
- AZURE_AUDIENCE={{ azure_audience }}
{%- endif %}
{%- if not local %}
- AWS_ACCESS_KEY_ID={{ aws_access_key_id }}
- AWS_SECRET_ACCESS_KEY={{ aws_secret_access_key }}
- AWS_REGION={{ aws_region }}
{%- endif %}
{%- if google_application_credentials_dir or local%}
volumes:
{%- endif %}
{%- if google_application_credentials_dir %}
- {{ google_application_credentials_dir }}:/credentials
{%- endif %}
{%- if local %}
- {{ work_dir }}:/workdir
- {{ image_dir }}:/imagedir
Expand Down
21 changes: 21 additions & 0 deletions deploy/vars_cloud_auth_example.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
"tag": "latest",
"cdr": true,
"work_dir": "s3://example_bucket/workdir",
"image_dir": "s3://example_bucket/images",
"aws_access_key_id": "AKIAIOSFODNN7EXAMPLE",
"aws_secret_access_key": "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY",
"aws_region": "us-west-2",
"cdr_api_token": "1234567890abcdef1234567890abcdef",
"cdr_host": "https://example.com/cdr",
"cdr_callback_url": "https://lara_cdr:5000",
"cog_host": "https://s3.amazonaws.com/example/cogs",
"azure_openai_api_key": "1234567890abcdef1234567890abcdef",
"azure_openai_endpoint": "https://azure/endpoint",
"llm_provider": "azure",
"azure_client_id": "1234567890abcdef1234567890abcdef",
"azure_client_secret": "1234567890abcdef1234567890abcdef",
"azure_tenant_id": "1234567890abcdef1234567890abcdef",
"google_audience": "https://google_audience_example.com",
"azure_audience": "https://azure_audience_example.com"
}
1 change: 0 additions & 1 deletion deploy/vars_example.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
"aws_region": "us-west-2",
"cdr_api_token": "1234567890abcdef1234567890abcdef",
"cdr_host": "https://example.com/cdr",
"cdr_callback_url": "https://lara_cdr:5000",
"cog_host": "https://s3.amazonaws.com/example/cogs",
"azure_openai_api_key": "1234567890abcdef1234567890abcdef",
"azure_openai_endpoint": "https://azure/endpoint",
Expand Down
2 changes: 2 additions & 0 deletions pipelines/geo_referencing/georeferencing_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def __init__(
diagnostics: bool,
gpu_enabled: bool,
metrics_url: str = "",
ocr_cloud_auth=False,
):
geocoding_cache_bounds = append_to_cache_location(
working_dir, "geocoding_cache_bounds.json"
Expand Down Expand Up @@ -131,6 +132,7 @@ def __init__(
6000,
gamma_correction=ocr_gamma_correction,
metrics_url=metrics_url,
ocr_cloud_auth=ocr_cloud_auth,
),
# Defines an allowed region for coordinates to occupy by buffering
# the extracted map area polyline by a fixed amount
Expand Down
2 changes: 2 additions & 0 deletions pipelines/geo_referencing/run_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@ def start_server():
parser.add_argument("--no_gpu", action="store_true")
parser.add_argument("--project", action="store_true")
parser.add_argument("--diagnostics", action="store_true")
parser.add_argument("--ocr_cloud_auth", action="store_true")
p = parser.parse_args()

# validate any s3 path args up front
Expand Down Expand Up @@ -216,6 +217,7 @@ def start_server():
p.diagnostics,
not p.no_gpu,
p.metrics_url,
p.ocr_cloud_auth,
)

#### start flask server or startup up the message queue
Expand Down
2 changes: 2 additions & 0 deletions pipelines/metadata_extraction/metadata_extraction_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def __init__(
provider=LLM_PROVIDER.OPENAI,
gpu=True,
metrics_url: str = "",
ocr_cloud_auth=False,
):
# extract text from image, filter out the legend and map areas, and then extract metadata using an LLM
tasks = [
Expand All @@ -75,6 +76,7 @@ def __init__(
6000,
0.5,
metrics_url=metrics_url,
ocr_cloud_auth=ocr_cloud_auth,
),
# filter out the text that is not part of the map or supplemental information
TextFilter(
Expand Down
2 changes: 2 additions & 0 deletions pipelines/metadata_extraction/run_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ def health():
parser.add_argument("--request_queue", type=str, default=METADATA_REQUEST_QUEUE)
parser.add_argument("--result_queue", type=str, default=METADATA_RESULT_QUEUE)
parser.add_argument("--no_gpu", action="store_true")
parser.add_argument("--ocr_cloud_auth", action="store_true")
p = parser.parse_args()

# validate any s3 path args up front
Expand All @@ -143,6 +144,7 @@ def health():
provider=p.llm_provider,
gpu=not p.no_gpu,
metrics_url=p.metrics_url,
ocr_cloud_auth=p.ocr_cloud_auth,
)

metadata_result_key = (
Expand Down
2 changes: 2 additions & 0 deletions pipelines/point_extraction/point_extraction_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ def __init__(
batch_size=20,
metrics_url="",
debug_images=False,
ocr_cloud_auth=False,
):
# extract text from image, segmentation to only keep the map area,
# tile, extract points, untile, predict direction
Expand Down Expand Up @@ -113,6 +114,7 @@ def __init__(
append_to_cache_location(cache_location, "text"),
gamma_correction=0.5,
metrics_url=metrics_url,
ocr_cloud_auth=ocr_cloud_auth,
),
LegendPreprocessor("legend_preprocessor", "", fetch_legend_items),
Tiler("tiling"),
Expand Down
2 changes: 2 additions & 0 deletions pipelines/point_extraction/run_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ def health():
parser.add_argument("--result_queue", type=str, default=POINTS_RESULT_QUEUE)
parser.add_argument("--no_gpu", action="store_true")
parser.add_argument("--batch_size", type=int, default=20)
parser.add_argument("--ocr_cloud_auth", action="store_true")
p = parser.parse_args()

# validate any s3 path args up front
Expand All @@ -115,6 +116,7 @@ def health():
gpu=not p.no_gpu,
batch_size=p.batch_size,
metrics_url=p.metrics_url,
ocr_cloud_auth=p.ocr_cloud_auth,
)

result_key = (
Expand Down
2 changes: 2 additions & 0 deletions pipelines/segmentation/run_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ def health():
parser.add_argument("--request_queue", type=str, default=SEGMENTATION_REQUEST_QUEUE)
parser.add_argument("--result_queue", type=str, default=SEGMENTATION_RESULT_QUEUE)
parser.add_argument("--no_gpu", action="store_true")
parser.add_argument("--ocr_cloud_auth", action="store_true")
p = parser.parse_args()

# validate any s3 path args up front
Expand All @@ -115,6 +116,7 @@ def health():
cdr_schema=p.cdr_schema,
gpu=not p.no_gpu,
metrics_url=p.metrics_url,
ocr_cloud_auth=p.ocr_cloud_auth,
)

# get ta1 schema output or internal output format
Expand Down
2 changes: 2 additions & 0 deletions pipelines/segmentation/segmentation_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def __init__(
confidence_thres=0.25,
gpu=True,
metrics_url: str = "",
ocr_cloud_auth=False,
):
"""
Initializes the pipeline.
Expand All @@ -59,6 +60,7 @@ def __init__(
6000,
0.5,
metrics_url=metrics_url,
ocr_cloud_auth=ocr_cloud_auth,
),
DetectronSegmenter(
"segmenter",
Expand Down
9 changes: 8 additions & 1 deletion pipelines/text_extraction/run_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,13 +97,20 @@ def health():
parser.add_argument("--metrics_url", type=str, default="")
parser.add_argument("--request_queue", type=str, default=TEXT_REQUEST_QUEUE)
parser.add_argument("--result_queue", type=str, default=TEXT_RESULT_QUEUE)
parser.add_argument("--ocr_cloud_auth", action="store_true")
p = parser.parse_args()

# validate s3 path args up front
validate_s3_config("", p.workdir, p.imagedir, "")

pipeline = TextExtractionPipeline(
p.workdir, p.tile, p.pixel_limit, p.gamma_corr, p.debug, p.metrics_url
p.workdir,
p.tile,
p.pixel_limit,
p.gamma_corr,
p.debug,
p.metrics_url,
p.ocr_cloud_auth,
)

result_key = (
Expand Down
3 changes: 3 additions & 0 deletions pipelines/text_extraction/text_extraction_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def __init__(
gamma_corr=1.0,
debug_images=False,
metrics_url: str = "",
ocr_cloud_auth=False,
):
if tile:
tasks = [
Expand All @@ -54,6 +55,7 @@ def __init__(
pixel_limit,
gamma_corr,
metrics_url=metrics_url,
ocr_cloud_auth=ocr_cloud_auth,
)
]
else:
Expand All @@ -66,6 +68,7 @@ def __init__(
pixel_limit,
gamma_corr,
metrics_url=metrics_url,
ocr_cloud_auth=ocr_cloud_auth,
)
]

Expand Down
4 changes: 2 additions & 2 deletions tasks/common/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,7 @@ def _read_from_s3(
try:
client.head_object(Bucket=bucket, Key=key)
except client.exceptions.ClientError as e:
error_code = int(e.response["Error"]["Code"])
error_code = int(e.response["Error"]["Code"]) # type: ignore
if error_code == 404:
return None

Expand Down Expand Up @@ -446,7 +446,7 @@ def bucket_exists(uri: str) -> bool:
except client.exceptions.NoSuchBucket:
return False
except client.exceptions.ClientError as e:
error_code = int(e.response["Error"]["Code"])
error_code = int(e.response["Error"]["Code"]) # type: ignore
if error_code == 404 or error_code == 403:
return False
raise
Expand Down
4 changes: 3 additions & 1 deletion tasks/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,11 @@ dependencies = [
"pydantic",
"tiktoken",
"google-cloud-vision",
"google-auth",
"msal",
"grpcio",
"boto3",
"boto3-stubs",
"boto3-stubs[s3]",
"pillow",
"geopy",
"matplotlib",
Expand Down
Loading

0 comments on commit e2052fe

Please sign in to comment.