diff --git a/document-ai/README.md b/document-ai/README.md
index 19830da..4710c0d 100644
--- a/document-ai/README.md
+++ b/document-ai/README.md
@@ -47,7 +47,7 @@ Pricing Estimates - We have created a sample estimate based on some usage we see
 
 1. Click on Open in Google Cloud Shell button below.
 
-
+
 Open in Cloud Shell
diff --git a/document-ai/code/main.py b/document-ai/code/main.py
index 95d0a72..480bcd3 100644
--- a/document-ai/code/main.py
+++ b/document-ai/code/main.py
@@ -1,9 +1,9 @@
 from google.cloud import bigquery, documentai_v1beta3, storage
 import functions_framework
+from google.cloud import documentai
 import os
 import json
-
 
 def process_document(bucket_name, object_name):
     """Process a document stored in GCS."""
     print("Document processing started.")
@@ -20,13 +20,15 @@ def process_document(bucket_name, object_name):
     # Read the file into memory
     with open(file_path, "rb") as image:
         image_content = image.read()
+
     # Set the document content in the request
     document = {"content": image_content, "mime_type": blob.content_type}
 
     # Configure the process request
-    processor_name = os.getenv("DOC_AI_PROCESSOR")
+    processor_name = os.getenv("FORM_PARSER_PROCESSOR")
     if not processor_name:
-        print("Environment variable DOC_AI_PROCESSOR not set")
-        exit(1)
+        print("Environment variable FORM_PARSER_PROCESSOR not set")
+        return
+
     request = {"name": processor_name, "document": document}
     # Use the Document AI client to process the request
@@ -43,10 +45,24 @@ def process_document(bucket_name, object_name):
             fieldValue = get_text(form_field.field_value, document)
             document_dict[f"{fieldName}"] = fieldValue
 
+    # Extract Summary
+    # Set the document content in the request
+    document = {"content": image_content, "mime_type": blob.content_type}
+    print("Summarizing Document")
+    summary_processor_name = os.getenv("SUMMARY_PROCESSOR")
+    if not summary_processor_name:
+        print("Environment variable SUMMARY_PROCESSOR not set")
+        return
+
+    summary_request = {"name": summary_processor_name, "document": document}
+    summary_result = client.process_document(request=summary_request)
+    document = summary_result.document
+    summary_text = document.entities[0].mention_text
     print("Document processing complete.")
-    process_output(bucket_name, object_name, document_text, document_dict)
+    process_output(bucket_name, object_name, document_text, summary_text, document_dict)
+
 
 def get_text(doc_element: dict, document: dict):
     """
     Document AI identifies form fields by their offsets
@@ -67,7 +83,7 @@ def get_text(doc_element: dict, document: dict):
     return response
 
 
-def process_output(bucket_name, object_name, document_text, document_dict):
+def process_output(bucket_name, object_name, document_text, summary_text, document_dict):
     """Moves a blob from one bucket to another."""
     print("Process output started.")
     storage_client = storage.Client()
@@ -80,10 +96,16 @@ def process_output(bucket_name, object_name, document_text, document_dict):
     results_text_blob = destination_bucket.blob(results_text_name)
     results_text_blob.upload_from_string(document_text)
 
+    print("Saving summary results into the output bucket...")
+    results_summary_name = "{}.summary".format(object_name)
+    results_summary_blob = destination_bucket.blob(results_summary_name)
+    results_summary_blob.upload_from_string(summary_text)
+
     print("Saving json results into the output bucket...")
     results_json = {
         "document_file_name": object_name,
-        "document_content": document_dict
+        "document_content": document_dict,
+        "document_summary": summary_text
     }
     results_json = json.dumps(results_json)
     results_json_name = "{}.json".format(object_name)
@@ -105,6 +127,7 @@ def process_output(bucket_name, object_name, document_text, document_dict):
         schema=[
             bigquery.SchemaField("document_file_name", "STRING"),
             bigquery.SchemaField("document_content", "JSON"),
+            bigquery.SchemaField("document_summary", "STRING"),
         ],
         source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
     )
@@ -143,4 +166,4 @@ def trigger_gcs(cloud_event):
     print(f"Created: {timeCreated}")
     print(f"Updated: {updated}")
 
-    process_document(bucket, name)
+    process_document(bucket, name)
\ No newline at end of file
diff --git a/document-ai/infra/bq.tf b/document-ai/infra/bq.tf
index 4824d29..83a51ea 100644
--- a/document-ai/infra/bq.tf
+++ b/document-ai/infra/bq.tf
@@ -37,6 +37,11 @@ resource "google_bigquery_table" "form_parser" {
     "name": "document_content",
     "type": "JSON",
     "mode": "NULLABLE"
+  },
+  {
+    "name": "document_summary",
+    "type": "STRING",
+    "mode": "NULLABLE"
   }
 ]
 EOF
diff --git a/document-ai/infra/doc_ai.tf b/document-ai/infra/doc_ai.tf
index 524901f..62a5cc3 100644
--- a/document-ai/infra/doc_ai.tf
+++ b/document-ai/infra/doc_ai.tf
@@ -17,3 +17,8 @@ resource "google_document_ai_processor" "form_parser" {
   display_name = local.processor_name
   type         = "FORM_PARSER_PROCESSOR"
 }
+resource "google_document_ai_processor" "doc_summarizer" {
+  location     = var.location
+  display_name = local.summarizer_name
+  type         = "SUMMARY_PROCESSOR"
+}
\ No newline at end of file
diff --git a/document-ai/infra/functions.tf b/document-ai/infra/functions.tf
index 2716598..42ff739 100644
--- a/document-ai/infra/functions.tf
+++ b/document-ai/infra/functions.tf
@@ -50,7 +50,8 @@ resource "google_cloudfunctions2_function" "function" {
     timeout_seconds       = 60
     service_account_email = google_service_account.doc_ai_form_function.email
     environment_variables = {
-      DOC_AI_PROCESSOR = google_document_ai_processor.form_parser.id
+      FORM_PARSER_PROCESSOR = google_document_ai_processor.form_parser.id
+      SUMMARY_PROCESSOR     = google_document_ai_processor.doc_summarizer.id
       GCS_OUTPUT       = google_storage_bucket.doc_output.name
       BQ_TABLE_ID      = local.bq_table_id
       BQ_LOCATION      = var.region
diff --git a/document-ai/infra/variables.tf b/document-ai/infra/variables.tf
index 84f13e1..b43c1d9 100644
--- a/document-ai/infra/variables.tf
+++ b/document-ai/infra/variables.tf
@@ -22,6 +22,7 @@ locals {
   function_name   = "form-parser"
   processor_name  = "form-parser-${var.location}"
+  summarizer_name = "summary-parser-${var.location}"
   bq_table_name   = "form_parser"
   bq_dataset_name = "document_ai"
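
For reference, a minimal standalone sketch of the summarizer call this patch wires up, assuming SUMMARY_PROCESSOR holds the full processor resource name (projects/<project>/locations/<location>/processors/<id>); the helper name and the empty-entities guard are illustrative additions, not part of the patch:

import os

from google.cloud import documentai_v1beta3


def summarize_document(image_content: bytes, mime_type: str) -> str:
    """Send inline document bytes to the summary processor and return the summary text."""
    processor_name = os.environ["SUMMARY_PROCESSOR"]  # full processor resource name
    client = documentai_v1beta3.DocumentProcessorServiceClient()
    # Mirrors the patch: inline content plus MIME type, addressed to the processor.
    request = {
        "name": processor_name,
        "document": {"content": image_content, "mime_type": mime_type},
    }
    result = client.process_document(request=request)
    # The summarizer surfaces its output as document entities; guard against an
    # empty list rather than indexing entities[0] unconditionally as main.py does.
    entities = result.document.entities
    return entities[0].mention_text if entities else ""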