diff --git a/document-ai/README.md b/document-ai/README.md
index 19830da..4710c0d 100644
--- a/document-ai/README.md
+++ b/document-ai/README.md
@@ -47,7 +47,7 @@ Pricing Estimates - We have created a sample estimate based on some usage we see
1. Click on Open in Google Cloud Shell button below.
-
+
diff --git a/document-ai/code/main.py b/document-ai/code/main.py
index 95d0a72..480bcd3 100644
--- a/document-ai/code/main.py
+++ b/document-ai/code/main.py
@@ -1,9 +1,9 @@
from google.cloud import bigquery, documentai_v1beta3, storage
import functions_framework
+from google.cloud import documentai
import os
import json
-
def process_document(bucket_name, object_name):
"""Process a document stored in GCS."""
print("Document processing started.")
@@ -20,13 +20,15 @@ def process_document(bucket_name, object_name):
# Read the file into memory
with open(file_path, "rb") as image:
image_content = image.read()
+ # Set the document content in the request
document = {"content": image_content, "mime_type": blob.content_type}
# Configure the process request
- processor_name = os.getenv("DOC_AI_PROCESSOR")
+ processor_name = os.getenv("FORM_PARSER_PROCESSOR")
if not processor_name:
- print("Environment variable DOC_AI_PROCESSOR not set")
- exit(1)
+ print("Environment variable FORM_PARSER_PROCESSOR not set")
+ return
+
request = {"name": processor_name, "document": document}
# Use the Document AI client to process the request
@@ -43,10 +45,24 @@ def process_document(bucket_name, object_name):
fieldValue = get_text(form_field.field_value, document)
document_dict[f"{fieldName}"] = fieldValue
+ # Extract Summary
+ # Set the document content in the request
+ document = {"content": image_content, "mime_type": blob.content_type}
+ print("Summarizing Document")
+ summary_processor_name = os.getenv("SUMMARY_PROCESSOR")
+ if not summary_processor_name:
+ print("Environment variable SUMMARY_PROCESSOR not set")
+ return
+
+ summary_request = {"name": summary_processor_name, "document": document}
+ summary_result = client.process_document(request=summary_request)
+ document = summary_result.document
+ summary_text = document.entities[0].mention_text
print("Document processing complete.")
- process_output(bucket_name, object_name, document_text, document_dict)
+ process_output(bucket_name, object_name, document_text, summary_text, document_dict)
+
def get_text(doc_element: dict, document: dict):
"""
Document AI identifies form fields by their offsets
@@ -67,7 +83,7 @@ def get_text(doc_element: dict, document: dict):
return response
-def process_output(bucket_name, object_name, document_text, document_dict):
+def process_output(bucket_name, object_name, document_text, summary_text, document_dict):
"""Moves a blob from one bucket to another."""
print("Process output started.")
storage_client = storage.Client()
@@ -80,10 +96,16 @@ def process_output(bucket_name, object_name, document_text, document_dict):
results_text_blob = destination_bucket.blob(results_text_name)
results_text_blob.upload_from_string(document_text)
+ print("Saving summary results into the output bucket...")
+ results_summary_name = "{}.summary".format(object_name)
+ results_summary_blob = destination_bucket.blob(results_summary_name)
+ results_summary_blob.upload_from_string(summary_text)
+
print("Saving json results into the output bucket...")
results_json = {
"document_file_name": object_name,
- "document_content": document_dict
+ "document_content": document_dict,
+ "document_summary": summary_text
}
results_json = json.dumps(results_json)
results_json_name = "{}.json".format(object_name)
@@ -105,6 +127,7 @@ def process_output(bucket_name, object_name, document_text, document_dict):
schema=[
bigquery.SchemaField("document_file_name", "STRING"),
bigquery.SchemaField("document_content", "JSON"),
+ bigquery.SchemaField("document_summary", "STRING"),
],
source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
)
@@ -143,4 +166,4 @@ def trigger_gcs(cloud_event):
print(f"Created: {timeCreated}")
print(f"Updated: {updated}")
- process_document(bucket, name)
+ process_document(bucket, name)
\ No newline at end of file
diff --git a/document-ai/infra/bq.tf b/document-ai/infra/bq.tf
index 4824d29..83a51ea 100644
--- a/document-ai/infra/bq.tf
+++ b/document-ai/infra/bq.tf
@@ -37,6 +37,11 @@ resource "google_bigquery_table" "form_parser" {
"name": "document_content",
"type": "JSON",
"mode": "NULLABLE"
+ },
+ {
+ "name": "document_summary",
+ "type": "STRING",
+ "mode": "NULLABLE"
}
]
EOF
diff --git a/document-ai/infra/doc_ai.tf b/document-ai/infra/doc_ai.tf
index 524901f..62a5cc3 100644
--- a/document-ai/infra/doc_ai.tf
+++ b/document-ai/infra/doc_ai.tf
@@ -17,3 +17,8 @@ resource "google_document_ai_processor" "form_parser" {
display_name = local.processor_name
type = "FORM_PARSER_PROCESSOR"
}
+resource "google_document_ai_processor" "doc_summarizer" {
+ location = var.location
+ display_name = local.summarizer_name
+ type = "SUMMARY_PROCESSOR"
+}
\ No newline at end of file
diff --git a/document-ai/infra/functions.tf b/document-ai/infra/functions.tf
index 2716598..42ff739 100644
--- a/document-ai/infra/functions.tf
+++ b/document-ai/infra/functions.tf
@@ -50,7 +50,8 @@ resource "google_cloudfunctions2_function" "function" {
timeout_seconds = 60
service_account_email = google_service_account.doc_ai_form_function.email
environment_variables = {
- DOC_AI_PROCESSOR = google_document_ai_processor.form_parser.id
+ FORM_PARSER_PROCESSOR = google_document_ai_processor.form_parser.id
+ SUMMARY_PROCESSOR = google_document_ai_processor.doc_summarizer.id
GCS_OUTPUT = google_storage_bucket.doc_output.name
BQ_TABLE_ID = local.bq_table_id
BQ_LOCATION = var.region
diff --git a/document-ai/infra/variables.tf b/document-ai/infra/variables.tf
index 84f13e1..b43c1d9 100644
--- a/document-ai/infra/variables.tf
+++ b/document-ai/infra/variables.tf
@@ -22,6 +22,7 @@ locals {
function_name = "form-parser"
processor_name = "form-parser-${var.location}"
+ summarizer_name = "summary-parser-${var.location}"
bq_table_name = "form_parser"
bq_dataset_name = "document_ai"