Skip to content

Commit

Permalink
Merge pull request #115 from GoogleCloudPlatform/document-ai-genai
Browse files (browse the repository at this point in the history)
Added GenAI summarization to Document AI processing
  • Loading branch information
rajasnathak authored Feb 1, 2024
2 parents 6867a93 + 6a5051c commit 56cee65
Show file tree
Hide file tree
Showing 6 changed files with 45 additions and 10 deletions.
2 changes: 1 addition & 1 deletion document-ai/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ Pricing Estimates - We have created a sample estimate based on some usage we see

1. Click on Open in Google Cloud Shell button below.

<a href="https://ssh.cloud.google.com/cloudshell/editor?cloudshell_git_repo=https://github.com/GoogleCloudPlatform/click-to-deploy-solutions&cloudshell_workspace=document-ai&cloudshell_open_in_editor=terraform/terraform.tfvars&cloudshell_tutorial=tutorial.md" target="_new">
<a href="https://ssh.cloud.google.com/cloudshell/editor?cloudshell_git_repo=https://github.com/GoogleCloudPlatform/click-to-deploy-solutions&cloudshell_workspace=document-ai&cloudshell_open_in_editor=infra/terraform.tfvars&cloudshell_tutorial=tutorial.md" target="_new">
<img alt="Open in Cloud Shell" src="https://gstatic.com/cloudssh/images/open-btn.svg">
</a>

Expand Down
39 changes: 31 additions & 8 deletions document-ai/code/main.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from google.cloud import bigquery, documentai_v1beta3, storage
import functions_framework
from google.cloud import documentai
import os
import json


def process_document(bucket_name, object_name):
"""Process a document stored in GCS."""
print("Document processing started.")
Expand All @@ -20,13 +20,15 @@ def process_document(bucket_name, object_name):
# Read the file into memory
with open(file_path, "rb") as image:
image_content = image.read()
# Set the document content in the request
document = {"content": image_content, "mime_type": blob.content_type}

# Configure the process request
processor_name = os.getenv("DOC_AI_PROCESSOR")
processor_name = os.getenv("FORM_PARSER_PROCESSOR")
if not processor_name:
print("Environment variable DOC_AI_PROCESSOR not set")
exit(1)
print("Environment variable FORM_PARSER_PROCESSOR not set")
return

request = {"name": processor_name, "document": document}

# Use the Document AI client to process the request
Expand All @@ -43,10 +45,24 @@ def process_document(bucket_name, object_name):
fieldValue = get_text(form_field.field_value, document)
document_dict[f"{fieldName}"] = fieldValue

# Extract Summary
# Set the document content in the request
document = {"content": image_content, "mime_type": blob.content_type}
print("Summarizing Document")
summary_processor_name = os.getenv("SUMMARY_PROCESSOR")
if not summary_processor_name:
print("Environment variable SUMMARY_PROCESSOR not set")
return

summary_request = {"name": summary_processor_name, "document": document}
summary_result = client.process_document(request=summary_request)
document = summary_result.document
summary_text = document.entities[0].mention_text
print("Document processing complete.")
process_output(bucket_name, object_name, document_text, document_dict)
process_output(bucket_name, object_name, document_text, summary_text, document_dict)



def get_text(doc_element: dict, document: dict):
"""
Document AI identifies form fields by their offsets
Expand All @@ -67,7 +83,7 @@ def get_text(doc_element: dict, document: dict):
return response


def process_output(bucket_name, object_name, document_text, document_dict):
def process_output(bucket_name, object_name, document_text, summary_text, document_dict):
"""Moves a blob from one bucket to another."""
print("Process output started.")
storage_client = storage.Client()
Expand All @@ -80,10 +96,16 @@ def process_output(bucket_name, object_name, document_text, document_dict):
results_text_blob = destination_bucket.blob(results_text_name)
results_text_blob.upload_from_string(document_text)

print("Saving summary results into the output bucket...")
results_summary_name = "{}.summary".format(object_name)
results_summary_blob = destination_bucket.blob(results_summary_name)
results_summary_blob.upload_from_string(summary_text)

print("Saving json results into the output bucket...")
results_json = {
"document_file_name": object_name,
"document_content": document_dict
"document_content": document_dict,
"document_summary": summary_text
}
results_json = json.dumps(results_json)
results_json_name = "{}.json".format(object_name)
Expand All @@ -105,6 +127,7 @@ def process_output(bucket_name, object_name, document_text, document_dict):
schema=[
bigquery.SchemaField("document_file_name", "STRING"),
bigquery.SchemaField("document_content", "JSON"),
bigquery.SchemaField("document_summary", "STRING"),
],
source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
)
Expand Down Expand Up @@ -143,4 +166,4 @@ def trigger_gcs(cloud_event):
print(f"Created: {timeCreated}")
print(f"Updated: {updated}")

process_document(bucket, name)
process_document(bucket, name)
5 changes: 5 additions & 0 deletions document-ai/infra/bq.tf
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,11 @@ resource "google_bigquery_table" "form_parser" {
"name": "document_content",
"type": "JSON",
"mode": "NULLABLE"
},
{
"name": "document_summary",
"type": "STRING",
"mode": "NULLABLE"
}
]
EOF
Expand Down
5 changes: 5 additions & 0 deletions document-ai/infra/doc_ai.tf
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,8 @@ resource "google_document_ai_processor" "form_parser" {
display_name = local.processor_name
type = "FORM_PARSER_PROCESSOR"
}
# Document AI processor of type SUMMARY_PROCESSOR, declared next to the
# form_parser processor. Its display name comes from local.summarizer_name
# ("summary-parser-${var.location}"), and its id is handed to the Cloud
# Function through the SUMMARY_PROCESSOR environment variable.
resource "google_document_ai_processor" "doc_summarizer" {
location = var.location
display_name = local.summarizer_name
type = "SUMMARY_PROCESSOR"
}
3 changes: 2 additions & 1 deletion document-ai/infra/functions.tf
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@ resource "google_cloudfunctions2_function" "function" {
timeout_seconds = 60
service_account_email = google_service_account.doc_ai_form_function.email
environment_variables = {
DOC_AI_PROCESSOR = google_document_ai_processor.form_parser.id
FORM_PARSER_PROCESSOR = google_document_ai_processor.form_parser.id
SUMMARY_PROCESSOR = google_document_ai_processor.doc_summarizer.id
GCS_OUTPUT = google_storage_bucket.doc_output.name
BQ_TABLE_ID = local.bq_table_id
BQ_LOCATION = var.region
Expand Down
1 change: 1 addition & 0 deletions document-ai/infra/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ locals {

function_name = "form-parser"
processor_name = "form-parser-${var.location}"
summarizer_name = "summary-parser-${var.location}"

bq_table_name = "form_parser"
bq_dataset_name = "document_ai"
Expand Down

0 comments on commit 56cee65

Please sign in to comment.