-
Notifications
You must be signed in to change notification settings - Fork 0
/
syncproc.py
104 lines (79 loc) · 3.16 KB
/
syncproc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import boto3
from decimal import Decimal
import json
import os
from helper import AwsHelper, S3Helper, DynamoDBHelper
from og import OutputGenerator
import datastore
def callTextract(bucketName, objectName, detectText, detectForms, detectTables):
    """Send one S3-hosted document to Textract and return the raw response.

    Uses ``detect_document_text`` when neither forms nor tables are requested;
    otherwise uses ``analyze_document`` with the requested feature types.

    Args:
        bucketName: Name of the S3 bucket that holds the document.
        objectName: S3 object name (key) of the document.
        detectText: Accepted for interface compatibility; not consulted here.
        detectForms: When truthy, "FORMS" is added to the feature types.
        detectTables: When truthy, "TABLES" is added to the feature types.

    Returns:
        Whatever the selected Textract client call returns.
    """
    client = AwsHelper().getClient('textract')

    # Both API calls take the same document locator, so build it once.
    s3Document = {'S3Object': {'Bucket': bucketName, 'Name': objectName}}

    if not (detectForms or detectTables):
        return client.detect_document_text(Document=s3Document)

    # Keep the TABLES-then-FORMS ordering of the feature list.
    requested = (("TABLES", detectTables), ("FORMS", detectForms))
    featureTypes = [label for label, wanted in requested if wanted]
    return client.analyze_document(Document=s3Document, FeatureTypes=featureTypes)
def processImage(documentId, features, bucketName, objectName, outputTableName, documentsTableName):
    """Run Textract on one document, generate its output, and mark it complete.

    Args:
        documentId: Identifier of the document being processed.
        features: Container checked for the members "Text", "Forms" and
            "Tables" to decide what to detect.
        bucketName: Name of the S3 bucket that holds the document.
        objectName: S3 object name (key) of the document.
        outputTableName: Name of the DynamoDB table handed to OutputGenerator.
        documentsTableName: Name of the documents table passed to
            datastore.DocumentStore.
    """
    wantsText, wantsForms, wantsTables = (
        kind in features for kind in ("Text", "Forms", "Tables")
    )

    textractResponse = callTextract(bucketName, objectName, wantsText, wantsForms, wantsTables)

    outputTable = AwsHelper().getResource("dynamodb").Table(outputTableName)

    print("Generating output for DocumentId: {}".format(documentId))

    OutputGenerator(
        documentId,
        textractResponse,
        bucketName,
        objectName,
        wantsForms,
        wantsTables,
        outputTable,
    ).run()

    print("DocumentId: {}".format(documentId))

    # Only flag the document as complete after output generation has run.
    datastore.DocumentStore(documentsTableName, outputTableName).markDocumentComplete(documentId)
# --------------- Main handler ------------------
def processRequest(request):
    """Validate a processing request and, if complete, process the document.

    Args:
        request: Mapping with the keys 'bucketName', 'objectName',
            'features', 'documentId', 'outputTable' and 'documentsTable'.

    Returns:
        A dict with 'statusCode' 200 and a 'body' string. The body is a
        summary of the processed document, or an empty string when the
        request was skipped because documentId, bucketName, objectName or
        features was falsy.

    Raises:
        KeyError: If any of the required keys is missing from ``request``.
    """
    print("request: {}".format(request))

    bucketName = request['bucketName']
    objectName = request['objectName']
    features = request['features']
    documentId = request['documentId']
    outputTable = request['outputTable']
    # Read once; the previous duplicate lookup of this key was redundant.
    documentsTable = request['documentsTable']

    output = ""

    # NOTE: an incomplete request is skipped silently and still reported as
    # status 200 with an empty body; this behavior is kept for callers.
    if documentId and bucketName and objectName and features:
        print("DocumentId: {}, features: {}, Object: {}/{}".format(documentId, features, bucketName, objectName))

        processImage(documentId, features, bucketName, objectName, outputTable, documentsTable)

        output = "Document: {}, features: {}, Object: {}/{} processed.".format(documentId, features, bucketName, objectName)
        print(output)

    return {
        'statusCode': 200,
        'body': output
    }
def lambda_handler(event, context):
    """Lambda entry point: decode the first record and process its document.

    The JSON body of ``event['Records'][0]`` supplies 'documentId',
    'bucketName', 'objectName' and 'features'; the table names come from the
    OUTPUT_TABLE and DOCUMENTS_TABLE environment variables. Only the first
    record of the event is handled.

    Args:
        event: Invocation event containing a 'Records' list whose entries
            carry a JSON string under 'body'.
        context: Lambda context object; not used.

    Returns:
        The result of processRequest for the assembled request.
    """
    print("event: {}".format(event))

    firstRecord = event['Records'][0]
    message = json.loads(firstRecord['body'])
    print("Message: {}".format(message))

    # Copy the message fields first, then the table names, so the request
    # keeps the same key order when it is printed downstream.
    messageFields = ('documentId', 'bucketName', 'objectName', 'features')
    request = {field: message[field] for field in messageFields}
    request["outputTable"] = os.environ['OUTPUT_TABLE']
    request["documentsTable"] = os.environ['DOCUMENTS_TABLE']

    return processRequest(request)