Skip to content

Commit

Permalink
Store detected boxes JSON in DynamoDB as well
Browse files Browse the repository at this point in the history
  • Loading branch information
vegarsti committed Feb 15, 2024
1 parent b191371 commit a5137dc
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 31 deletions.
59 changes: 42 additions & 17 deletions cmd/cli/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,15 @@ import (
"crypto/sha256"
"encoding/json"
"fmt"
"log"
"os"
"strings"
"text/tabwriter"

"github.com/vegarsti/extract"
"github.com/vegarsti/extract/box"
"github.com/vegarsti/extract/dynamodb"
"github.com/vegarsti/extract/image"
"github.com/vegarsti/extract/textract"
)

Expand Down Expand Up @@ -43,39 +46,61 @@ func main() {

// Check if table is stored
checksum := fmt.Sprintf("%x", sha256.Sum256(imageBytes))
fmt.Println(checksum)
storedBytes, err := dynamodb.GetTable(checksum)
if err != nil {
die(err)
}
if storedBytes != nil {
var table [][]string
json.Unmarshal(storedBytes, &table)
writeTable(table)
return
}
// fmt.Println(checksum)
// storedBytes, err := dynamodb.GetTable(checksum)
// if err != nil {
// die(err)
// }
// if storedBytes != nil {
// var table [][]string
// json.Unmarshal(storedBytes, &table)
// writeTable(table)
// return
// }

file := &extract.File{
Bytes: imageBytes,
ContentType: contentType,
}

output, err := textract.AnalyzeDocument(file)
// Don't use Textract's Analyze Document, use OCR and custom algorithm instead
output, err := textract.DetectDocumentText(file)
if err != nil {
die(err)
die(fmt.Errorf("textract text detection failed: %w", err))
}
table, err := textract.ToTableFromDetectedTable(output)
boxes, err := textract.ToLinesFromOCR(output)
if err != nil {
die(err)
die(fmt.Errorf("failed to convert to boxes: %w", err))
}
rows, table := box.ToTable(boxes)

// Add boxes
if contentType == extract.PNG {
newEncodedImage, err := image.AddBoxes(file.Bytes, boxes)
if err != nil {
log.Printf("add boxes to image 1 failed: %v", err)
} else {
rowsFlattened := make([]box.Box, 0)
for _, row := range rows {
rowsFlattened = append(rowsFlattened, row...)
}
newEncodedImage2, err := image.AddBoxes(file.Bytes, rowsFlattened)
if err != nil {
log.Printf("add boxes to image 2 failed: %v", err)
file.BytesWithBoxes = []byte(newEncodedImage)
file.BytesWithRowBoxes = []byte(newEncodedImage2)
}
fmt.Println("hello")
}
}

writeTable(table)

// Store the table in DynamoDB
tableJSON, err := json.Marshal(table)
if err != nil {
die(err)
}
if err := dynamodb.PutTable(checksum[:], tableJSON); err != nil {
if err := dynamodb.PutTable(checksum[:], tableJSON, []byte{}); err != nil {
die(err)
}
}
Expand Down
32 changes: 19 additions & 13 deletions cmd/lambda/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -160,25 +160,27 @@ func getTable(file *extract.File) ([][]string, error) {
}
rows, table := box.ToTable(boxes)

// Add boxes
if file.ContentType == extract.PNG {
newEncodedImage, err := image.AddBoxes(file.Bytes, boxes)
if err != nil {
log.Printf("add boxes to image 1 failed: %v", err)
} else {
// Create images with words and cells
go func() {
if file.ContentType == extract.PNG {
imageWithWords, err := image.AddBoxes(file.Bytes, boxes)
if err != nil {
log.Printf("add word boxes to image failed: %v", err)
return
}
rowsFlattened := make([]box.Box, 0)
for _, row := range rows {
rowsFlattened = append(rowsFlattened, row...)
}
newEncodedImage2, err := image.AddBoxes(file.Bytes, rowsFlattened)
imageWithCells, err := image.AddBoxes(file.Bytes, rowsFlattened)
if err != nil {
log.Printf("add boxes to image 2 failed: %v", err)
} else {
file.BytesWithBoxes = []byte(newEncodedImage)
file.BytesWithRowBoxes = []byte(newEncodedImage2)
log.Printf("add cell boxes to image failed: %v", err)
return
}
file.BytesWithBoxes = []byte(imageWithWords)
file.BytesWithRowBoxes = []byte(imageWithCells)
}
}
}()

log.Printf("ocr-to-table: %s", time.Since(startAlgorithm).String())
if err != nil {
Expand Down Expand Up @@ -223,7 +225,11 @@ func getTable(file *extract.File) ([][]string, error) {
})
g.Go(func() error {
startPut := time.Now()
if err := dynamodb.PutTable(file.Checksum, tableBytes); err != nil {
boxesJSON, err := json.Marshal(boxes)
if err != nil {
return fmt.Errorf("failed to convert boxes to json: %w", err)
}
if err := dynamodb.PutTable(file.Checksum, tableBytes, boxesJSON); err != nil {
return fmt.Errorf("dynamodb.PutTable: %w", err)
}
log.Printf("dynamodb put: %s", time.Since(startPut).String())
Expand Down
3 changes: 2 additions & 1 deletion dynamodb/dynamodb.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ func CreateTable(sess *session.Session) error {
return nil
}

func PutTable(checksum string, table []byte) error {
func PutTable(checksum string, table []byte, boxesJSON []byte) error {
sess, err := session.NewSession()
if err != nil {
return fmt.Errorf("unable to create session: %w", err)
Expand All @@ -46,6 +46,7 @@ func PutTable(checksum string, table []byte) error {
// Old: used table detection directly; new: uses a custom algorithm.
// "JSONTable": {B: table},
"JSONTableCustomDetection": {B: table},
"JSONBoxes": {B: boxesJSON},
},
TableName: aws.String("Tables"),
}
Expand Down

0 comments on commit a5137dc

Please sign in to comment.