-
Notifications
You must be signed in to change notification settings - Fork 8
/
extract.go
142 lines (125 loc) · 3.01 KB
/
extract.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
package extract
import (
"sort"
)
// Word is a piece of text together with the edges of its bounding box.
type Word struct {
	// Text is the textual content of the word.
	Text string
	// LeftX and RightX are the horizontal edges of the bounding box.
	LeftX, RightX float64
	// TopY and BottomY are the vertical edges of the bounding box.
	TopY, BottomY float64
}
// byXLeft implements sort.Interface, ordering words by ascending LeftX.
type byXLeft []Word

// Len reports the number of words being sorted.
func (ws byXLeft) Len() int { return len(ws) }

// Swap exchanges the words at positions a and b.
func (ws byXLeft) Swap(a, b int) {
	tmp := ws[a]
	ws[a] = ws[b]
	ws[b] = tmp
}

// Less reports whether the word at a starts strictly left of the word at b.
func (ws byXLeft) Less(a, b int) bool {
	return ws[b].LeftX > ws[a].LeftX
}
// byRow implements sort.Interface, ordering words row by row and, within a
// row, from left to right.
//
// NOTE(review): two words count as "same row" whenever neither lies entirely
// above the other. That relation is not transitive for chains of partially
// overlapping boxes, so such input may not give sort.Sort a consistent
// ordering — confirm against real data.
type byRow []Word

// Len reports the number of words being sorted.
func (ws byRow) Len() int { return len(ws) }

// Swap exchanges the words at positions a and b.
func (ws byRow) Swap(a, b int) {
	tmp := ws[a]
	ws[a] = ws[b]
	ws[b] = tmp
}

// Less reports whether the word at a comes before the word at b: either it
// sits on an earlier row, or both share a row and it starts further left.
func (ws byRow) Less(a, b int) bool {
	first, second := ws[a], ws[b]
	switch {
	case first.BottomY < second.TopY:
		// first ends before second begins vertically: earlier row.
		return true
	case first.TopY > second.BottomY:
		// first begins after second ends vertically: later row.
		return false
	default:
		// Vertical extents overlap: same row, order by left edge.
		return first.LeftX < second.LeftX
	}
}
// BySize implements sort.Interface for a list of [start, end] intervals,
// ordering them by ascending length (end minus start).
type BySize [][2]float64

// Len reports the number of intervals being sorted.
func (iv BySize) Len() int { return len(iv) }

// Swap exchanges the intervals at positions a and b.
func (iv BySize) Swap(a, b int) {
	tmp := iv[a]
	iv[a] = iv[b]
	iv[b] = tmp
}

// Less reports whether the interval at a is strictly shorter than the one at b.
func (iv BySize) Less(a, b int) bool {
	lenA := iv[a][1] - iv[a][0]
	lenB := iv[b][1] - iv[b][0]
	return lenA < lenB
}
// FindSplits returns the x positions at which a set of words can be split
// into columns: the midpoint of every horizontal gap that no word covers.
//
// The words are sorted in place by LeftX (the caller's slice is reordered)
// and then swept left to right while tracking the rightmost edge seen so far.
// Each time a word starts beyond that edge, the midpoint of the gap between
// the edge and the word's left side is recorded. The space to the left of the
// first word is not treated as a gap.
//
// The result is sorted in ascending x order, is never nil, and is empty when
// there are fewer than two words or the words leave no gap.
func FindSplits(words []Word) []float64 {
	splitAt := make([]float64, 0)
	if len(words) == 0 {
		return splitAt
	}

	sort.Sort(byXLeft(words))

	// Seed the sweep with the first word's right edge rather than 0 so that
	// gaps are measured between words even when coordinates are negative.
	xRight := words[0].RightX
	for _, w := range words[1:] {
		if w.LeftX > xRight {
			// Nothing seen so far reaches this word: split in the middle of
			// the uncovered interval [xRight, w.LeftX].
			splitAt = append(splitAt, xRight+(w.LeftX-xRight)/2)
		}
		if w.RightX > xRight {
			xRight = w.RightX
		}
	}

	// Callers walk the splits from left to right, so return them by position.
	sort.Float64s(splitAt)
	return splitAt
}
// SplitRowBoxesMidpoint partitions words at the split positions xs, placing
// each word according to the horizontal midpoint of its bounding box. The
// partitioning itself is delegated to SplitRowBoxesFunc.
func SplitRowBoxesMidpoint(words []Word, xs []float64) [][]Word {
	return SplitRowBoxesFunc(words, xs, func(w Word) float64 {
		return w.LeftX + (w.RightX-w.LeftX)/2
	})
}
// SplitRowBoxesEdge partitions words at the split positions xs, placing each
// word according to the left edge of its bounding box. The partitioning
// itself is delegated to SplitRowBoxesFunc.
func SplitRowBoxesEdge(words []Word, xs []float64) [][]Word {
	return SplitRowBoxesFunc(words, xs, func(w Word) float64 {
		return w.LeftX
	})
}
// SplitRowBoxesFunc distributes words into len(xs)+1 partitions separated by
// the split positions xs. f maps a word to the x coordinate used to place it;
// a word belongs to the first partition whose split is not smaller than that
// coordinate, or to the last partition when it lies beyond every split.
//
// The words are sorted in place by LeftX (the caller's slice is reordered)
// and the partition cursor only ever moves right, so xs is expected to be in
// ascending order. Every partition in the result is non-nil, including the
// empty ones.
func SplitRowBoxesFunc(words []Word, xs []float64, f func(Word) float64) [][]Word {
	sort.Sort(byXLeft(words))

	partitions := make([][]Word, len(xs)+1)
	for i := range partitions {
		partitions[i] = make([]Word, 0)
	}

	i := 0
	for _, word := range words {
		key := f(word)
		// Advance past every split the word lies beyond, not just one:
		// a word may jump over one or more empty partitions.
		for i < len(xs) && key > xs[i] {
			i++
		}
		partitions[i] = append(partitions[i], word)
	}
	return partitions
}
// PartitionIntoRows groups words into rows, ordered top to bottom with the
// words of each row ordered left to right.
//
// The words are sorted in place with byRow (the caller's slice is reordered)
// and then scanned once: a word whose LeftX is smaller than that of the word
// before it starts a new row. Rows are assumed not to overlap vertically,
// i.e. the largest BottomY of one row is smaller than the smallest TopY of
// the next.
//
// NOTE(review): a row that begins at or to the right of the previous word's
// left edge (for example two consecutive single-word rows at the same x) is
// not detected by this scan and is merged into the previous row.
//
// An empty input yields an empty, non-nil result.
func PartitionIntoRows(words []Word) [][]Word {
	partitions := make([][]Word, 0)
	if len(words) == 0 {
		return partitions
	}

	sort.Sort(byRow(words))

	partitions = append(partitions, []Word{words[0]})
	// Compare the second word against the first word's left edge, not 0;
	// otherwise a single-word first row is never separated from the next row.
	prevX := words[0].LeftX
	for _, w := range words[1:] {
		if w.LeftX < prevX {
			// x moved back to the left: the scan wrapped onto a new row.
			partitions = append(partitions, make([]Word, 0))
		}
		last := len(partitions) - 1
		partitions[last] = append(partitions[last], w)
		prevX = w.LeftX
	}
	return partitions
}