-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstage1-preprocessing.go
188 lines (150 loc) · 5.46 KB
/
stage1-preprocessing.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
package simdcsv
import (
"log"
"math/bits"
"reflect"
"unsafe"
)
// stage1Input bundles the bitmasks and the running quote state that
// preprocessMasks reads (and partially updates) for one 64-bit mask word.
// NOTE(review): each bit presumably corresponds to one byte of a 64-byte
// chunk of the CSV buffer — confirm against the code that builds the masks.
type stage1Input struct {
// quote bitmask for the current word; preprocessMasks scans its set bits
// from lowest to highest position.
quoteMaskIn uint64
// separator bitmask for the current word.
separatorMaskIn uint64
// carriage return bitmask for the current word.
carriageReturnMaskIn uint64
// quote bitmask for the following word; only bit 0 is examined, and it is
// cleared by preprocessMasks when it pairs with a quote at bit 63 of
// quoteMaskIn while inside a quoted field.
quoteMaskInNext uint64
// quoted is zero outside a quoted field and non-zero inside one;
// preprocessMasks flips it with a bitwise complement on each opening or
// closing quote.
quoted uint64
// newline bitmask for the current word; consulted to see whether a carriage
// return at position p is followed by a newline at position p+1.
newlineMaskIn uint64
// newline bitmask for the following word; only bit 0 is examined, for a
// carriage return at bit 63.
newlineMaskInNext uint64
}
// stage1Output receives the filtered masks written by preprocessMasks.
type stage1Output struct {
// quoteMaskIn with the bits of escaped (doubled) quotes removed.
quoteMaskOut uint64
// separatorMaskIn with separators that fall inside a quoted field removed.
separatorMaskOut uint64
// carriageReturnMaskIn with carriage returns removed when they fall inside
// a quoted field or are not immediately followed by a newline.
carriageReturnMaskOut uint64
// set to 1 by preprocessMasks when an escaped quote or a carriage return
// inside a quoted field was seen, 0 otherwise.
needsPostProcessing uint64
}
// preprocessMasks walks the set bits of the quote, separator and carriage
// return masks in input from the lowest to the highest position and writes
// filtered copies of those masks to output.
//
// While scanning it maintains input.quoted (zero outside a quoted field,
// non-zero inside) and treats each bit accordingly:
//   - an opening or closing quote flips input.quoted and stays in quoteMaskOut;
//   - two adjacent quote bits seen while quoted form an escaped quote and are
//     both dropped from quoteMaskOut; for a quote at bit 63 the partner is
//     bit 0 of input.quoteMaskInNext, which is cleared in input instead;
//   - a separator seen while quoted is dropped from separatorMaskOut;
//   - a carriage return is dropped from carriageReturnMaskOut when it is seen
//     while quoted, or when the following position is not set in the newline
//     mask (bit 0 of input.newlineMaskInNext for a carriage return at bit 63).
//
// output.needsPostProcessing ends up as 1 if an escaped quote or a carriage
// return inside a quoted field was encountered and as 0 otherwise.
// input.quoted and input.quoteMaskInNext are updated in place.
func preprocessMasks(input *stage1Input, output *stage1Output) {
	// ANDing a mask with dropThrough<<p clears bit p and every bit below it.
	const dropThrough = ^uint64(1)

	// Working copies: handled bits are stripped from these as the scan advances.
	quotes := input.quoteMaskIn
	seps := input.separatorMaskIn
	crs := input.carriageReturnMaskIn

	// The outputs start out as plain copies and only ever lose bits.
	output.quoteMaskOut = quotes
	output.separatorMaskOut = seps
	output.carriageReturnMaskOut = crs
	output.needsPostProcessing = 0

	// Position of the next pending bit per mask (64 once a mask is exhausted).
	qPos := bits.TrailingZeros64(quotes)
	sPos := bits.TrailingZeros64(seps)
	cPos := bits.TrailingZeros64(crs)

	for {
		switch {
		case qPos < sPos && qPos < cPos:
			bit := uint64(1) << qPos
			switch {
			case input.quoted != 0 && qPos == 63 && input.quoteMaskInNext&1 == 1:
				// Escaped quote straddling this word and the next one.
				quotes &= dropThrough << qPos
				output.quoteMaskOut &^= bit
				output.needsPostProcessing = 1
				input.quoteMaskInNext &^= 1
			case input.quoted != 0 && quotes&(bit<<1) != 0:
				// Escaped quote: two adjacent quote bits inside a quoted field.
				quotes &= dropThrough << (qPos + 1)
				output.quoteMaskOut &^= bit | bit<<1
				output.needsPostProcessing = 1
			default:
				// Ordinary opening or closing quote.
				input.quoted = ^input.quoted
				quotes &= dropThrough << qPos
			}
			qPos = bits.TrailingZeros64(quotes)

		case sPos < qPos && sPos < cPos:
			if input.quoted != 0 {
				// A separator inside a quoted field is plain data.
				output.separatorMaskOut &^= uint64(1) << sPos
			}
			seps &= dropThrough << sPos
			sPos = bits.TrailingZeros64(seps)

		case cPos < qPos && cPos < sPos:
			bit := uint64(1) << cPos
			if input.quoted != 0 {
				// A carriage return inside a quoted field is data, but the
				// field has to be revisited during post-processing.
				output.carriageReturnMaskOut &^= bit
				output.needsPostProcessing = 1
			} else {
				var newlineFollows bool
				if cPos == 63 {
					// The following position lives in the next mask word.
					newlineFollows = input.newlineMaskInNext&1 != 0
				} else {
					newlineFollows = input.newlineMaskIn&(bit<<1) != 0
				}
				if !newlineFollows {
					// Carriage return without a following newline: drop it.
					output.carriageReturnMaskOut &^= bit
				}
			}
			crs &= dropThrough << cPos
			cPos = bits.TrailingZeros64(crs)

		default:
			// No mask has a strictly lowest pending bit left: scan finished.
			return
		}
	}
}
// postProcRow is a range of row indices, as produced by getPostProcRows,
// that needs post-processing.
type postProcRow struct {
// index of the first row in the range
start int
// row index at which getPostProcRows stopped scanning for this range
// NOTE(review): looks like an exclusive upper bound — confirm with consumers.
end int
}
// getPostProcRows determines which ranges of rows need post-processing, i.e.
// where "" has to be replaced by " or \r\n by \n inside fields.
//
// Each distinct value in postProc is treated as an offset into buf that opens
// a 64-byte window. A row is located by the distance between the data pointer
// of its first field and the data pointer of buf.
// NOTE(review): this assumes the strings in simdrecords alias buf and that
// every row has at least one field — confirm with the caller.
//
// For every window the resulting range starts one row before the first row
// that begins at or after the offset (or at row 0) and ends at the first row
// that begins at or after offset+64. Rows are scanned with a single forward
// cursor. Ranges that touch or overlap are merged before being returned.
func getPostProcRows(buf []byte, postProc []uint64, simdrecords [][]string) []postProcRow {
	// TODO: Crude implementation, make more refined/granular

	base := (*reflect.SliceHeader)(unsafe.Pointer(&buf)).Data

	// rowOffset reports how far the first field of row r starts past base.
	rowOffset := func(r int) uint64 {
		first := simdrecords[r][0]
		return uint64((*reflect.StringHeader)(unsafe.Pointer(&first)).Data - base)
	}

	total := len(simdrecords)
	ranges := make([]postProcRow, 0, 128)
	cursor := 0
	lastIdx := len(postProc) - 1

	for i, offset := range postProc {
		if i < lastIdx && postProc[i+1] == offset {
			// Repeated offset: only its last occurrence is processed.
			continue
		}

		// Advance to the first row that begins at or after the offset.
		for cursor < total && rowOffset(cursor) < offset {
			cursor++
		}
		first := 0
		if cursor > 0 {
			// Step back one row so the row containing the offset is included.
			first = cursor - 1
		}

		// Advance past every row that begins inside the 64-byte window.
		limit := offset + 64
		for cursor < total && rowOffset(cursor) < limit {
			cursor++
		}

		ranges = append(ranges, postProcRow{start: first, end: cursor})
	}

	if len(ranges) <= 1 {
		return ranges
	}

	// Collapse ranges that touch or overlap into a single range.
	merged := make([]postProcRow, 0, len(ranges))
	current := ranges[0]
	for _, next := range ranges[1:] {
		if current.end < next.start {
			merged = append(merged, current)
			current = next
			continue
		}
		current.end = next.end
	}
	return append(merged, current)
}
// diffBitmask renders two equally long strings on consecutive lines followed
// by a marker line that carries '^' where the strings differ and ' ' where
// they agree. Every one of the three lines is terminated by a newline.
//
// The process is terminated via log.Fatalf when the lengths differ.
func diffBitmask(diff1, diff2 string) (diff string) {
	if len(diff1) != len(diff2) {
		log.Fatalf("sizes don't match")
	}

	// Collect the markers in a pre-sized buffer rather than growing a string
	// with += on every iteration, which is quadratic in the input length.
	markers := make([]byte, 0, len(diff1))
	for i := range diff1 {
		if diff1[i] != diff2[i] {
			markers = append(markers, '^')
		} else {
			markers = append(markers, ' ')
		}
	}

	return diff1 + "\n" + diff2 + "\n" + string(markers) + "\n"
}