forked from dbmcclain/regex
-
Notifications
You must be signed in to change notification settings - Fork 0
/
fast-csv.lisp
296 lines (269 loc) · 10.3 KB
/
fast-csv.lisp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
;; fast-csv.lisp -- Fast, Simple, CSV File Reading
;;
;; This version is very fast because it does the minimum possible work
;; in reading CSV files, and extracting information on demand.
;;
;; It is assumed that no fields are quoted fields that contain the primary delimiter.
;; Primary delimiters (comma for CSV, tab for TSV files) are heuristically determined
;; from examination of the first line of the file, which is assumed to be the headings
;; line. Subsequent lines are assumed to contain data. Empty lines are discarded.
;;
;; These conditions are satisfied by nearly 99%+ of all the CSV files ever seen.
;; If your CSV/TSV file violates any of these conditions, then use the more elaborate
;; and general-purpose routines in CSV.LISP.
;;
;; DM/RAL 06/07
;; ----------------------------------------------------------------------------
(in-package fast-csv)
(defun collect-delimiter-positions (s delim &key count (start 0))
(loop with pos = start
with end = start
while (and end
(or (null count)
(plusp count)))
do (setq end (position delim s :start pos))
collect (or end (length s))
when end
do (setq pos (1+ end))
when count
do (decf count)
))
(defun collect-headers (s delim)
(let ((pos (collect-delimiter-positions s delim)))
(loop with start = 0
for p in pos
collect (subseq s start p)
do (setq start (1+ p))
)))
(defun read-lines (&optional fname)
;;
;; Return a list of text lines from the file
;;
(lw:when-let (fnm (or fname
(capi:prompt-for-file
"Select File to Read"
:filters '("CSV Files" "*.csv;*.tsv"
"All Files" "*.*"))
))
(with-open-file (fp fnm)
(loop for line = (read-line fp nil :eof)
until (eq line :eof)
while line
when (plusp (length line)) ;; elide empty lines
collect line))
))
(defun decide-csv-or-tsv (line)
;;
;; count the number of commas and tabs in the list of lines.
;; If commas outnumber tabs then we probably have CSV. Else we
;; probably have TSV.
;;
(let ((n-comma (count-if (um:curry #'char= #\,) line))
(n-tab (count-if (um:curry #'char= #\Tab) line)))
(if (> n-comma n-tab)
#\,
#\Tab)
))
(defstruct csv-info
hdrs
delim
row-infos
nlines
lines
ends-cache)
(defun collect-additional-ends (info column row line)
(with-accessors ((ends-cache csv-info-ends-cache)
(row-infos csv-info-row-infos )
(delim csv-info-delim )) info
(let* ((ends (aref row-infos row))
(new-ends (if ends
;; we have some partial information
(let* ((nends (length ends))
(more-ends (collect-delimiter-positions
line delim
:count (1+ (- column nends))
:start (1+ (aref ends (1- nends)))
)))
(concatenate 'vector ends more-ends))
;; else we never had any information -- build from scratch
(coerce
(collect-delimiter-positions
line delim
:count (1+ column)
:start 0)
'vector)))
(unique-ends (gethash new-ends ends-cache nil)))
(unless unique-ends
(setf (gethash new-ends ends-cache) new-ends))
(setf (aref row-infos row) (or unique-ends new-ends))
)))
(defun ensure-column-of-row-exists (info column row line)
;; at this point row-info cannot be nil, but its contents might be...
(with-accessors ((row-infos csv-info-row-infos)) info
(let ((ends (aref row-infos row)))
(unless (< column (length ends))
(collect-additional-ends info column row line))
)))
(defun extract-data-of-column (info column reverse op)
(with-accessors ((nlines csv-info-nlines)
(lines csv-info-lines)
(row-infos csv-info-row-infos)) info
(let* ((nelm1 (1- nlines))
(data (make-array nlines)))
(loop for line in lines
for row from 0
for revrow = (- nelm1 row)
do
(ensure-column-of-row-exists info column row line)
;; at this point ends is a vector of delimiter positions
;; that is at least as long as one greater than the column index
(let* ((ends (aref row-infos row))
(start (if (zerop column)
0
(1+ (aref ends (1- column)))
))
(end (aref ends column))
(datum (funcall op line start end)))
(setf (aref data (if reverse revrow row)) datum)
))
data
)))
(defun get-column-of-hdr (hdr info)
(position (um:mkstr hdr) (csv-info-hdrs info)
:test #'string-equal))
(defun ensure-row-infos-exist (info column all)
(with-accessors ((row-infos csv-info-row-infos)
(nlines csv-info-nlines )
(lines csv-info-lines )) info
(unless row-infos
(setf row-infos (make-array nlines))
(when all
(loop for row from 0
for line in lines
do
(collect-additional-ends info column row line)))
)))
(defun get-numeric-field-op (pre-op)
(if pre-op
;; we need to extract a subseq for the user's function
(lambda (line start end)
(read-from-string
(funcall pre-op (subseq line start end))
nil 0))
;; else we can avoid creating a subseq
(lambda (line start end)
(read-from-string line nil 0
:start start :end end))
))
(defun get-string-field-op (pre-op post-op numeric-p)
(if numeric-p
(let ((op (get-numeric-field-op pre-op)))
(if post-op
(um:compose post-op op)
op))
(if pre-op
(if post-op
(um:compose post-op pre-op #'subseq)
(um:compose pre-op #'subseq))
(if post-op
(um:compose post-op #'subseq)
#'subseq))
))
;; --------------------------------------------------------------
;; user accessible routines...
;;
(defun read-file (fname
&key
(ndrop 0)
&allow-other-keys)
(let* ((lines (nthcdr ndrop (read-lines fname)))
(delim (decide-csv-or-tsv (first lines)))
(hdrs (collect-headers (first lines) delim))
(data-lines (rest lines)))
(make-csv-info
:hdrs hdrs
:delim delim
:row-infos nil ;; a vector of vectors of delimiter positions
:nlines (length data-lines)
:lines data-lines
:ends-cache (make-hash-table :test #'equalp)
)))
(defun get-column (hdr info
&key
reverse
scrubber
(pre-op scrubber)
post-op
numeric-p
&allow-other-keys)
;; user should utilize keys :reverse and :op
;; the :num-op key is for our private use
(let* ((column (get-column-of-hdr hdr info)))
(when column
(ensure-row-infos-exist info column :all)
(extract-data-of-column info column reverse
(get-string-field-op pre-op post-op numeric-p))
)))
(defun get-numeric-column (hdr info
&key
reverse
scrubber
(pre-op scrubber)
post-op
&allow-other-keys)
(get-column hdr info
:reverse reverse
:numeric-p t
:pre-op pre-op
:post-op post-op))
(defun get-field (hdr row info
&key
reverse
scrubber
(pre-op scrubber)
post-op
numeric-p
&allow-other-keys)
;; user should utilize keys :reverse and :op
;; the :num-op key is for our private use
(with-accessors ((lines csv-info-lines )
(row-infos csv-info-row-infos)
(nlines csv-info-nlines )) info
(let ((column (get-column-of-hdr hdr info)))
(when column
(ensure-row-infos-exist info column nil)
(let* ((actual-row (cond ((numberp row) (if reverse
(- nlines row 1)
row))
((eq row :last) (if reverse
0
(1- nlines)))
((eq row :first) (if reverse
(1- nlines)
0))
))
(line (nth actual-row lines)))
(ensure-column-of-row-exists info column actual-row line)
(let* ((ends (aref row-infos actual-row))
(start (if (zerop column)
0
(1+ (aref ends (1- column)))
))
(end (aref ends column)))
(funcall (get-string-field-op pre-op post-op numeric-p)
line start end)
))
))
))
(defun get-numeric-field (hdr row info
&key
reverse
scrubber
(pre-op scrubber)
post-op
&allow-other-keys)
(get-field hdr row info
:numeric-p t
:pre-op pre-op
:post-op post-op
:reverse reverse))