-
Notifications
You must be signed in to change notification settings - Fork 0
/
deduplicate-csv.scm
executable file
·44 lines (39 loc) · 1.35 KB
/
deduplicate-csv.scm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#!/bin/sh
# -*- scheme -*-
exec guile -e main -s "$0" "$@"
!#
;; Remove duplicate entries from the csv file (these are due to
;; downloading multiple versions of the same ID).
(use-modules (ice-9 rdelim) ; for read-line
(ice-9 i18n)
(srfi srfi-1) ; first, second, third
)
;; Copy the semicolon-separated file INFILE to OUTFILE, keeping only the
;; first row seen for every (first column, second column) pair.
;;
;; The first line of INFILE is treated as a header and copied verbatim.
;; An empty INFILE produces an empty OUTFILE.  Rows with fewer than two
;; columns (e.g. blank lines) are deduplicated on their whole text
;; instead of raising an error.  Both ports are closed when the copy is
;; finished.  Returns #t.
(define (deduplicate infile outfile)
  (define known (make-hash-table))
  ;; Build the lookup key for LINE.  The two fields are joined with the
  ;; column separator: since neither field can contain #\; after the
  ;; split, "ab" + "c" and "a" + "bc" can no longer collide.
  (define (row-key line)
    (let ((columns (string-split line #\;)))
      (if (or (null? columns) (null? (cdr columns)))
          ;; Too few columns: such a line contains no #\;, so using it
          ;; as the key cannot clash with any two-column key.
          line
          (string-append (first columns) ";" (second columns)))))
  (call-with-input-file infile
    (lambda (inport)
      (call-with-output-file outfile
        (lambda (outport)
          (let ((header (read-line inport)))
            ;; Guard against an empty file: never write the EOF object.
            (unless (eof-object? header)
              (display header outport)
              (newline outport)
              (let copy-dedup ((line (read-line inport)))
                (unless (eof-object? line)
                  (let ((key (row-key line)))
                    (unless (hash-ref known key)
                      (hash-set! known key #t)
                      (display line outport)
                      (newline outport)))
                  (copy-dedup (read-line inport))))))
          #t)))))
;; Script entry point.  ARGS is the full command line, program name
;; first.  The first user argument names the input file (default
;; "trust.csv"), the second the output file (default
;; "trust-deduplicated.csv").
(define (main args)
  (let* ((user-args (cdr args))     ; drop the program name
         (infile (if (pair? user-args)
                     (car user-args)
                     "trust.csv"))
         (outfile (if (and (pair? user-args) (pair? (cdr user-args)))
                      (cadr user-args)
                      "trust-deduplicated.csv")))
    (deduplicate infile outfile)))