-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlexer.lisp
121 lines (113 loc) · 4.19 KB
/
lexer.lisp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
(in-package :aoc2020)
(define-parse-tree-synonym int
(:register
(:sequence
(:greedy-repetition 0 1 (:char-class #\- #\+))
(:greedy-repetition 1 nil :digit-class))))
(defun %int (d)
`(:register
(:sequence
(:greedy-repetition ,d ,d :digit-class))))
(define-parse-tree-synonym letter
(:register :word-char-class))
(define-parse-tree-synonym word
(:register
(:sequence
:word-boundary
(:greedy-repetition 1 nil :word-char-class)
:word-boundary)))
(defun %word (d)
`(:register
(:sequence
:word-boundary
(:greedy-repetition 1 ,d :word-char-class))))
(defun map-tokens (callback stream &key (sharedp nil) (debugp nil))
(declare (type (function (t)) callback)
(type stream stream))
(let ((buffer (make-array 256
:fill-pointer 0
:element-type 'character
:adjustable t))
(length-param 0)
(state-fn #'values)
(buffer-copy (if sharedp #'identity #'copy-seq)))
(declare (type function state-fn)
(type string buffer)
(type function buffer-copy))
;; helper functions
(labels ((feed-fsm (c)
(when debugp
(print `(:feed ,state-fn ,c) *trace-output*))
(funcall state-fn c))
(switch-to (s &optional c)
(setf state-fn s)
(when c
(feed-fsm c)))
(clear ()
(setf length-param 0)
(setf (fill-pointer buffer) 0))
(buffer (c)
(vector-push-extend c buffer (array-total-size buffer)))
(emit (token-type &rest components)
(apply callback token-type components))
(finish-literal-token ()
(when (> (length buffer) 0)
(emit :literal (funcall buffer-copy buffer)))
(clear)))
;; fsm state functions
(labels ((dispatch (c)
(case c
(#\%
(switch-to #'maybe-escape))
(t (buffer c))))
(maybe-escape (c)
(case c
(#\% (buffer c)
(switch-to #'dispatch))
(t (finish-literal-token)
(switch-to #'await-type c))))
(await-length-param (c)
(if-let (d (digit-char-p c))
(setf length-param (+ (* length-param 10) d))
(switch-to #'await-type c)))
(await-type (c)
(flet ((emit/dispatch (&rest token)
(apply #'emit token)
(switch-to #'dispatch)))
(case c
((#\i #\d) (emit/dispatch :integer length-param))
(#\c (emit/dispatch :character))
(#\s (emit/dispatch :word length-param))
(t (cond
((digit-char-p c)
(switch-to #'await-length-param c))
(t (error "unexpected %~a sequence" c))))))))
;; lexer
(loop
:initially (switch-to #'dispatch)
:for c := (read-char stream () ())
:while c
:do (feed-fsm c)
:finally (finish-literal-token))))))
(defun as-regex-node (token)
(ematch token
((list :integer d)
(if (= d 0)
(values 'int 'parse-integer)
(values (%int d) `(lambda (s) (parse-integer s :end ,d)))))
((list :character) (values 'letter 'first-elt))
((list :word d)
(values (if (= d 0) 'word (%word d)) 'identity))
((list :literal string) string)))
(defun decode-format (format)
(let (regex-tree decoders)
(with-input-from-string (stream format)
(map-tokens (lambda (&rest token)
(multiple-value-bind (node decoder) (as-regex-node token)
;; decoder iff a variable is needed
(push node regex-tree)
(when decoder
(push decoder decoders))))
stream))
(values `(:sequence ,@(nreverse regex-tree))
(nreverse decoders))))