-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathLexer.x
201 lines (177 loc) · 5.5 KB
/
Lexer.x
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
--*- Haskell -*-
{
module Lexer where
import Numeric (readOct, readHex)
import Data.Char (isUpper, isLower)
import qualified Text.ParserCombinators.Parsec.Pos as Pos
}
%wrapper "posn"
$l = [a-zA-Z] -- letters
$d = 0-9 -- digits
$i = [$l$d\_\'] -- identifier character
$s = [!\#\$\%&\*\+\-\.\/:\<=>\?@\\\|\~] -- symbolic char
@e = \\([abfnrtv\"\'\\]|$d{1,3}) -- character escape
@c = $printable#[\"\\]|@e -- string character
@g = \\$white+\\ -- string gap
tokens :-
$white+ ; -- white space
"--".* ; -- line comment
"{-"([~\-]|\-+[~\-\}]|\n)*\-+\} ; -- comment
[\,\(\)\[\]_\{\;\}] { single } -- single char token
$d+ { dec } -- signed integer literal
0[oO][0-7]+ { oct } -- signed octal literal
0[xX][0-9a-fA-F]+ { hex } -- signed hexadecimal literal
\'(@c|\")\' { ch } -- character literal
\"(@c|@g)*\" { str } -- string literal
$l$i* { ide } -- alphabetic identifier
$s+ { sym } -- symbolic identifier
{
single = mtk (single' . head)
dec = mtk (TokenInt . read)
oct = mtk (TokenInt . fst . head . readOct . drop 2)
hex = mtk (TokenInt . fst . head . readHex . drop 2)
ch = mtk (TokenChar . unescapeLit)
str = mtk (TokenStr . unescapeLit)
ide = mtk ide'
sym = mtk sym'
unescapeLit = unescape . init . tail
unescape [] = []
unescape ('\\':'a':cs) = '\a' : unescape cs
unescape ('\\':'b':cs) = '\b' : unescape cs
unescape ('\\':'f':cs) = '\f' : unescape cs
unescape ('\\':'n':cs) = '\n' : unescape cs
unescape ('\\':'r':cs) = '\r' : unescape cs
unescape ('\\':'t':cs) = '\t' : unescape cs
unescape ('\\':'v':cs) = '\v' : unescape cs
unescape ('\\':c:cs) = c : unescape cs
unescape (c:cs) = c : unescape cs
single' '(' = TokenLParen
single' ')' = TokenRParen
single' '{' = TokenLBrace
single' '}' = TokenRBrace
single' '[' = TokenLBracket
single' ']' = TokenRBracket
single' ',' = TokenComma
single' ';' = TokenSemicolon
single' '_' = TokenWildcard
ide' "if" = TokenIf
ide' "then" = TokenThen
ide' "else" = TokenElse
ide' "let" = TokenLet
ide' "in" = TokenIn
ide' "case" = TokenCase
ide' "of" = TokenOf
ide' "do" = TokenDo
ide' "data" = TokenData
ide' "type" = TokenType
ide' "class" = TokenClass
ide' "instance" = TokenInstance
ide' "where" = TokenWhere
ide' "import" = TokenImport
ide' "hiding" = TokenHiding
ide' s@(c:_)
| isUpper c = TokenConId s
| isLower c = TokenId s
ide' s = error ("unknown token " ++ s)
sym' ".." = TokenDotDot
sym' "::" = TokenCoco
sym' "=" = TokenEq
sym' "\\" = TokenLambda
sym' "|" = TokenBar
sym' "<-" = TokenLArrow
sym' "->" = TokenRArrow
sym' "@" = TokenAt
sym' "~" = TokenTilde
sym' "=>" = TokenImply
sym' s@(':':_) = TokenConOp s
sym' s = TokenOp s
mtk :: (String -> Token) -> AlexPosn -> String -> (Token, AlexPosn)
mtk f p s = (f s, p)
data Token = TokenId String
| TokenConId String
| TokenChar String
| TokenStr String
| TokenInt Int
| TokenIf
| TokenThen
| TokenElse
| TokenLet
| TokenIn
| TokenCase
| TokenOf
| TokenDo
| TokenData
| TokenType
| TokenClass
| TokenInstance
| TokenWhere
| TokenImport
| TokenHiding
| TokenDotDot
| TokenCoco
| TokenEq
| TokenLambda
| TokenBar
| TokenLArrow
| TokenRArrow
| TokenAt
| TokenTilde
| TokenImply
| TokenLParen
| TokenRParen
| TokenLBrace
| TokenRBrace
| TokenLBracket
| TokenRBracket
| TokenComma
| TokenSemicolon
| TokenWildcard
| TokenOp String
| TokenConOp String
deriving (Show, Eq)
lexer :: String -> String -> [(Token, Pos.SourcePos)]
lexer fname = map parsecToken . layout . annotate . alexScanTokens
where parsecToken (tok, AlexPn _ ln col) = (tok, Pos.newPos fname ln col)
lexer' :: String -> String -> [(Token, Pos.SourcePos)]
lexer' fname = map parsecToken . alexScanTokens
where parsecToken (tok, AlexPn _ ln col) = (tok, Pos.newPos fname ln col)
data AnToken = Layout Int
| Indent Int
| Token (Token, AlexPosn)
annotate :: [(Token, AlexPosn)] -> [AnToken]
annotate tps = annotate2 0 tps
annotate1 :: Int -> [(Token, AlexPosn)] -> [AnToken]
annotate1 line [] = []
annotate1 line (t@(tok, AlexPn _ ln col) : tps)
| line < ln = Indent col : rest
| otherwise = rest
where rest = Token t : (next tok) ln tps
next TokenLet = annotate2
next TokenWhere = annotate2
next TokenOf = annotate2
next _ = annotate1
annotate2 :: Int -> [(Token, AlexPosn)] -> [AnToken]
annotate2 line tps@((TokenLBrace, _):_) = annotate1 line tps
annotate2 line tps@((_, AlexPn _ ln col):_) = Layout col : annotate1 ln tps
layout :: [AnToken] -> [(Token, AlexPosn)]
layout ts = layout' ts []
nullPosn :: AlexPosn
nullPosn = AlexPn 0 0 0
layout' :: [AnToken] -> [Int] -> [(Token, AlexPosn)]
layout' ts@(Indent n : ts') ms@(m:ms')
| n == m = (TokenSemicolon, nullPosn) : layout' ts' ms
| n < m = (TokenRBrace, nullPosn) : layout' ts ms'
layout' (Indent n : ts) ms = layout' ts ms
layout' (Layout n : ts) ms@(m:_)
| n > m = (TokenLBrace, nullPosn) : layout' ts (n:ms)
| otherwise = (TokenLBrace, nullPosn)
: (TokenRBrace, nullPosn)
: layout' (Indent n : ts) ms
layout' (Layout n : ts) [] = (TokenLBrace, nullPosn) : layout' ts [n]
layout' (Token t@(TokenRBrace, _) : ts) (0:ms) = t : layout' ts ms
layout' (Token (TokenRBrace, _) : ts) ms = error "parse-error: `}' expected"
layout' (Token t@(TokenLBrace, _) : ts) ms = t : layout' ts (0:ms)
layout' (Token t : ts) ms = t : layout' ts ms
layout' [] [] = []
layout' [] (m:ms) = (TokenRBrace, nullPosn) : layout' [] ms
}