-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlexer.py
458 lines (371 loc) · 11.1 KB
/
lexer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
# We use Jay Conrod's IMP language
# ------- CONSTANTS ---------
SPACE = " "
SPACES = [" ", "\n", "\t"]
# Spaces to preprocess out of program string
ILLEGAL_SPACES = ["\b", "\n", "\t", "\f", "\r"]
COMMA = ","
SEMI = ";"
EXP = "**"
LPAREN = "("
RPAREN = ")"
PLUS = "+"
FUN_ARROW = "->"
MINUS = "-"
TIMES = "*"
DIV = "/"
EQ = "="
LT = "<"
GT = ">"
ASSIGN = ":="
LTE = "<="
GTE = ">="
FUN = "fun"
END_FUN = "endfun"
TRUE = "True"
FALSE = "False"
IF = "if"
THEN = "then"
ELIF = "elif"
ELSE = "else"
ENDIF = "endif"
ENDELIF = "endelif"
ENDELSE = "endelse"
WHILE = "while"
DO_WHILE = "dowhile"
END_WHILE = "endwhile"
FOR = "for"
FROM = "from"
TO = "to"
BY = "by"
DOFOR = "dofor"
ENDFOR = "endfor"
RETURN = "return"
IGNORE = "~"
DOUBLE_QUOTE = '"'
OPEN_TUP = "(|"
CLOSE_TUP = "|)"
OPEN_BRACKET = "["
CLOSE_BRACKET = "]"
DOT = "."
OPEN_DICT = "{"
CLOSE_DICT = "}"
COLON = ":"
REV_ARROW = "<-"
OPEN_STRUCT = "{|"
CLOSE_STRUCT = "|}"
# Order matters in keywords
keywords = sorted(
[
IGNORE,
COMMA,
SEMI,
EXP,
LPAREN,
RPAREN,
FUN_ARROW,
ASSIGN,
LTE,
GTE,
EQ,
LT,
GT,
PLUS,
MINUS,
TIMES,
DIV,
TRUE,
FALSE,
FUN,
IF,
ELSE,
ELIF,
THEN,
ENDIF,
ENDELIF,
ENDELSE,
WHILE,
DO_WHILE,
END_WHILE,
END_FUN,
FOR,
FROM,
TO,
BY,
DOFOR,
ENDFOR,
RETURN,
OPEN_TUP,
CLOSE_TUP,
OPEN_BRACKET,
CLOSE_BRACKET,
DOT,
OPEN_DICT,
CLOSE_DICT,
COLON,
REV_ARROW,
OPEN_STRUCT,
CLOSE_STRUCT,
],
reverse=True)
UNOPS = [
MINUS,
]
BOPS = [
EXP,
LTE,
GTE,
EQ,
LT,
GT,
PLUS,
MINUS,
TIMES,
DIV,
]
OPERATIONS = UNOPS + BOPS
keywords_lens = {kw: len(kw) for kw in keywords}
add_space_in_lex = [
FUN,
END_FUN,
TRUE,
FALSE,
IF,
THEN,
ELSE,
ELIF,
ENDIF,
ENDELIF,
ENDELSE,
WHILE,
DO_WHILE,
END_WHILE,
FOR,
FROM,
TO,
BY,
DOFOR,
ENDFOR,
RETURN,
]
no_space_in_lex = [kw for kw in keywords if kw not in add_space_in_lex]
ALPHABETICAL = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
NUMERICAL = "0123456789"
UNDERSCORE = "_"
variable_chars = [
ALPHABETICAL,
NUMERICAL,
UNDERSCORE,
]
VARIABLE_NOT_START_NUMERIC = True
# ------ TOKEN TYPES -------
VARIABLE = "variable"
INTEGER = "integer"
FLOAT = "float"
STRING = "string"
KEYWORD = "keyword"
# ------ EXCEPTIONS --------
class TOKENIZATION_ERROR(Exception):
"""
Raised when tokenization fails
"""
pass
# ------- MATCH TOKENS FUNCTIONS --------
def match_variable(string, idx, var_chars=variable_chars):
"""
match_variable(str, idx, var_chars, var_rules) checks if the str beginnign
at idx is a variable and returns the variable name and the lenght if the variable
based on the character rules allowed for a variable name in var_chars
Returns (var_name, len) if possible, else (None, 0)
Returns (None, 0) if string does not being with a variable allowed char, like udnerscore or
alphabetical
"""
def match_variable_helper(string, idx, var_chars, name, length):
# check is there remaining string to process
if idx >= len(string):
return (name, length)
first = string[idx]
# check var_chars
correct_char = False
for var_str in var_chars:
if first in var_str:
correct_char = True
break
if not correct_char:
return (name, length)
return match_variable_helper(string, idx + 1, var_chars, name + first, length + 1)
# check if the first character could be variable:
if (string[idx] not in UNDERSCORE) and (string[idx] not in ALPHABETICAL):
return (None, 0)
v, l = match_variable_helper(string, idx, var_chars, "", 0)
# no match
if l == 0:
return (None, 0)
# variable cannot start with numeric
if VARIABLE_NOT_START_NUMERIC and v[0] in NUMERICAL:
return (None, 0)
return ((VARIABLE, v), l)
def match_int(string, idx):
"""
match_int(string, idx) is an integer stirng and the length of the string if
wtring matches
Requires: string begins at an integer location
Returns (NOne, 0) if string does not begin with an int
Requires: idx does not start at alocation not corresponding to an int
"""
def match_int_helper(string, string_len, idx, num, length, dot_occurred):
# check is there remaining string to process
if idx >= string_len:
return (num, length)
first = string[idx]
# if first is a dot, could be a float
if first == DOT and not dot_occurred:
return match_int_helper(string, string_len, idx + 1, num + first, length + 1, True)
if first == DOT and dot_occurred:
return (num, length)
# check var_chars
if first not in NUMERICAL:
return (num, length)
return match_int_helper(string, string_len, idx + 1, num + first, length + 1, dot_occurred)
# check it begins with an int
if string[idx] not in NUMERICAL:
return (None, 0)
# check if you have double 0's at beginning, in which case lex fails
l = len(string)
if string[idx] == '0' and idx + 1 < l and string[idx + 1] == '0':
return (None, 0)
i, l = match_int_helper(string, l, idx, "", 0, False)
# check if decimal point dot in i in which case it is a float
if DOT in i:
return ((FLOAT, float(i)), l)
return ((INTEGER, int(i)), l)
def match_string(string, idx):
"""
match_string(string, idx) matches a string and the length of the string
and returns (String, str), length
REQUIRES: stirng begins at a string location, e.g. with an open double quote
so tht idx is the location of a double quote, and idx + 1is the first locaiton
after that double quote
"""
l = len(string)
collect_string = ""
for i in range(idx + 1, l):
c = string[i]
if c != DOUBLE_QUOTE:
collect_string += c
else:
# add 2 for start and end quotes
return ((STRING, str(collect_string)), len(collect_string) + 2)
def match_keyword(string, idx, keyword, keywords_lens):
"""
match_keywords(string, idx, keyword) returns True if kewword matched else False
keyword_lens is a dictionary mapping keyword to its length
Requires: the first char of keyword is also shared by string
"""
l_key = keywords_lens[keyword]
l_str = len(string)
if idx + l_key > l_str:
return False
return string[idx:(idx + l_key)] == keyword
def match_keywords(string, idx, keywords_lst=keywords, keywords_lens=keywords_lens, space_in_lex=add_space_in_lex):
"""
match_keywords(string, idx, keywords_lst) returns (keyword, len) for the
FIRST matched keyword in keywords_lst
If no matches returns (None, 0)
Requires a space after the keyword for a keyword in space_in_lex
and no space if the keyword is not in space_in_lex. No space is needed if the
program terminates
WARNING: DOES NOT INCLUDE SPACE IN THE KEYWORD LENGTH THAT IS RETURNED;
IGNORES SPACE AFTER KEYWORD THAT NEEDS SPACE
REQUIRES: All \t, \r, \b, \n, \f are replaced with " " space characters already
in string
REQUIRES: keywords_lst is ordered in order of importance of lexing
REQUIRES: idx is an idnex in string
string is the program string
idx is the index in the string, must be in stirng
keyword is a list of strings representing keywords
space_in_parse is a list of keywords requiring a space after lexing
"""
for kw in keywords_lst:
if match_keyword(string, idx, kw, keywords_lens):
l_key = keywords_lens[kw]
if kw in space_in_lex:
l_s = len(string)
# is the keyword the last characters in the string
if l_key + idx >= l_s:
return ((KEYWORD, kw), l_key)
else:
if string[l_key + idx] == SPACE:
return ((KEYWORD, kw), l_key)
else:
# not end with space
return (None, 0)
else:
return ((KEYWORD, kw), l_key)
return (None, 0)
# ----- PREPROCESSING PROGRAM -------
def clean_spaces(program_str):
"""
Removes: \n, \r, \t, \f, \b and replaces with spaces
Strips at front and end
Returns cleaned program string
"""
program_str = program_str.strip()
for c in ILLEGAL_SPACES:
program_str = program_str.replace(c, SPACE)
return program_str
def preprocess(program_str):
"""
preprocess(program_str) preprocesses the program string
Removes: \n, \r, \t, \f, \b and replaces with spaces
Strips at front and end
Return preprocessed program_str
"""
return clean_spaces(program_str)
# ------ LEXER ------
def lex(program_str):
"""
LEXER/TOKENIZER
Progam_str is a string of the program source code
Returns list of Tokens
Starts by Preprocesses program string
Lex is recursive
We have the invariant as below:
END if program_str is empty!
0. If we match character with a kwyword, check if it is a keyword and
lex it, no space is needed after it (e.g. a left paren, right paren, equals
<=, etc)
1. (With Spaces) If we match characters with a keyword, check to see if it is a keyword
a. Keyword if match and also space at end
b. Keyword if match and string ended already (no space at end)
2. If we have a character not matched with a keyword, it is part of a var
a. Variable if it only has alphabeticals, and _ numerics and end,
i .terminating the variable when a space is reached OR
ii. a non alphabetical isreached
III. When a () parentheses is reached
3. Numbers are numerics and are pulled out, only iotegers
4. Strings begin double quote and end with double quote
5. Spaces are eliminated
6. If not all consumed, terminate by raising
"""
def lex_helper(program_str, acc):
if program_str == "":
return acc
token, length = match_keywords(program_str, 0)
if token != None:
return lex_helper(program_str[length:], acc + [token])
token, length = match_variable(program_str, 0)
if token != None:
return lex_helper(program_str[length:], acc + [token])
token, length = match_int(program_str, 0)
if token != None:
return lex_helper(program_str[length:], acc + [token])
if program_str[0] == DOUBLE_QUOTE:
token, length = match_string(program_str, 0)
return lex_helper(program_str[length:], acc + [token])
if program_str[0] == SPACE:
return lex_helper(program_str[1:], acc)
raise TOKENIZATION_ERROR("Could not consume characters")
return lex_helper(preprocess(program_str), [])
if __name__ == "__main__":
pass