-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTokenizer.py
165 lines (132 loc) · 6.27 KB
/
Tokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
######################################################################
# Tokenizer
######################################################################
# A character is *white* if it is a space, return, tab, or vertical tab.
def white(c): return c in [' ', '\r','\t','\v','\n','\\s']
# If S is a list of characters, remInitialWhite(S) removes initial white space chars
# characters from S.
def remInitialWhite(S):
while not S==[] and white(S[0]): S.pop(0)
return S
# A *special token* is a string that is a member of the following list.
specialTokens = ['+','*','-','/','^','<','=','>','>=','<=',')','(',':=','|','&','~','=>','<=>','..',',',';','[',']','{','}','\\',':','`','"','->']
# An *identifier* is a nonempty string of letters and digits
# beginning with a letter.
# A *numeral* is a nonempty string of digits
# A *token* is a numeral, identifier, or special token.
# munch: list(char) -> str*Bool
#
# If S begins with a token, munch(S) = (tok,True) where tok+S'=S and tok is
# the longest token that begins S. Otherwise, munch(S) = (chars,'False),
# where chars is as far as the algorithm got.
def munch(S):
chars = []
state = 'empty'
while canPush(S,state):
state = newState(state,S[0])
chars += S.pop(0)
if validToken(state): return (''.join(chars),True)
else: return (chars,False)
def validToken(state): return state in ['id','num','spec1','lessgreat','equal','period','digitsAndPoint','colon','colonEqual','doublePeriod','backquote','repeatingBlockRP','quoteClose','dash','arrow']
# States are 'empty', 'id', 'num', 'lessgreat', or 'spec1'. Each state
# corresponds to an assertion about the stack.
# empty: the stack is empty
# id : the stack contains an identifier
# num : the stack is a string of digits
# spec1: a special token that cannot be extended
# lessgreat: the stack is ['<'] or ['>']
# canPush(S,state) iff, in the current state S with the current stack stk,
# stk+[S[0]] is the beginning of a token.
def canPush(S,state):
if S==[]: return False
c=S[0]
if state == 'empty': return alphaNum(c) or beginsSpecial(c)
if state =='id':return alphaNum(c)
if state == 'num':
# don't allow push if the following two chars are both '.'
if len(S)>1:
if S[0]=='.' and S[1]=='.':
return False
return digit(c) or c=='.'
if state == 'lessgreat': return c == '='
if state == 'colon': return c=='='
if state == 'period' : return c=='.' or digit(c) or c=='('
if state == "equal": return c=='>'
if state == 'digitsAndPoint': return digit(c) or c=='('
if state == 'doublePeriod': return False
if state == 'colonEqual': return False
if state == 'spec1' : return False
if state == 'backquote': return alpha(c)
if state == 'repeatingBlockLP': return digit(c) or c=='.'
if state =='repeatingBlockP': return c=='.'
if state =='repeatingBlockPP': return c==')'
if state =='repeatingBlockRP': return False
if state =='quoteClose': return False
if state =='quoteSepcial': return c in ['\\','t','r','"','s']
if state =='quoteOpen' or state=='quoteContent': return True
if state == 'dash': return c=='>'
if state == 'arrow': return False
# beginSpecial(c) iff c is the beginning of a special token that is not ":-"
def beginsSpecial(c): return any([c==x[0] for x in specialTokens])
# If canpush(S,state) and c=S[0], the newState(state,c) is the
# state resulting from pushing c onto the stack.
def newState(state,c):
if state == 'empty':
return 'id' if alpha(c) else \
'num' if digit(c) else \
'lessgreat' if c in ['<','>'] else\
'colon' if c==':' else\
'period' if c=='.' else\
'equal' if c=='=' else\
'backquote' if c=='`' else\
'dash' if c=='-' else\
'quoteOpen' if c=='"' else\
'spec1'
if state =='id':return 'id' if alphaNum(c) else 'spec1'
if state == 'num': return 'digitsAndPoint' if c=='.' else 'num' if digit(c) else 'spec1'
if state == 'lessgreat': return 'equal' if c=='=' else\
'spec1'
if state == 'colon': return 'colonEqual' if c=='=' else 'spec1'
if state == 'colonEqual': return 'spec1'
if state =='equal': return 'spec1'
if state =='period': return 'digitsAndPoint' if digit(c) else 'repeatingBlockLP' if c=='(' else 'spec1'
if state =='backquote': return 'id' if alphaNum(c) else 'spec1'
if state =='digitsAndPoint': return 'digitsAndPoint' if digit(c) else 'doublePeriod' if c=='.' else 'repeatingBlockLP' if c=='(' else 'spec1'
if state =='doublePeriod': return 'spec1'
if state =='repeatingBlockLP': return 'repeatingBlockLP' if digit(c) else 'repeatingBlockP' if c=='.' else 'spec1'
if state =='repeatingBlockP': return 'repeatingBlockPP' if c=='.' else 'spec1'
if state =='repeatingBlockPP': return 'repeatingBlockRP' if c ==')' else 'spec1'
if state =='repeatingBlockRP': return 'spec1'
# state for quoted string
if state =='quoteOpen': return 'quoteClose' if c=='"' else 'quoteSpecial' if c=='\\' else 'quoteContent'
if state =='quoteClose': return 'spec1'
if state == 'quoteSepcial': return 'quoteContent' if c in ['\\','t','r','"','s'] else 'spec1'
if state =='quoteContent': return 'quoteSepcial' if c=='\\' else 'quoteClose' if c=='"' else 'quoteContent'
if state =='dash': return 'arrow' if c=='>' else 'spec1'
if state =='arrow': return 'spec1'
# alpha(c) means c is a letter. digit(c) means c is a digit. alphaNum(c) means
# it is one of the two.
def alpha(c): return 'A'<=c and c<='Z' or 'a'<=c and c<='z'
def digit(c): return '0'<=c and c<='9'
def alphaNum(c): return alpha(c) or digit(c)
# tokens: str -> list(str)*Bool
# destroys its argument
#
# If S is a valid string of tokens, tokens(S) = (toks,True) where toks is a list
# of those tokens. Otherwise tokens(S) is (toks,False) where toks is as far as the
# maxmunch algorithm got.
# alters S
def tokens(S):
S= list(S)
toks = []
while True:
remInitialWhite(S)
if S == [] :
return (toks,True)
else:
tok,success = munch(S)
if success: toks += [tok]
else: return (toks,False)
#print(tokens('h(x,y) := y+2*x'))
#print(tokens("2^3"))
#print(alpha("^"))