#!/usr/bin/python3
# ecmaspeak-py/es_parser.py:
# Parse ECMAScript code using an Earley-like approach.
#
# Copyright (C) 2018 J. Michael Dyck <[email protected]>
import itertools, pdb, unicodedata, sys, re
from collections import defaultdict, OrderedDict, namedtuple
from pprint import pprint # mainly for debugging
import misc
from mynamedtuple import mynamedtuple
import shared
from shared import spec, decode_entities, print_tree, stderr
from emu_grammars import GNode
from E_Value import ES_Value
character_named_ = {
# "Unicode Format-Control Characters"
'ZWNBSP': '\ufeff',
# table-32:
'TAB' : '\u0009',
'VT' : '\u000b',
'FF' : '\u000c',
# 'ZWNBSP': '\ufeff', # appears above
# 'USP' : isn't a single character
# table-33:
'LF' : '\u000a',
'CR' : '\u000d',
'LS' : '\u2028',
'PS' : '\u2029',
}
class ParseError(Exception):
def __init__(self, posn, item_strings):
self.posn = posn
self.kernel_item_strings = item_strings
# -----------
# I use very short names here so that when a tokenized RHS is printed, it isn't too long.
# Maybe I'll change my mind about that.
NT = mynamedtuple('NT', 'n') # non-terminal
T_lit = mynamedtuple('T_lit', 'c') # literal characters
T_u_p = mynamedtuple('T_u_p', 'p') # Unicode code point with a Unicode property
T_u_r = mynamedtuple('T_u_r', 'rlo rhi') # Range of Unicode code points
T_named = mynamedtuple('T_named', 'n') # named terminal
C_but_only_if = mynamedtuple('C_but_only_if', 'c')
C_but_not = mynamedtuple('C_but_not', 'b')
C_no_LT_here = mynamedtuple('C_no_LT_here', '')
C_lookahead = mynamedtuple('C_lookahead', 'matches tss')
X_eor = mynamedtuple('X_eor', '') # end-of-RHS (Doesn't appear in grammars, is only used in EarleySet.)
END_OF_INPUT = T_named('_EOI_')
ACCEPTANCE = T_named('_ACCEPTANCE_')
# -----------
NT .__str__ = lambda x: x.n
T_lit .__str__ = lambda x: '"%s"' % x.c
T_u_p .__str__ = lambda x: f"<any Unicode code point with property '{x.p}'>"
T_u_r .__str__ = lambda x: f"<range {x.rlo}-{x.rhi}>"
T_named .__str__ = lambda x: f"<{x.n}>"
C_but_only_if.__str__ = lambda x: f"(but only if {x.c})"
C_but_not .__str__ = lambda x: f"(but not {' | '.join(str(exc) for exc in x.b)})"
C_no_LT_here .__str__ = lambda x: "(no LT here)"
C_lookahead  .__str__ = lambda x: f"(lookahead {'is' if x.matches else 'is not'} {stringify_terminal_sequences(x.tss)})"
def stringify_terminal_sequences(tss):
return ' | '.join(map(stringify_rthing_sequence, tss))
def stringify_rthing_sequence(ts):
return ' '.join(str(t) for t in ts)
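# An illustrative (hypothetical) tokenized RHS, to show how these print:
# the sequence (NT('Expression+In~Yield~Await'), C_no_LT_here(), T_lit(';'))
# stringifies via stringify_rthing_sequence as:
#     Expression+In~Yield~Await (no LT here) ";"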
# -----------
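# (The .T attribute comes from mynamedtuple: apparently each instance
# carries its own type name, e.g. NT('x').T == 'NT'; these predicates
# classify rthings by that name.)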
def Rthing_is_nonterminal(rthing):
return rthing.T == 'NT'
def Rthing_is_terminal(rthing):
return rthing.T.startswith('T_')
def Rthing_is_constraint(rthing):
return rthing.T.startswith('C_')
def are_distinct(values):
return len(set(values)) == len(values)
# XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
def simplify_grammar(grammar):
# Put the grammar in a more parser-friendly form.
# We simplify it in a few senses:
# - Convert from GNodes to hashable data structures.
# - Eliminate certain features:
# - Eliminate grammatical parameters (lhs-subscript, rhs-guard, rhs-subscript)
# by 'expanding' productions more-or-less as described in the spec.
# - Eliminate rhs-labels.
# - Eliminate multi-character literals in the lexical grammar.
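    # For example (hypothetical, one parameter for brevity), the production
    #     StatementList[Yield] : StatementListItem[?Yield]
    # expands to two parameterless productions:
    #     StatementList~Yield -> StatementListItem~Yield
    #     StatementList+Yield -> StatementListItem+Yield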
# Put the expanded set of productions here:
grammar.exp_prodns = OrderedDict()
for (lhs_symbol, production_n) in sorted(grammar.prodn_for_lhs_.items()):
if 0:
print()
print(' ', lhs_symbol, production_n._param_names)
for rhs_n in production_n._rhss:
print(' ', rhs_n)
for params_setting in each_params_setting(production_n._param_names):
for rhs_n in production_n._rhss:
simplify_prod(grammar, params_setting, lhs_symbol, rhs_n)
if grammar.level == 'lexical': make_InputElement_common(grammar)
# grammar.print_exp_prodns()
return grammar.exp_prodns
# --------------------------------------------------------------------------
def each_params_setting(params):
# `params` is a list of grammatical parameters (i.e., just names).
# Each of them can take on a '+' or '~' setting.
# Yield every possible combination of settings for these parameters.
for bools in each_boolean_vector_of_length(len(params)):
yield [
('+' if b else '~') + p
for (b, p) in zip(bools, params)
]
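# E.g., each_params_setting(['Yield', 'Await']) yields, in order:
#     ['~Yield', '~Await'], ['~Yield', '+Await'],
#     ['+Yield', '~Await'], ['+Yield', '+Await']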
def each_boolean_vector_of_length(n):
    # Yield every possible vector of n booleans,
    # from all-False up to all-True in lexicographic order.
    for bools in itertools.product([False, True], repeat=n):
        yield list(bools)
def expand_nt_wrt_params_setting(nt, params_setting):
assert type(nt) == GNode
assert nt.kind == 'GNT'
result = nt._nt_name
for (arg_prefix, arg_name) in nt._params:
if arg_prefix == '?':
for param_setting in params_setting:
if param_setting[1:] == arg_name:
break
else:
# There is no param by that name in params_setting.
assert 0, nt
result += param_setting
elif arg_prefix in ['+', '~']:
result += arg_prefix + arg_name
# regardless of whether there's a param_setting
else:
            assert 0, arg_prefix
return result
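# E.g. (hypothetical GNT), with params_setting == ['+Yield', '~Await'],
# a reference to Expression[?Yield, +In] expands to the name
# 'Expression+Yield+In': each '?' argument inherits the current setting
# for that parameter, while explicit '+'/'~' arguments are copied through.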
# --------------------------------------------------------------------------
def simplify_prod(grammar, params_setting, lhs_symbol, rhs_n):
exp_lhs_symbol = lhs_symbol + ''.join(params_setting)
assert rhs_n.kind in ['RHS_LINE', 'BACKTICKED_THING'], rhs_n.kind
# A RHS_LINE can have a guard.
if rhs_n.kind == 'RHS_LINE':
(optional_guard_n, rhs_body_n, optional_label_n) = rhs_n.children
if all(
guard_param_n.source_text() in params_setting
for guard_param_n in optional_guard_n.children
):
# The guard succeeds (in the current `params_setting`).
pass
else:
# The guard fails.
return
exp_rhs = []
for (i, rhs_item_n) in enumerate(rhs_n._rhs_items):
if rhs_item_n.kind == 'PARAMS':
# This is a guard, and we've already determined that it succeeds.
assert i == 0
elif rhs_item_n.kind == 'LABEL':
assert i == len(rhs_n._rhs_items)-1
# ----------------------------------------
elif rhs_item_n.kind == 'GNT':
exp_name = expand_nt_wrt_params_setting(rhs_item_n, params_setting)
nk = grammar.get_name_kind(rhs_item_n._nt_name)
if nk == 'NT':
exp_thing = NT(exp_name)
elif nk == 'T_named':
exp_thing = T_named(exp_name)
else:
assert 0, nk
if rhs_item_n._is_optional:
# The spec says that a RHS with N optionals
# is an abbreviation for 2^N RHSs,
# one for each combination of optionals being present/absent.
# However, during parsing,
# you want all 2^N to be instances of the same production,
# which is harder if they come from different productions.
# So instead, treat X? as a non-terminal, defined X? := X | epsilon
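                # E.g., an optional occurrence of (hypothetical, parameterless)
                # Initializer yields the nonterminal 'Initializer?' with two
                # expanded productions:
                #     Initializer? -> Initializer
                #     Initializer? -> [nothing]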
opt_exp_name = exp_name + '?'
if opt_exp_name not in grammar.exp_prodns:
o_lhs = rhs_item_n._nt_name + '?'
o_params_setting = params_setting # XXX should be subset
add_exp_prod1(grammar, opt_exp_name, [exp_thing], o_lhs, rhs_item_n._nt_name, o_params_setting)
add_exp_prod1(grammar, opt_exp_name, [ ], o_lhs, '', o_params_setting)
# Conceivably, the parser could infer these rules.
exp_rhs.append(NT(opt_exp_name))
else:
exp_rhs.append(exp_thing)
elif rhs_item_n.kind == 'BACKTICKED_THING':
chars = rhs_item_n._chars
if grammar.level == 'lexical' and len(chars) > 1:
#" A <em>lexical grammar</em> for ECMAScript ...
#" has as its terminal symbols Unicode code points ...
#
# So, in the lexical grammar, we explode multicharacter literals.
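                # E.g., the punctuator literal `===` becomes three terminals:
                #     T_lit('='), T_lit('='), T_lit('=')
                # whereas in the syntactic grammar it would remain a single
                # T_lit('===').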
for char in chars:
exp_rhs.append(T_lit(char))
else:
exp_rhs.append(T_lit(chars))
elif rhs_item_n.kind == 'NAMED_CHAR':
exp_rhs.append(T_named(rhs_item_n.groups[0]))
elif rhs_item_n.kind == 'U_PROP':
exp_rhs.append(T_u_p(rhs_item_n.groups[0]))
elif rhs_item_n.kind == 'U_RANGE':
exp_rhs.append(T_u_r(int(rhs_item_n.groups[0],16), int(rhs_item_n.groups[1],16)))
elif rhs_item_n.kind == 'U_ANY':
exp_rhs.append(T_u_r(0x0000,0x10FFFF))
elif rhs_item_n.kind in ['LAC_SINGLE', 'LAC_SET']:
def convert_terminal_seq_n(terminal_seq_n):
assert terminal_seq_n.kind == 'TERMINAL_SEQ'
ts = []
for terminal_item_n in terminal_seq_n.children:
if terminal_item_n.kind == 'BACKTICKED_THING':
t = T_lit(terminal_item_n._chars)
elif terminal_item_n.kind == 'NAMED_CHAR':
t = T_named(terminal_item_n.groups[0])
elif terminal_item_n.kind == 'NLTH':
t = C_no_LT_here()
else:
t = str(terminal_item_n)
ts.append(t)
return tuple(ts)
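            # E.g. (hypothetical constraint), [lookahead != `function`] becomes:
            #     C_lookahead(matches=False, tss=((T_lit('function'),),))
            # i.e., the upcoming input must not match that one terminal sequence.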
if rhs_item_n.kind == 'LAC_SINGLE':
[lac_single_op, terminal_seq_n] = rhs_item_n.children
matches = {
'==': True,
'!=': False,
}[lac_single_op.source_text()]
rsymbol = C_lookahead(matches=matches, tss=tuple([convert_terminal_seq_n(terminal_seq_n)]))
elif rhs_item_n.kind == 'LAC_SET':
[lac_set_op, lac_set_operand_n] = rhs_item_n.children
if lac_set_operand_n.kind == 'NT':
nt_name = lac_set_operand_n._nt_name
# (could precompute these, but probably not worth it)
la_prodn = grammar.prodn_for_lhs_[nt_name]
tss = []
for la_rhs_n in la_prodn._rhss:
assert la_rhs_n.kind == 'BACKTICKED_THING'
t = T_lit(la_rhs_n._chars)
ts = tuple([t])
tss.append(ts)
elif lac_set_operand_n.kind == 'SET_OF_TERMINAL_SEQ':
tss = []
for terminal_seq_n in lac_set_operand_n.children:
tss.append(convert_terminal_seq_n(terminal_seq_n))
else:
assert 0
rsymbol = C_lookahead(matches=False, tss=tuple(tss))
else:
assert 0
exp_rhs.append(rsymbol)
elif rhs_item_n.kind == 'BUT_ONLY':
exp_rhs.append(C_but_only_if(decode_entities(rhs_item_n.groups[0])))
elif rhs_item_n.kind == 'NLTH':
exp_rhs.append(C_no_LT_here())
elif rhs_item_n.kind == 'BUT_NOT':
def convert_nt(nt_n):
nt_name = nt_n._nt_name
if nt_n.kind == 'GNT':
assert nt_n._params == []
assert nt_n._is_optional == False
elif nt_n.kind == 'NT':
pass
else:
assert 0
#
nk = grammar.get_name_kind(nt_name)
if nk == 'NT':
exp_thing = NT(nt_name)
elif nk == 'T_named':
exp_thing = T_named(nt_name)
else:
assert 0, nk
return exp_thing
[exclusion_n] = rhs_item_n.children
b = []
for excludable_n in exclusion_n._excludables:
if excludable_n.kind == 'NT':
ex_symbol = convert_nt(excludable_n)
elif excludable_n.kind == 'BACKTICKED_THING':
ex_symbol = T_lit(excludable_n._chars)
else:
assert 0
b.append(ex_symbol)
rsymbol = C_but_not(tuple(b))
exp_rhs.append(rsymbol)
else:
assert 0, rhs_item_n
add_exp_prod1(grammar, exp_lhs_symbol, exp_rhs, lhs_symbol, rhs_n._reduced, params_setting)
# --------------------------------------------------------------------------
def make_InputElement_common(grammar):
assert grammar.level == 'lexical'
lhs = 'InputElement_common'
for (i, nt_name) in enumerate(['WhiteSpace', 'LineTerminator', 'Comment', 'CommonToken']):
add_exp_prod1(grammar, lhs, [NT(nt_name)], lhs, nt_name, [])
# --------------------------------------------------------------------------
def add_exp_prod1(grammar, exp_lhs, exp_rhs, og_lhs, og_rhs_reduced, og_params_setting):
if exp_lhs not in grammar.exp_prodns:
grammar.exp_prodns[exp_lhs] = []
exprod = ExProd(exp_lhs, exp_rhs, og_lhs, og_rhs_reduced, og_params_setting)
grammar.exp_prodns[exp_lhs].append(exprod)
class ExProd(namedtuple('ExProd', 'ex_lhs ex_rhs og_lhs og_rhs_reduced og_params_setting')): pass
# --------------------------------------------------------------------------
def print_exp_prodns(grammar):
stderr(' print_exp_prodns ...')
filename = '%s_expanded_grammar' % grammar.name
f = shared.open_for_output(filename)
i = 0
for (exp_lhs, exprods) in grammar.exp_prodns.items():
for exprod in exprods:
# print("%3d: " % i, end=None)
print("%61s -> %s" % (
exp_lhs,
stringify_rthings(exprod.ex_rhs)
),
file=f
)
i += 1
f.close()
# XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
_trace_f = None
_trace_level = None
def trace(*args, **kwargs):
print(*args, **kwargs, file=_trace_f)
# XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
class _Earley:
def __init__(this_parser, name, how_much_to_consume, trace_prefix):
assert how_much_to_consume in ['all', 'as much as possible']
this_parser.name = name
this_parser.how_much_to_consume = how_much_to_consume
this_parser.trace_prefix = trace_prefix
grammar = spec.grammar_[(name, 'B')]
this_parser.productions_with_lhs_ = simplify_grammar(grammar)
# -------------------------------------------------
def run(
this_parser,
goal_symname,
tip,
start_ti_pos,
):
# Either returns a single ES_ParseNode, or raises a ParseError.
def trace_at(trace_level_of_this_msg, *args):
if trace_level_of_this_msg <= _trace_level:
print(this_parser.trace_prefix, *args, file=_trace_f)
def Rsymbol_get_rhs_start_points(rsymbol):
assert Rthing_is_nonterminal(rsymbol)
if rsymbol.n not in this_parser.productions_with_lhs_:
# grammar bug
raise_parse_error()
return [
Point(prod, 0)
for prod in this_parser.productions_with_lhs_[rsymbol.n]
]
# -------------------------------------------
# Roughly speaking, a 'Point' is a point in the grammar.
# i.e., a point in the RHS of some production.
# (Elsewhere called an LR(0) item,
# but I'm using "Item" for a slightly bigger concept.)
class Point(namedtuple('Point', 'prod dot_posn')):
def __str__(point):
lhs = point.prod.ex_lhs
rhs = point.prod.ex_rhs
# dot = '\u25CF'
dot = '@'
return "%s -> %s %s %s " % (
lhs,
' '.join(str(r) for r in rhs[0:point.dot_posn]),
dot,
' '.join(str(r) for r in rhs[point.dot_posn:])
)
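            # E.g. (hypothetical production), a Point prints as:
            #     VariableStatement~Yield -> "var" @ VariableDeclarationList+In~Yield ";"
            # where '@' marks how much of the RHS has been recognized so far.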
def get_rthing_after_dot(point):
rhs = point.prod.ex_rhs
if point.dot_posn < len(rhs):
return rhs[point.dot_posn]
elif point.dot_posn == len(rhs):
return None
else:
assert 0
def advance(point):
# Return the next point after `point`.
assert point.dot_posn < len(point.prod.ex_rhs)
# We'd never be asked to advance from the last point in a production.
return Point(point.prod, point.dot_posn+1)
def get_prod(point):
return point.prod
def get_lhs_symbol(point):
lhs_symname = point.prod.ex_lhs
return lhs_symname
# -------------------------------------------
class Item(namedtuple('Item', 'cause transit_node resulting_point')):
def __str__(item):
return f"_ _ {item.resulting_point}"
def advance(item, node):
new_point = item.resulting_point.advance()
if new_point is None: return None
return Item(item, node, new_point)
def get_rthing_after_dot(item):
rthing = item.resulting_point.get_rthing_after_dot()
if rthing is None:
return X_eor()
else:
return rthing
def get_derived_items(item, this_set):
rthing = item.get_rthing_after_dot()
if rthing is None:
assert 0
elif type(rthing) == X_eor:
# There is nothing after the dot.
# I.e., the dot is at the end of the RHS.
# Perform "Completer":
for new_item in item.reduce():
yield new_item
elif Rthing_is_nonterminal(rthing):
# Perform "Predictor":
for point in Rsymbol_get_rhs_start_points(rthing):
yield Item(this_set, None, point)
# Aycock + Horspool (2002) say:
# if rthing is nullable, yield item.advance(None)
# But if it's *indirectly* nullable,
# that would result in a parse tree
# that doesn't reflect the substructure.
elif Rthing_is_terminal(rthing):
# Don't perform "Scanner" yet.
pass
elif Rthing_is_constraint(rthing):
# trace_at(9, " CHKING %s" % indent + str(item))
trace_at(9, f" checking constraint {rthing}")
constraint_is_satisfied = tip.check_constraint(item.transit_node, this_set.ti_pos, rthing)
if constraint_is_satisfied:
trace_at(9, f" constraint is satisfied")
yield item.advance(None)
else:
trace_at(9, f" constraint is not satisfied, so this item is a dead end")
# When closing an Earley-state, and you encounter an item
# with a lookahead-constraint in the post-dot position,
# what do you do with it? I can think of 3 approaches:
#
# (1) During closure, ignore the constraint; come back to it later,
# after the requisite number of tokens have been read,
# possibly after all the parsing is done.
#
# I think this could work, but might be inefficient,
# because the C_lookahead is there to prevent ambiguities,
# so ignoring it allows ambiguities to multiply.
#
# (2) During closure, pause and immediately go check
# whether the next few tokens satisfy the constraint.
# If they don't, then delete the item or mark it bad somehow,
# and don't let it contribute to the closure of the state.
#
# This would be a sensible approach if ES had a conventional lexer
# (where you can lex a whole text before doing any syntactic parsing).
# But with ES's lexer, the goal symbol depends on the current syntactic context,
# so the lexer can't get ahead of the parser (more than 1 token).
# When you're closing the state, you (theoretically) don't even know
# the current syntactic context, so you can't even get the next token,
# let alone any subsequent ones.
# Practically, the syntactic context is determined by the items in the
# state's kernel, so maybe with some prep-work you could know the context
# and thus get the next token.
# The next+1 would be harder though.
#
# (3) During closure, allow the item to contribute,
# but modify its contributions so as to enforce the constraint.
# (similar to baking the constraint into the grammar)
#
# ?
else:
assert 0, rthing
def reduce(item):
trace_at(9, ' reduce:', str(item))
prod = item.resulting_point.get_prod()
# "pop items off the stack"
child_nodes = []
back_item = item
while True:
if isinstance(back_item.cause, Item):
if back_item.transit_node is not None:
child_nodes.insert(0, back_item.transit_node)
back_item = back_item.cause
elif isinstance(back_item.cause, EarleySet):
assert back_item.transit_node is None
back_set = back_item.cause
break
else:
assert 0, back_item.cause
if child_nodes:
option_bits = ''.join(
str(len(child.children))
for child in child_nodes
if isinstance(child.symbol, str) and child.symbol.endswith('?')
)
extent = child_nodes
else:
option_bits = ''
extent = (eset_ti_pos, eset_ti_pos)
parent_node = ES_ParseNode((prod, option_bits), extent, tip)
lhs_symbol = prod.ex_lhs
back_items = back_set.get_items_expecting_symbol(NT(lhs_symbol))
if len(back_items) == 0:
trace_at(1, )
trace_at(1, "no items expecting '%s' in back_set:" % lhs_symbol)
back_set.trace(1)
assert 0 # because item must have come from somewhere
# The problem with Earley's algorithm and nullability:
# If lhs_symbol is nullable,
# then back_set might be this_set,
# which is still being built,
# so back_items might be partial.
if back_set.is_under_construction:
trace_at(1, f" Set is under construction, so remembering nullable {lhs_symbol}")
back_set.nullables.append( (NT(lhs_symbol), parent_node) )
# Have back_set remember lhs_symbol and parent_node:
# if an item is added to back_set that expects lhs_symbol,
# then that's another back_item...
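                    # (In this grammar the nullable symbols are mostly the X?
                    # nonterminals introduced in simplify_prod, each of which
                    # has an empty expanded RHS.)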
for back_item in back_items:
new_item = back_item.advance(parent_node)
if new_item is None:
assert 0 # You must be able to advance out of a back_item
trace_at(1, " back_item.advance(...) returned None")
else:
yield new_item
# -------------------------------------------
class EarleySet:
def __init__(this_set, ti_pos):
trace_at(9, )
trace_at(9, f'new set at ti_pos = {ti_pos}...')
this_set.ti_pos = ti_pos
this_set.items_with_dot_before_ = defaultdict(list)
this_set.is_under_construction = True
this_set.nullables = []
def trace(this_set, tl):
trace_at(tl, )
trace_at(tl, "EarleySet:")
for (x, items) in sorted(this_set.items_with_dot_before_.items()):
trace_at(tl, f' {x}:')
for item in items:
trace_at(tl, ' ', str(item))
def close(this_set, kernel_items):
for item in kernel_items:
this_set._add_and_recurse(item, '')
this_set.is_under_construction = False
def _add_and_recurse(this_set, item, indent):
rthing = item.get_rthing_after_dot()
if item in this_set.items_with_dot_before_[rthing]:
# We've already processed this item, don't have to do anything.
return
trace_at(9, " ADDING %s" % indent + str(item))
this_set.items_with_dot_before_[rthing].append(item)
for (nul_symbol, nul_node) in this_set.nullables:
if nul_symbol == rthing:
trace_at(9, f" Recalling nullable {rthing}")
new_item = item.advance(nul_node)
this_set._add_and_recurse(new_item, indent+' ')
for new_item in item.get_derived_items(this_set):
this_set._add_and_recurse(new_item, indent+' ')
def get_expected_terminals(this_set):
result = []
# XXX optimize
for (symbol, items) in this_set.items_with_dot_before_.items():
if symbol == X_eor(): continue
# if Symbol_is_terminal(symbol):
for item in items:
rthing = item.get_rthing_after_dot()
assert rthing == symbol #??
if Rthing_is_terminal(rthing) and rthing not in result:
result.append(rthing)
return result
def get_items_expecting_symbol(this_set, symbol):
items = this_set.items_with_dot_before_[symbol]
assert len(items) > 0 or symbol == ACCEPTANCE
return items
# -------------------------------------------
def cannot_continue():
if this_parser.how_much_to_consume == 'as much as possible':
#" When a stream of code points is to be parsed
#" as an ECMAScript |Script| or |Module|,
#" it is first converted to a stream of input elements
#" by repeated application of the lexical grammar; ...
#
#" The source text is scanned from left to right,
#" repeatedly taking the longest possible sequence of code points
#" as the next input element.
trace_at(2, 'Checking for prior acceptables...')
if latest_accepting_item is None:
trace_at(2, 'nope, none')
raise_parse_error()
else:
goal_node = latest_accepting_item.transit_node
assert goal_node.production.ex_lhs == goal_symname
if _trace_level >= 2:
trace_at(2, 'returning prior acceptable:')
goal_node.dump(this_parser.trace_prefix + ' ', f=_trace_f)
return goal_node
else:
#" ... this stream of input elements is then parsed
#" by a single application of the syntactic grammar.
#" The input stream is syntactically in error
#" if the tokens in the stream of input elements
#" cannot be parsed as a single instance of the goal nonterminal
#" (|Script| or |Module|), with no tokens left over.
raise_parse_error()
def raise_parse_error():
item_strings = [
str(item)
for item in next_kernel_items
]
raise ParseError(eset_ti_pos, item_strings)
# -------------------------------------------
trace_at(1, )
trace_at(1, f"{this_parser.name}.run invoked at ti_pos = {start_ti_pos} with goal '{goal_symname}'")
if this_parser.how_much_to_consume == 'all':
# We're looking for an instance of the goal symbol
# followed by the end of input.
final_symbol = END_OF_INPUT
else:
# We'll accept an instance of the goal symbol
# that isn't followed by the end of input.
final_symbol = ACCEPTANCE
start_production = ExProd(
ex_lhs = '*START*',
ex_rhs = [
NT(n=goal_symname),
final_symbol
],
og_lhs = '*START*',
og_rhs_reduced = f"{goal_symname} {final_symbol}",
og_params_setting = []
)
this_parser.productions_with_lhs_['*START*'] = [start_production]
# And make an item for it:
initial_item = Item(None, None, Point(start_production, 0))
next_kernel_items = [initial_item]
if this_parser.how_much_to_consume == 'as much as possible':
latest_accepting_item = None
eset_ti_pos = start_ti_pos
while True:
eset = EarleySet(eset_ti_pos)
eset.close(next_kernel_items)
if _trace_level >= 3:
eset.trace(3)
# -----------------
expected_terminals = eset.get_expected_terminals()
if len(expected_terminals) == 0:
trace_at(2, "No expected terminals! (e.g., due to the failure of a constraint)")
return cannot_continue()
if _trace_level >= 9:
trace_at(9, )
trace_at(9, 'expected_terminals:')
strings = [ str(tsymbol) for tsymbol in expected_terminals ]
for st in sorted(strings):
trace_at(9, ' ', st)
if this_parser.how_much_to_consume == 'as much as possible':
accepting_items_here = eset.get_items_expecting_symbol(ACCEPTANCE)
if accepting_items_here:
trace_at(9, )
trace_at(9, '(there are accepting_items_here)')
if len(accepting_items_here) > 1:
trace_at(2, "%d items!" % len(accepting_items_here))
if _trace_level >= 2:
for item in accepting_items_here:
trace_at(2, '')
item.transit_node.dump(this_parser.trace_prefix + ' ', f=_trace_f)
print('NEED TO RESOLVE AMBIGUITY', file=_trace_f) # XXX
latest_accepting_item = accepting_items_here[0]
# -----------------
ASI_kludge = None
if this_parser.name.startswith('syntactic') and T_lit(';') in expected_terminals:
for item in eset.items_with_dot_before_[T_lit(';')]:
# trace_at(2, f"point {item.resulting_point}")
lhs = item.resulting_point.get_lhs_symbol()
if lhs == 'EmptyStatement':
assert ASI_kludge is None
ASI_kludge = lhs
elif lhs.startswith('DoWhileStatement'):
assert ASI_kludge is None
ASI_kludge = 'DoWhileStatement'
# -----------------
result = tip.get_next_terminal_instances(eset_ti_pos, expected_terminals, ASI_kludge)
trace_at(2, )
trace_at(2, f"(back in {this_parser.name}.run after return from get_next_terminal_instances...)")
if result is None:
# At this point in the source text,
# none of the expected terminals was seen.
trace_at(3, "gnti returned no terminal instances!")
return cannot_continue()
# -----------------
(rats, next_ti_pos) = result
# "rats" = rsymbols and terminal-instances
if _trace_level >= 3:
trace_at(3, )
trace_at(3, f'gnti returned {len(rats)} matches:')
for (rsymbol, termin) in rats:
trace_at(3, " For the expected terminal symbol:")
trace_at(3, " " + str(rsymbol))
trace_at(3, " we have the terminal instance:")
termin.dump(this_parser.trace_prefix + ' ', f=_trace_f)
trace_at(3, )
trace_at(3, f'and next_ti_pos = {next_ti_pos}')
assert eset_ti_pos <= next_ti_pos
# It would be strictly less-than, except for inserted semicolons.
# -------------------------------------
# print('...', file=sys.stderr)
next_kernel_items = []
for (rsymbol, termin) in rats:
for item in eset.get_items_expecting_symbol(rsymbol):
# print(rsymbol, file=sys.stderr)
# if _trace_level >= 9 and rsymbol == T_lit(';'): pdb.set_trace()
new_item = item.advance(termin)
# print(new_item, file=sys.stderr)
if new_item is None:
print('got None when attempting to advance', item, termin)
assert 0
else:
next_kernel_items.append(new_item)
if _trace_level >= 9:
trace_at(9, )
trace_at(9, 'next_kernel_items:')
for item in next_kernel_items:
trace_at(9, ' ', str(item))
if len(next_kernel_items) == 0:
trace_at(9, ' ', 'None!')
assert len(next_kernel_items) > 0
# -------------------------------------
if len(rats) == 1 and rats[0][1].symbol == END_OF_INPUT:
trace_at(1, 'success!')
trace_at(1, "results:")
valid_trees = []
for end_item in next_kernel_items:
assert end_item.transit_node.symbol == END_OF_INPUT
prev_item = end_item.cause
assert isinstance(prev_item, Item)
goal_node = prev_item.transit_node
                    assert goal_node.symbol == re.sub(r'[~+]\w+', '', goal_symname)
valid_trees.append(goal_node)
if _trace_level >= 1:
goal_node.dump(f=_trace_f)
n_valids = len(valid_trees)
if n_valids == 0:
assert 0
elif n_valids == 1:
[result] = valid_trees
else:
print(f'AMBIGUITY? {n_valids} RESULT TREES', file=_trace_f)
print('comparing two:', file=_trace_f)
show_ambiguity(valid_trees[0], valid_trees[1], _trace_f)
result = valid_trees[0]
return result
eset_ti_pos = next_ti_pos
assert 0 # unreachable
# XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
def gather_char_sets(productions_with_lhs_):
global char_set_
def recurse(name):
result = set()
for prod in productions_with_lhs_[name]:
assert len(prod.ex_rhs) == 1
[rsymbol] = prod.ex_rhs
if rsymbol.T == 'T_lit':
result.add( rsymbol.c )
elif rsymbol.T == 'T_named':
result.add( character_named_[rsymbol.n] )
elif rsymbol.T == 'NT':
result.update( recurse(rsymbol.n) )
else:
assert 0, rsymbol.T
# print("recurse: '%s' : %s" % (name, result))
return result
char_set_ = {}
for n in [
'LineTerminator',
'EscapeCharacter',
'HexDigit',
'OctalDigit',
'SyntaxCharacter',
# UnicodeIDContinue
]:
char_set_[n] = recurse(n)
# print("char set '%s' = %s" % (n, char_set_[n]))
# ------------------------------------------------------------------------------
def gather_ReservedWords(productions_with_lhs_):
# kludgy, but anything non-kludgy would be over-engineered
global ReservedWords
ReservedWords = set()
def recurse(name):
for prod in productions_with_lhs_[name]:
assert len(prod.ex_rhs) == 1
[rsym] = prod.ex_rhs
t = rsym.T
if t == 'NT':
# (not actually used any more, but keep it just in case)
recurse(rsym.n)
elif t == 'T_lit':
ReservedWords.add(rsym.c)
recurse('ReservedWord')
# print('ReservedWords:', sorted(ReservedWords))
# XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
def initialize():
# This requires that a Spec has been restored via shared.spec.restore()
global lexical_earley, syntactic_earley
lexical_earley = _Earley('lexical', 'as much as possible', '| ')
syntactic_earley = _Earley('syntactic', 'all', '')
gather_char_sets(lexical_earley.productions_with_lhs_)
gather_ReservedWords(syntactic_earley.productions_with_lhs_) # for "but not ReservedWord"
# XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
def parse(subject, goal_symname, trace_level=0, trace_f=sys.stdout):
# Using `lexical_earley` and `syntactic_earley`,
# attempt to parse the given `subject`,
# using `goal_symname` as the goal symbol.
#
# Return a single node or raise an Error.