1717 from .common import LexerConf
1818 from .parsers .lalr_parser_state import ParserState
1919
20- from .utils import classify , get_regexp_width , Serialize , logger
20+ from .utils import classify , get_regexp_width , Serialize , logger , TextSlice , TextOrSlice
2121from .exceptions import UnexpectedCharacters , LexError , UnexpectedToken
2222from .grammar import TOKEN_DEFAULT_PRIORITY
2323
@@ -289,7 +289,7 @@ def __eq__(self, other):
289289
290290 return self .char_pos == other .char_pos and self .newline_char == other .newline_char
291291
292- def feed (self , token : Token , test_newline = True ):
292+ def feed (self , token : TextOrSlice , test_newline = True ):
293293 """Consume a token and calculate the new line & column.
294294
295295 As an optional optimization, set test_newline=False if token doesn't contain a newline.
@@ -305,13 +305,13 @@ def feed(self, token: Token, test_newline=True):
305305
306306
class UnlessCallback:
    """Post-lex callback that re-types a token whose value exactly matches a
    higher-priority string terminal (e.g. a keyword embedded in a NAME regexp).

    Built by ``_create_unless``: the wrapped scanner holds only the embedded
    string terminals, and a token is promoted to the matching terminal's type
    when its *entire* value matches one of them.
    """

    def __init__(self, scanner: 'Scanner'):
        self.scanner = scanner

    def __call__(self, t: 'Token'):
        # Only a full match wins; the token keeps its value, only the
        # type is replaced by the matched terminal's name.
        matched_type = self.scanner.fullmatch(t.value)
        if matched_type is not None:
            t.type = matched_type
        return t
317317
@@ -347,19 +347,18 @@ def _create_unless(terminals, g_regex_flags, re_, use_bytes):
347347 if strtok .pattern .flags <= retok .pattern .flags :
348348 embedded_strs .add (strtok )
349349 if unless :
350- callback [retok .name ] = UnlessCallback (Scanner (unless , g_regex_flags , re_ , match_whole = True , use_bytes = use_bytes ))
350+ callback [retok .name ] = UnlessCallback (Scanner (unless , g_regex_flags , re_ , use_bytes = use_bytes ))
351351
352352 new_terminals = [t for t in terminals if t not in embedded_strs ]
353353 return new_terminals , callback
354354
355355
356356class Scanner :
357- def __init__ (self , terminals , g_regex_flags , re_ , use_bytes , match_whole = False ):
357+ def __init__ (self , terminals , g_regex_flags , re_ , use_bytes ):
358358 self .terminals = terminals
359359 self .g_regex_flags = g_regex_flags
360360 self .re_ = re_
361361 self .use_bytes = use_bytes
362- self .match_whole = match_whole
363362
364363 self .allowed_types = {t .name for t in self .terminals }
365364
@@ -369,10 +368,9 @@ def _build_mres(self, terminals, max_size):
369368 # Python sets an unreasonable group limit (currently 100) in its re module
370369 # Worse, the only way to know we reached it is by catching an AssertionError!
371370 # This function recursively tries less and less groups until it's successful.
372- postfix = '$' if self .match_whole else ''
373371 mres = []
374372 while terminals :
375- pattern = u'|' .join (u'(?P<%s>%s)' % (t .name , t .pattern .to_regexp () + postfix ) for t in terminals [:max_size ])
373+ pattern = u'|' .join (u'(?P<%s>%s)' % (t .name , t .pattern .to_regexp ()) for t in terminals [:max_size ])
376374 if self .use_bytes :
377375 pattern = pattern .encode ('latin-1' )
378376 try :
@@ -384,13 +382,20 @@ def _build_mres(self, terminals, max_size):
384382 terminals = terminals [max_size :]
385383 return mres
386384
387- def match (self , text , pos ):
385+ def match (self , text : TextSlice , pos ):
388386 for mre in self ._mres :
389- m = mre .match (text , pos )
387+ m = mre .match (text . text , pos , text . end )
390388 if m :
391389 return m .group (0 ), m .lastgroup
392390
393391
392+ def fullmatch (self , text : str ) -> Optional [str ]:
393+ for mre in self ._mres :
394+ m = mre .fullmatch (text )
395+ if m :
396+ return m .lastgroup
397+ return None
398+
394399def _regexp_has_newline (r : str ):
395400 r"""Expressions that may indicate newlines in a regexp:
396401 - newlines (\n)
@@ -409,20 +414,31 @@ class LexerState:
409414
410415 __slots__ = 'text' , 'line_ctr' , 'last_token'
411416
412- text : str
417+ text : TextSlice
413418 line_ctr : LineCounter
414419 last_token : Optional [Token ]
415420
416- def __init__ (self , text : str , line_ctr : Optional [LineCounter ]= None , last_token : Optional [Token ]= None ):
421+ def __init__ (self , text : TextSlice , line_ctr : Optional [LineCounter ] = None , last_token : Optional [Token ]= None ):
422+ if line_ctr is None :
423+ line_ctr = LineCounter (b'\n ' if isinstance (text .text , bytes ) else '\n ' )
424+
425+ if text .start > 0 :
426+ # Advance the line-count until line_ctr.char_pos == text.start
427+ line_ctr .feed (TextSlice (text .text , 0 , text .start ))
428+
429+ if not (text .start <= line_ctr .char_pos <= text .end ):
430+ raise ValueError ("LineCounter.char_pos is out of bounds" )
431+
417432 self .text = text
418- self .line_ctr = line_ctr or LineCounter ( b' \n ' if isinstance ( text , bytes ) else ' \n ' )
433+ self .line_ctr = line_ctr
419434 self .last_token = last_token
420435
436+
421437 def __eq__ (self , other ):
422438 if not isinstance (other , LexerState ):
423439 return NotImplemented
424440
425- return self .text is other .text and self .line_ctr == other .line_ctr and self .last_token == other .last_token
441+ return self .text == other .text and self .line_ctr == other .line_ctr and self .last_token == other .last_token
426442
427443 def __copy__ (self ):
428444 return type (self )(self .text , copy (self .line_ctr ), self .last_token )
@@ -432,15 +448,18 @@ class LexerThread:
432448 """A thread that ties a lexer instance and a lexer state, to be used by the parser
433449 """
434450
435- def __init__ (self , lexer : 'Lexer' , lexer_state : LexerState ):
451+ def __init__ (self , lexer : 'Lexer' , lexer_state : Optional [ LexerState ] ):
436452 self .lexer = lexer
437453 self .state = lexer_state
438454
439455 @classmethod
440- def from_text (cls , lexer : 'Lexer' , text : str ) -> 'LexerThread' :
456+ def from_text (cls , lexer : 'Lexer' , text_or_slice : TextOrSlice ) -> 'LexerThread' :
457+ text = TextSlice .cast_from (text_or_slice )
441458 return cls (lexer , LexerState (text ))
442459
443460 def lex (self , parser_state ):
461+ if self .state is None :
462+ raise TypeError ("Cannot lex: No text assigned to lexer state" )
444463 return self .lexer .lex (self .state , parser_state )
445464
446465 def __copy__ (self ):
@@ -461,9 +480,9 @@ class Lexer(ABC):
461480 def lex (self , lexer_state : LexerState , parser_state : Any ) -> Iterator [Token ]:
462481 return NotImplemented
463482
464- def make_lexer_state (self , text ):
483+ def make_lexer_state (self , text : str ):
465484 "Deprecated"
466- return LexerState (text )
485+ return LexerState (TextSlice . cast_from ( text ) )
467486
468487
469488def _check_regex_collisions (terminal_to_regexp : Dict [TerminalDef , str ], comparator , strict_mode , max_collisions_to_show = 8 ):
@@ -563,9 +582,9 @@ def __init__(self, conf: 'LexerConf', comparator=None) -> None:
563582 self .use_bytes = conf .use_bytes
564583 self .terminals_by_name = conf .terminals_by_name
565584
566- self ._scanner = None
585+ self ._scanner : Optional [ Scanner ] = None
567586
568- def _build_scanner (self ):
587+ def _build_scanner (self ) -> Scanner :
569588 terminals , self .callback = _create_unless (self .terminals , self .g_regex_flags , self .re , self .use_bytes )
570589 assert all (self .callback .values ())
571590
@@ -576,26 +595,26 @@ def _build_scanner(self):
576595 else :
577596 self .callback [type_ ] = f
578597
579- self . _scanner = Scanner (terminals , self .g_regex_flags , self .re , self .use_bytes )
598+ return Scanner (terminals , self .g_regex_flags , self .re , self .use_bytes )
580599
581600 @property
582- def scanner (self ):
601+ def scanner (self ) -> Scanner :
583602 if self ._scanner is None :
584- self ._build_scanner ()
603+ self ._scanner = self . _build_scanner ()
585604 return self ._scanner
586605
587606 def match (self , text , pos ):
588607 return self .scanner .match (text , pos )
589608
590609 def next_token (self , lex_state : LexerState , parser_state : Any = None ) -> Token :
591610 line_ctr = lex_state .line_ctr
592- while line_ctr .char_pos < len ( lex_state .text ) :
611+ while line_ctr .char_pos < lex_state .text . end :
593612 res = self .match (lex_state .text , line_ctr .char_pos )
594613 if not res :
595614 allowed = self .scanner .allowed_types - self .ignore_types
596615 if not allowed :
597616 allowed = {"<END-OF-FILE>" }
598- raise UnexpectedCharacters (lex_state .text , line_ctr .char_pos , line_ctr .line , line_ctr .column ,
617+ raise UnexpectedCharacters (lex_state .text . text , line_ctr .char_pos , line_ctr .line , line_ctr .column ,
599618 allowed = allowed , token_history = lex_state .last_token and [lex_state .last_token ],
600619 state = parser_state , terminals_by_name = self .terminals_by_name )
601620
0 commit comments