From ea096c66e7ce9771ff2b65b31c3b484c5dee3546 Mon Sep 17 00:00:00 2001
From: Andy C
Date: Tue, 14 Jan 2025 21:29:55 -0500
Subject: [PATCH] [htm8 cleanup] Add unit test, add TODOs

[doctools refactor] Add static typing
---
 data_lang/htm8.py      | 28 +++++++++++++++++++++---
 data_lang/htm8_test.py | 48 ++++++++++++++++++++++++++++++++++++++++++
 doctools/help_gen.py   | 11 +++++++---
 doctools/ul_table.py   | 12 +++++++----
 lazylex/html.py        | 12 ++---------
 lazylex/html_test.py   | 34 ------------------------------
 6 files changed, 91 insertions(+), 54 deletions(-)
 create mode 100755 data_lang/htm8_test.py

diff --git a/data_lang/htm8.py b/data_lang/htm8.py
index cef61b794..35587ef34 100644
--- a/data_lang/htm8.py
+++ b/data_lang/htm8.py
@@ -1,3 +1,25 @@
+"""data_lang/htm8.py
+
+TODO
+
+API:
+- Get rid of AttrValueLexer - this should be in the TagLexer
+  - this also means that unquoted values can be more similar
+  - We can use a single lexer mode for everything inside <>
+    - the SPACE is the only difference
+- Deprecate tag_lexer.GetTagName() in favor of lx.CanonicalTagName() or
+  _LiteralTagName()
+- UTF-8 check, like JSON8
+- re2c
+  - port lexer, which will fix static typing issues
+  - the abstraction needs to support submatch?
+    - for finding the end of a tag, etc.?
+
+- LexError and ParseError need details
+  - harmonize with data_lang/j8.py, which uses error.Decode(msg, ...,
+    cur_line_num)
+"""
+
 import re
 
 from typing import Dict, List, Tuple, Optional, IO, Iterator, Any
@@ -584,12 +606,12 @@ def Tokens(self):
 #
 # Note: for unquoted values, & isn't allowed, and thus &amp; and &#99; and
 # &#x99; are not allowed.  We could relax that?
-ATTR_VALUE_LEXER = CHAR_LEX + [
+ATTR_VALUE_LEX = CHAR_LEX + [
     (r'[^>&\x00]+', h8_id.RawData),
     (r'.', h8_id.Invalid),
 ]
 
-ATTR_VALUE_LEXER = MakeLexer(ATTR_VALUE_LEXER)
+ATTR_VALUE_LEX_COMPILED = MakeLexer(ATTR_VALUE_LEX)
 
 
 class AttrValueLexer(object):
@@ -633,7 +655,7 @@ def Tokens(self):
         # Find the first match, like above.
         # Note: frontend/match.py uses _LongestMatch(), which is different!
         # TODO: reconcile them.  This lexer should be expressible in re2c.
-        for pat, tok_id in ATTR_VALUE_LEXER:
+        for pat, tok_id in ATTR_VALUE_LEX_COMPILED:
             m = pat.match(self.s, pos)
             if m:
                 if 0:
diff --git a/data_lang/htm8_test.py b/data_lang/htm8_test.py
new file mode 100755
index 000000000..65f57cee4
--- /dev/null
+++ b/data_lang/htm8_test.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python2
+from __future__ import print_function
+
+import unittest
+import re
+
+from data_lang import htm8
+
+class RegexTest(unittest.TestCase):
+
+    def testDotAll(self):
+        # type: () -> None
+
+        # Note that $ matches end of line, not end of string
+        p1 = re.compile(r'.')
+        print(p1.match('\n'))
+
+        p2 = re.compile(r'.', re.DOTALL)
+        print(p2.match('\n'))
+
+        #p3 = re.compile(r'[.\n]', re.VERBOSE)
+        p3 = re.compile(r'[.\n]')
+        print(p3.match('\n'))
+
+        print('Negation')
+
+        p4 = re.compile(r'[^>]')
+        print(p4.match('\n'))
+
+    def testAttrRe(self):
+        # type: () -> None
+        _ATTR_RE = htm8._ATTR_RE
+        m = _ATTR_RE.match(' empty= val')
+        print(m.groups())
+
+
+class FunctionsTest(unittest.TestCase):
+
+    def testFindLineNum(self):
+        # type: () -> None
+        s = 'foo\n' * 3
+        for pos in [1, 5, 10, 50]:  # 50 is out of bounds
+            line_num = htm8.FindLineNum(s, pos)
+            print(line_num)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/doctools/help_gen.py b/doctools/help_gen.py
index 40bb06a31..3eed53013 100755
--- a/doctools/help_gen.py
+++ b/doctools/help_gen.py
@@ -203,6 +203,9 @@ def Render(self, line):
         return f.getvalue()
 
 
+CurGroup = Tuple[AnyStr, List[Tuple[AnyStr, AnyStr]], AnyStr, List[Any]]
+
+
 class Splitter(HTMLParser.HTMLParser):
     """Split an HTML stream starting at each of the heading tags.
@@ -225,12 +228,12 @@ class Splitter(HTMLParser.HTMLParser):
     """
 
     def __init__(self, heading_tags, out):
-        # type: (List[str], List) -> None
+        # type: (List[str], List[CurGroup]) -> None
         HTMLParser.HTMLParser.__init__(self)
         self.heading_tags = heading_tags
         self.out = out
 
-        self.cur_group = None  # type-not-checked: List[Tuple[str, str, List, List]]
+        self.cur_group = None  # type: CurGroup
         self.in_heading = False
 
         self.indent = 0
@@ -332,7 +335,7 @@ def ExtractBody(s):
 
 
 def SplitIntoCards(heading_tags, contents):
-    # type: (List[str], str) -> Iterator
+    # type: (List[str], str) -> Iterator[Tuple[str, Any, str, str]]
     contents = ExtractBody(contents)
 
     groups = []
@@ -355,6 +358,7 @@ def SplitIntoCards(heading_tags, contents):
 
 
 def HelpTopics(s):
+    # type: (str) -> Iterator[Tuple[str, str, str]]
     """
     Given a rendered toc-{osh,ysh}.html
@@ -421,6 +425,7 @@ def __init__(self, name, attrs=None, text=None):
 
 
 def CardsFromIndex(sh, out_prefix):
+    # type: (str, str) -> None
     sections = []
     for section_id, section_name, text in HelpTopics(sys.stdin.read()):
         if 0:
diff --git a/doctools/ul_table.py b/doctools/ul_table.py
index 9588c5658..9ba26b616 100755
--- a/doctools/ul_table.py
+++ b/doctools/ul_table.py
@@ -11,7 +11,6 @@
 import sys
 
 from doctools.util import log
-from lazylex import html
 from data_lang import htm8
 from typing import List
 from typing import Optional
@@ -28,12 +27,17 @@ def RemoveComments(s):
     """
     f = StringIO()
     out = htm8.Output(s, f)
-
-    tag_lexer = htm8.TagLexer(s)
+    lx = htm8.Lexer(s)
 
     pos = 0
+    while True:
+        tok_id, end_pos = lx.Read()
+        if tok_id == h8_id.EndOfStream:
+            break
+
+        if tok_id == h8_id.Invalid:
+            raise htm8.LexError(s, pos)
 
-    for tok_id, end_pos in html.ValidTokens(s):
         if tok_id == h8_id.Comment:
             value = s[pos:end_pos]
             # doc/release-index.md has <!-- ... --> etc.
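
Note on the lexing style: the AttrValueLexer.Tokens() loop above is
first-match-wins over an ordered pattern list, which is why the TODO flags
frontend/match.py's _LongestMatch() as different. A standalone sketch of the
technique (this is not htm8 code; the pattern list and token names here are
invented for illustration):

    import re

    # Patterns are tried in order at the current position; the first one
    # that matches wins, so the catch-all (r'.', 'Invalid') must come last.
    LEXER = [(re.compile(pat), tok_id) for pat, tok_id in [
        (r'&[a-zA-Z]+;', 'CharEntity'),  # stand-in for CHAR_LEX
        (r'[^>&\x00]+', 'RawData'),
        (r'.', 'Invalid'),               # catch-all, must be last
    ]]

    def Tokens(s):
        pos = 0
        while pos < len(s):
            for pat, tok_id in LEXER:
                m = pat.match(s, pos)
                if m:
                    yield tok_id, m.end()  # (token id, end position)
                    pos = m.end()
                    break

    # list(Tokens('val &amp; x')) yields
    # [('RawData', 4), ('CharEntity', 9), ('RawData', 11)]

A longest-match lexer would instead try every pattern and keep the longest
result. Both styles compile to a single DFA under re2c, which is what the
"expressible in re2c" TODO is pointing at.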
diff --git a/lazylex/html.py b/lazylex/html.py
index 805a663e6..9b04d37c9 100755
--- a/lazylex/html.py
+++ b/lazylex/html.py
@@ -1,16 +1,8 @@
 #!/usr/bin/env python2
 """
-lazylex/html.py - Low-Level HTML Processing.
+lazylex/html.py - Wrapper around HTM8
 
-See lazylex/README.md for details.
-
-TODO:
-- Get rid of AttrValueLexer - this should be in the TagLexer
-  - this also means that unquoted values can be more similar
-  - We can use a single lexer mode for everything inside <>
-    - the SPACE is the only difference
-- UTF-8 check, like JSON8
-- Static typing
+See doc/lazylex.md for details.
 """
 
 from __future__ import print_function
diff --git a/lazylex/html_test.py b/lazylex/html_test.py
index c3ddedcac..958b10af4 100755
--- a/lazylex/html_test.py
+++ b/lazylex/html_test.py
@@ -15,35 +15,6 @@
 TEST_HTML = f.read()
 
 
-class RegexTest(unittest.TestCase):
-
-    def testDotAll(self):
-        # type: () -> None
-        import re
-
-        # Note that $ matches end of line, not end of string
-        p1 = re.compile(r'.')
-        print(p1.match('\n'))
-
-        p2 = re.compile(r'.', re.DOTALL)
-        print(p2.match('\n'))
-
-        #p3 = re.compile(r'[.\n]', re.VERBOSE)
-        p3 = re.compile(r'[.\n]')
-        print(p3.match('\n'))
-
-        print('Negation')
-
-        p4 = re.compile(r'[^>]')
-        print(p4.match('\n'))
-
-    def testAttrRe(self):
-        # type: () -> None
-        _ATTR_RE = htm8._ATTR_RE
-        m = _ATTR_RE.match(' empty= val')
-        print(m.groups())
-
-
 class FunctionsTest(unittest.TestCase):
 
     def testFindLineNum(self):
@@ -53,11 +24,6 @@ def testFindLineNum(self):
         s = 'foo\n' * 3
         for pos in [1, 5, 10, 50]:  # out of bounds
             line_num = htm8.FindLineNum(s, pos)
             print(line_num)
 
-    def testToText(self):
-        # type: () -> None
-        t = html.ToText(' three &lt; four &amp;&amp; five ')
-        self.assertEqual(' three < four && five ', t)
-
 
 def _MakeTagLexer(s):
     # type: (str) -> html.TagLexer
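
For context on the new API shape: the ul_table.py hunk above replaces the
html.ValidTokens() generator with an explicit htm8.Lexer Read() loop. A
minimal sketch of that calling pattern, assuming only what the hunk shows
(Read() returns a (tok_id, end_pos) pair; h8_id has EndOfStream, Invalid,
and Comment; LexError takes (s, pos); pos advances to end_pos after each
token). The h8_id import below is hypothetical, since the patch doesn't show
where it comes from:

    from data_lang import htm8
    from data_lang.htm8 import h8_id  # hypothetical: real import not shown

    def CountComments(s):
        # type: (str) -> int
        """Count comment tokens with the same loop RemoveComments() uses."""
        lx = htm8.Lexer(s)
        pos = 0
        n = 0
        while True:
            tok_id, end_pos = lx.Read()
            if tok_id == h8_id.EndOfStream:
                break
            if tok_id == h8_id.Invalid:
                raise htm8.LexError(s, pos)
            if tok_id == h8_id.Comment:
                n += 1
            pos = end_pos  # each token spans s[pos:end_pos]
        return n

One visible benefit of the refactor: the caller decides what h8_id.Invalid
means (here and in RemoveComments(), a LexError), rather than inheriting
whatever policy ValidTokens() baked in.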