From ea096c66e7ce9771ff2b65b31c3b484c5dee3546 Mon Sep 17 00:00:00 2001
From: Andy C
Date: Tue, 14 Jan 2025 21:29:55 -0500
Subject: [PATCH] [htm8 cleanup] Add unit test, add TODOs

[doctools refactor] Add static typing
---
 data_lang/htm8.py      | 28 +++++++++++++++++++++---
 data_lang/htm8_test.py | 48 ++++++++++++++++++++++++++++++++++++++++++
 doctools/help_gen.py   | 11 +++++++---
 doctools/ul_table.py   | 12 +++++++----
 lazylex/html.py        | 12 ++---------
 lazylex/html_test.py   | 34 ------------------------------
 6 files changed, 91 insertions(+), 54 deletions(-)
 create mode 100755 data_lang/htm8_test.py

diff --git a/data_lang/htm8.py b/data_lang/htm8.py
index cef61b794..35587ef34 100644
--- a/data_lang/htm8.py
+++ b/data_lang/htm8.py
@@ -1,3 +1,25 @@
+"""data_lang/htm8.py
+
+TODO
+
+API:
+- Get rid of AttrValueLexer - this should be in the TagLexer
+  - this also means that unquoted values can be more similar
+  - We can use a single lexer mode for everything inside <>
+    - the SPACE is the only difference
+- Deprecate tag_lexer.GetTagName() in favor of lx.CanonicalTagName() or
+  _LiteralTagName()
+- UTF-8 check, like JSON8
+- re2c
+  - port lexer, which will fix static typing issues
+  - the abstraction needs to support submatch?
+    - for finding the end of a tag, etc.?
+
+- LexError and ParseError need details
+  - harmonize with data_lang/j8.py, which uses error.Decode(msg, ...,
+    cur_line_num)
+"""
+
 import re
 
 from typing import Dict, List, Tuple, Optional, IO, Iterator, Any
@@ -584,12 +606,12 @@ def Tokens(self):
 #
 # Note: for unquoted values, & isn't allowed, and thus &amp; and &#99; and
 # &#x99; are not allowed.  We could relax that?
-ATTR_VALUE_LEXER = CHAR_LEX + [
+ATTR_VALUE_LEX = CHAR_LEX + [
     (r'[^>&\x00]+', h8_id.RawData),
     (r'.', h8_id.Invalid),
 ]
 
-ATTR_VALUE_LEXER = MakeLexer(ATTR_VALUE_LEXER)
+ATTR_VALUE_LEX_COMPILED = MakeLexer(ATTR_VALUE_LEX)
 
 
 class AttrValueLexer(object):
@@ -633,7 +655,7 @@ def Tokens(self):
         # Find the first match, like above.
         # Note: frontend/match.py uses _LongestMatch(), which is different!
         # TODO: reconcile them.  This lexer should be expressible in re2c.
-        for pat, tok_id in ATTR_VALUE_LEXER:
+        for pat, tok_id in ATTR_VALUE_LEX_COMPILED:
             m = pat.match(self.s, pos)
             if m:
                 if 0:
diff --git a/data_lang/htm8_test.py b/data_lang/htm8_test.py
new file mode 100755
index 000000000..65f57cee4
--- /dev/null
+++ b/data_lang/htm8_test.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python2
+from __future__ import print_function
+
+import unittest
+import re
+
+from data_lang import htm8
+
+class RegexTest(unittest.TestCase):
+
+    def testDotAll(self):
+        # type: () -> None
+
+        # Note that $ matches end of line, not end of string
+        p1 = re.compile(r'.')
+        print(p1.match('\n'))
+
+        p2 = re.compile(r'.', re.DOTALL)
+        print(p2.match('\n'))
+
+        #p3 = re.compile(r'[.\n]', re.VERBOSE)
+        p3 = re.compile(r'[.\n]')
+        print(p3.match('\n'))
+
+        print('Negation')
+
+        p4 = re.compile(r'[^>]')
+        print(p4.match('\n'))
+
+    def testAttrRe(self):
+        # type: () -> None
+        _ATTR_RE = htm8._ATTR_RE
+        m = _ATTR_RE.match(' empty= val')
+        print(m.groups())
+
+
+class FunctionsTest(unittest.TestCase):
+
+    def testFindLineNum(self):
+        # type: () -> None
+        s = 'foo\n' * 3
+        for pos in [1, 5, 10, 50]:  # 50 is out of bounds
+            line_num = htm8.FindLineNum(s, pos)
+            print(line_num)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/doctools/help_gen.py b/doctools/help_gen.py
index 40bb06a31..3eed53013 100755
--- a/doctools/help_gen.py
+++ b/doctools/help_gen.py
@@ -203,6 +203,9 @@ def Render(self, line):
         return f.getvalue()
 
 
+CurGroup = Tuple[AnyStr, List[Tuple[AnyStr, AnyStr]], AnyStr, List[Any]]
+
+
 class Splitter(HTMLParser.HTMLParser):
     """Split an HTML stream starting at each of the heading tags.
@@ -225,12 +228,12 @@ class Splitter(HTMLParser.HTMLParser):
     """
 
     def __init__(self, heading_tags, out):
-        # type: (List[str], List) -> None
+        # type: (List[str], List[CurGroup]) -> None
         HTMLParser.HTMLParser.__init__(self)
         self.heading_tags = heading_tags
         self.out = out
 
-        self.cur_group = None  # type-not-checked: List[Tuple[str, str, List, List]]
+        self.cur_group = None  # type: CurGroup
         self.in_heading = False
 
         self.indent = 0
@@ -332,7 +335,7 @@ def ExtractBody(s):
 
 
 def SplitIntoCards(heading_tags, contents):
-    # type: (List[str], str) -> Iterator
+    # type: (List[str], str) -> Iterator[Tuple[str, Any, str, str]]
     contents = ExtractBody(contents)
 
     groups = []
@@ -355,6 +358,7 @@ def SplitIntoCards(heading_tags, contents):
 
 
 def HelpTopics(s):
+    # type: (str) -> Iterator[Tuple[str, str, str]]
     """
     Given a rendered toc-{osh,ysh}.html
@@ -421,6 +425,7 @@ def __init__(self, name, attrs=None, text=None):
 
 
 def CardsFromIndex(sh, out_prefix):
+    # type: (str, str) -> None
     sections = []
     for section_id, section_name, text in HelpTopics(sys.stdin.read()):
         if 0:
diff --git a/doctools/ul_table.py b/doctools/ul_table.py
index 9588c5658..9ba26b616 100755
--- a/doctools/ul_table.py
+++ b/doctools/ul_table.py
@@ -11,7 +11,6 @@
 import sys
 
 from doctools.util import log
-from lazylex import html
 from data_lang import htm8
 from typing import List
 from typing import Optional
@@ -28,12 +27,17 @@ def RemoveComments(s):
     """
     f = StringIO()
     out = htm8.Output(s, f)
-
-    tag_lexer = htm8.TagLexer(s)
+    lx = htm8.Lexer(s)
 
     pos = 0
+    while True:
+        tok_id, end_pos = lx.Read()
+        if tok_id == h8_id.EndOfStream:
+            break
+
+        if tok_id == h8_id.Invalid:
+            raise htm8.LexError(s, pos)
 
-    for tok_id, end_pos in html.ValidTokens(s):
         if tok_id == h8_id.Comment:
             value = s[pos:end_pos]
             # doc/release-index.md has <!-- ... --> etc.
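
Note on the lexing style: the AttrValueLexer.Tokens() loop above is
first-match-wins over an ordered pattern list, which is why the TODO flags
frontend/match.py's _LongestMatch() as different. A standalone sketch of the
technique (this is not htm8 code; the pattern list and token names here are
invented for illustration):

    import re

    # Patterns are tried in order at the current position; the first one
    # that matches wins, so the catch-all (r'.', 'Invalid') must come last.
    LEXER = [(re.compile(pat), tok_id) for pat, tok_id in [
        (r'&[a-zA-Z]+;', 'CharEntity'),  # stand-in for CHAR_LEX
        (r'[^>&\x00]+', 'RawData'),
        (r'.', 'Invalid'),               # catch-all, must be last
    ]]

    def Tokens(s):
        pos = 0
        while pos < len(s):
            for pat, tok_id in LEXER:
                m = pat.match(s, pos)
                if m:
                    yield tok_id, m.end()  # (token id, end position)
                    pos = m.end()
                    break

    # list(Tokens('val &amp; x')) yields
    # [('RawData', 4), ('CharEntity', 9), ('RawData', 11)]

A longest-match lexer would instead try every pattern and keep the longest
result. Both styles compile to a single DFA under re2c, which is what the
"expressible in re2c" TODO is pointing at.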
diff --git a/lazylex/html.py b/lazylex/html.py
index 805a663e6..9b04d37c9 100755
--- a/lazylex/html.py
+++ b/lazylex/html.py
@@ -1,16 +1,8 @@
 #!/usr/bin/env python2
 """
-lazylex/html.py - Low-Level HTML Processing.
+lazylex/html.py - Wrapper around HTM8
 
-See lazylex/README.md for details.
-
-TODO:
-- Get rid of AttrValueLexer - this should be in the TagLexer
-  - this also means that unquoted values can be more similar
-  - We can use a single lexer mode for everything inside <>
-    - the SPACE is the only difference
-- UTF-8 check, like JSON8
-- Static typing
+See doc/lazylex.md for details.
 """
 
 from __future__ import print_function
diff --git a/lazylex/html_test.py b/lazylex/html_test.py
index c3ddedcac..958b10af4 100755
--- a/lazylex/html_test.py
+++ b/lazylex/html_test.py
@@ -15,35 +15,6 @@
 TEST_HTML = f.read()
 
 
-class RegexTest(unittest.TestCase):
-
-    def testDotAll(self):
-        # type: () -> None
-        import re
-
-        # Note that $ matches end of line, not end of string
-        p1 = re.compile(r'.')
-        print(p1.match('\n'))
-
-        p2 = re.compile(r'.', re.DOTALL)
-        print(p2.match('\n'))
-
-        #p3 = re.compile(r'[.\n]', re.VERBOSE)
-        p3 = re.compile(r'[.\n]')
-        print(p3.match('\n'))
-
-        print('Negation')
-
-        p4 = re.compile(r'[^>]')
-        print(p4.match('\n'))
-
-    def testAttrRe(self):
-        # type: () -> None
-        _ATTR_RE = htm8._ATTR_RE
-        m = _ATTR_RE.match(' empty= val')
-        print(m.groups())
-
-
 class FunctionsTest(unittest.TestCase):
 
     def testFindLineNum(self):
@@ -53,11 +24,6 @@ def testFindLineNum(self):
         s = 'foo\n' * 3
         for pos in [1, 5, 10, 50]:  # out of bounds
             line_num = htm8.FindLineNum(s, pos)
             print(line_num)
 
-    def testToText(self):
-        # type: () -> None
-        t = html.ToText(' three &lt; four &amp;&amp; five ')
-        self.assertEqual(' three < four && five ', t)
-
 
 def _MakeTagLexer(s):
     # type: (str) -> html.TagLexer
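
For context on the new API shape: the ul_table.py hunk above replaces the
html.ValidTokens() generator with an explicit htm8.Lexer Read() loop. A
minimal sketch of that calling pattern, assuming only what the hunk shows
(Read() returns a (tok_id, end_pos) pair; h8_id has EndOfStream, Invalid,
and Comment; LexError takes (s, pos); pos advances to end_pos after each
token). The h8_id import below is hypothetical, since the patch doesn't show
where it comes from:

    from data_lang import htm8
    from data_lang.htm8 import h8_id  # hypothetical: real import not shown

    def CountComments(s):
        # type: (str) -> int
        """Count comment tokens with the same loop RemoveComments() uses."""
        lx = htm8.Lexer(s)
        pos = 0
        n = 0
        while True:
            tok_id, end_pos = lx.Read()
            if tok_id == h8_id.EndOfStream:
                break
            if tok_id == h8_id.Invalid:
                raise htm8.LexError(s, pos)
            if tok_id == h8_id.Comment:
                n += 1
            pos = end_pos  # each token spans s[pos:end_pos]
        return n

One visible benefit of the refactor: the caller decides what h8_id.Invalid
means (here and in RemoveComments(), a LexError), rather than inheriting
whatever policy ValidTokens() baked in.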