Skip to content

Commit

Permalink
[htm8 cleanup] Add unit test, add TODOs
Browse files Browse the repository at this point in the history
[doctools refactor] Add static typing
  • Loading branch information
Andy C committed Jan 15, 2025
1 parent c9dcc65 commit ea096c6
Show file tree
Hide file tree
Showing 6 changed files with 91 additions and 54 deletions.
28 changes: 25 additions & 3 deletions data_lang/htm8.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,25 @@
"""data_lang/htm8.py
TODO
API:
- Get rid of AttrValueLexer - this should be in the TagLexer
- this also means that unquoted values can be more similar
- We can use a single lexer mode for everything inside <>
- the SPACE is the only difference
- Deprecate tag_lexer.GetTagName() in favor of lx.CanonicalTagName() or
_LiteralTagName()
- UTF-8 check, like JSON8
- re2c
- port lexer, which will fix static typing issues
- the abstraction needs to support submatch?
- for finding the end of a tag, etc.?
- LexError and ParseError need details
- harmonize with data_lang/j8.py, which uses error.Decode(msg, ...,
cur_line_num)
"""

import re

from typing import Dict, List, Tuple, Optional, IO, Iterator, Any
Expand Down Expand Up @@ -584,12 +606,12 @@ def Tokens(self):
#
# Note: for unquoted values, & isn't allowed, and thus &amp; and &#99; and
# &#x99; are not allowed. We could relax that?
ATTR_VALUE_LEXER = CHAR_LEX + [
ATTR_VALUE_LEX = CHAR_LEX + [
(r'[^>&\x00]+', h8_id.RawData),
(r'.', h8_id.Invalid),
]

ATTR_VALUE_LEXER = MakeLexer(ATTR_VALUE_LEXER)
ATTR_VALUE_LEX_COMPILED = MakeLexer(ATTR_VALUE_LEX)


class AttrValueLexer(object):
Expand Down Expand Up @@ -633,7 +655,7 @@ def Tokens(self):
# Find the first match, like above.
# Note: frontend/match.py uses _LongestMatch(), which is different!
# TODO: reconcile them. This lexer should be expressible in re2c.
for pat, tok_id in ATTR_VALUE_LEXER:
for pat, tok_id in ATTR_VALUE_LEX_COMPILED:
m = pat.match(self.s, pos)
if m:
if 0:
Expand Down
48 changes: 48 additions & 0 deletions data_lang/htm8_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#!/usr/bin/env python2
from __future__ import print_function

import unittest
import re

from data_lang import htm8

class RegexTest(unittest.TestCase):

def testDotAll(self):
# type: () -> None

# Note that $ matches end of line, not end of string
p1 = re.compile(r'.')
print(p1.match('\n'))

p2 = re.compile(r'.', re.DOTALL)
print(p2.match('\n'))

#p3 = re.compile(r'[.\n]', re.VERBOSE)
p3 = re.compile(r'[.\n]')
print(p3.match('\n'))

print('Negation')

p4 = re.compile(r'[^>]')
print(p4.match('\n'))

def testAttrRe(self):
# type: () -> None
_ATTR_RE = htm8._ATTR_RE
m = _ATTR_RE.match(' empty= val')
print(m.groups())


class FunctionsTest(unittest.TestCase):
    """Tests for module-level helper functions in data_lang/htm8."""

    def testFindLineNum(self):
        # type: () -> None
        text = 'foo\n' * 3
        # The last position (50) is past the end of the string, i.e.
        # deliberately out of bounds.
        for position in (1, 5, 10, 50):
            print(htm8.FindLineNum(text, position))


# Allow running this file directly: discover and run all TestCase classes.
if __name__ == '__main__':
    unittest.main()
11 changes: 8 additions & 3 deletions doctools/help_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,9 @@ def Render(self, line):
return f.getvalue()


CurGroup = Tuple[AnyStr, List[Tuple[AnyStr, AnyStr]], AnyStr, List[Any]]


class Splitter(HTMLParser.HTMLParser):
"""Split an HTML stream starting at each of the heading tags.
Expand All @@ -225,12 +228,12 @@ class Splitter(HTMLParser.HTMLParser):
"""

def __init__(self, heading_tags, out):
# type: (List[str], List) -> None
# type: (List[str], List[CurGroup]) -> None
HTMLParser.HTMLParser.__init__(self)
self.heading_tags = heading_tags
self.out = out

self.cur_group = None # type-not-checked: List[Tuple[str, str, List, List]]
self.cur_group = None # type: CurGroup
self.in_heading = False

self.indent = 0
Expand Down Expand Up @@ -332,7 +335,7 @@ def ExtractBody(s):


def SplitIntoCards(heading_tags, contents):
# type: (List[str], str) -> Iterator
# type: (List[str], str) -> Iterator[str, Any, str, str]
contents = ExtractBody(contents)

groups = []
Expand All @@ -355,6 +358,7 @@ def SplitIntoCards(heading_tags, contents):


def HelpTopics(s):
# type: (str) -> Iterator[Tuple[str, str, str]]
"""
Given a rendered toc-{osh,ysh}.html
Expand Down Expand Up @@ -421,6 +425,7 @@ def __init__(self, name, attrs=None, text=None):


def CardsFromIndex(sh, out_prefix):
# type: (str, str) -> None
sections = []
for section_id, section_name, text in HelpTopics(sys.stdin.read()):
if 0:
Expand Down
12 changes: 8 additions & 4 deletions doctools/ul_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
import sys

from doctools.util import log
from lazylex import html
from data_lang import htm8
from typing import List
from typing import Optional
Expand All @@ -28,12 +27,17 @@ def RemoveComments(s):
"""
f = StringIO()
out = htm8.Output(s, f)

tag_lexer = htm8.TagLexer(s)
lx = htm8.Lexer(s)

pos = 0
while True:
tok_id, end_pos = lx.Read()
if tok_id == h8_id.EndOfStream:
break

if tok_id == h8_id.Invalid:
raise htm8.LexError(s, pos)

for tok_id, end_pos in html.ValidTokens(s):
if tok_id == h8_id.Comment:
value = s[pos:end_pos]
# doc/release-index.md has <!-- REPLACE_WITH_DATE --> etc.
Expand Down
12 changes: 2 additions & 10 deletions lazylex/html.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,8 @@
#!/usr/bin/env python2
"""
lazylex/html.py - Low-Level HTML Processing.
lazylex/html.py - Wrapper around HTM8
See lazylex/README.md for details.
TODO:
- Get rid of AttrValueLexer - this should be in the TagLexer
- this also means that unquoted values can be more similar
- We can use a single lexer mode for everything inside <>
- the SPACE is the only difference
- UTF-8 check, like JSON8
- Static typing
See doc/lazylex.md for details.
"""
from __future__ import print_function
Expand Down
34 changes: 0 additions & 34 deletions lazylex/html_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,35 +15,6 @@
TEST_HTML = f.read()


class RegexTest(unittest.TestCase):

def testDotAll(self):
# type: () -> None
import re

# Note that $ matches end of line, not end of string
p1 = re.compile(r'.')
print(p1.match('\n'))

p2 = re.compile(r'.', re.DOTALL)
print(p2.match('\n'))

#p3 = re.compile(r'[.\n]', re.VERBOSE)
p3 = re.compile(r'[.\n]')
print(p3.match('\n'))

print('Negation')

p4 = re.compile(r'[^>]')
print(p4.match('\n'))

def testAttrRe(self):
# type: () -> None
_ATTR_RE = htm8._ATTR_RE
m = _ATTR_RE.match(' empty= val')
print(m.groups())


class FunctionsTest(unittest.TestCase):

def testFindLineNum(self):
Expand All @@ -53,11 +24,6 @@ def testFindLineNum(self):
line_num = htm8.FindLineNum(s, pos)
print(line_num)

def testToText(self):
# type: () -> None
t = html.ToText('<b name="&amp;"> three &lt; four && five </b>')
self.assertEqual(' three < four && five ', t)


def _MakeTagLexer(s):
# type: (str) -> html.TagLexer
Expand Down

0 comments on commit ea096c6

Please sign in to comment.