From 1b73e1f4ee0bb6a3d9d8f8be59a562712412ace8 Mon Sep 17 00:00:00 2001
From: Adam Hopkins <adam@amhopkins.com>
Date: Sun, 5 Feb 2023 14:49:09 +0200
Subject: [PATCH 1/7] Python 3.7 compat

---
 html5tagger/builder.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/html5tagger/builder.py b/html5tagger/builder.py
index 7180b06..93ab010 100644
--- a/html5tagger/builder.py
+++ b/html5tagger/builder.py
@@ -1,5 +1,5 @@
 from .html5 import omit_endtag
-from .util import mangle, escape, escape_special, esc_script, esc_style, attributes
+from .util import attributes, esc_script, esc_style, escape, escape_special, mangle
 
 
 class Builder:
@@ -23,7 +23,11 @@ def _clear(self):
 
     @property
     def _allpieces(self):
-        return *self._pieces, self._endtag, *self._stack[::-1]
+        retval = []
+        retval.extend(self._pieces)
+        retval.append(self._endtag)
+        retval.extend(self._stack[::-1])
+        return tuple(retval)
 
     def _endtag_close(self):
         if self._endtag:

From e68288537c4f8b99f1f1ec0b2892b7d23e345bf7 Mon Sep 17 00:00:00 2001
From: Adam Hopkins <adam@amhopkins.com>
Date: Mon, 28 Aug 2023 12:57:17 +0300
Subject: [PATCH 2/7] Add HTML5 parser

---
 html5tagger/parser.py | 427 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 427 insertions(+)
 create mode 100644 html5tagger/parser.py

diff --git a/html5tagger/parser.py b/html5tagger/parser.py
new file mode 100644
index 0000000..d1a4e20
--- /dev/null
+++ b/html5tagger/parser.py
@@ -0,0 +1,427 @@
+from __future__ import annotations
+
+import re
+from typing import List, cast
+
+from .builder import Builder
+from .html5 import omit_endtag
+
+try:
+    from rich import print
+except ImportError:
+    pass
+
+
+DEFAULT = object()
+# fmt: off
+BASIC_INLINE = {"a", "abbr", "area", "b", "bdi", "bdo", "br", "button", "cite",
+                "code", "data", "datalist", "del", "dfn", "em", "i", "input",
+                "ins", "kbd", "label", "map", "mark", "meter", "noscript",
+                "output", "q", "ruby", "s", "samp", "select", "slot", "small",
+                "span", "strong", "sub", "sup", "u", "var", "wbr"}
+MEDIA_ELEMENTS = {"audio", "canvas", "embed", "iframe", "img", "object",
+                  "picture", "svg", "video"}
+FORM_ELEMENTS = {"fieldset", "form", "option", "textarea"}
+SCRIPT_SUPPORTING_ELEMENTS = {"script", "template"}
+HEADING_ELEMENTS = {"h1", "h2", "h3", "h4", "h5", "h6"}
+PHRASING_CONTENT = BASIC_INLINE | MEDIA_ELEMENTS | {"time", "template"}
+HEADING_CONTENT = PHRASING_CONTENT | HEADING_ELEMENTS
+SECTIONING_CONTENT = {"article", "aside", "nav", "section"}
+FLOW_CONTENT = (BASIC_INLINE | MEDIA_ELEMENTS | FORM_ELEMENTS |
+                {"address", "article", "aside", "blockquote", "caption",
+                 "details", "dialog", "div", "dl", "dt", "fieldset", "figure",
+                 "footer", "h1", "h2", "h3", "h4", "h5", "h6", "header",
+                 "hgroup", "hr", "main", "math", "menu", "nav", "ol", "p",
+                 "pre", "progress", "section", "table", "template", "time",
+                 "ul"})
+ROOT_CONTENT = {"html", "body"}
+ALLOWED_CONTENT_MODEL = {
+    # Transparent items specifically are not included
+    #     - a
+    #     - ins
+    #     - del
+    #     - map
+    #     - object
+    #     - video
+    #     - audio
+    #     - noscript
+    #     - slot
+    #     - canvas
+    # Also specifically left out are special elements
+    #     - template
+    "abbr": PHRASING_CONTENT,
+    "address": (FLOW_CONTENT - HEADING_CONTENT - SECTIONING_CONTENT - {"address", "header", "footer"}),  # noqa: E501
+    "area": None,
+    "article": FLOW_CONTENT,
+    "aside": FLOW_CONTENT,
+    "b": PHRASING_CONTENT,
+    "bdi": PHRASING_CONTENT,
+    "bdo": PHRASING_CONTENT,
+    "blockquote": FLOW_CONTENT,
+    "body": FLOW_CONTENT,
+    "br": None,
+    "br": None,
+    "button": PHRASING_CONTENT,
+    "canvas": None,
+    "caption": FLOW_CONTENT - {"table"},
+    "cite": PHRASING_CONTENT,
+    "code": PHRASING_CONTENT,
+    "col": None,
+    "colgroup": {"col", "template"},
+    "data": PHRASING_CONTENT,
+    "datalist": {"option"} | SCRIPT_SUPPORTING_ELEMENTS | PHRASING_CONTENT,
+    "dd": FLOW_CONTENT,
+    "details": FLOW_CONTENT - {"summary"},
+    "dfn": PHRASING_CONTENT - {"dfn"},
+    "dialog": FLOW_CONTENT,
+    "div": FLOW_CONTENT | SCRIPT_SUPPORTING_ELEMENTS | {"dt", "dd"},
+    "dl": {"dt", "dd", "div"} | SCRIPT_SUPPORTING_ELEMENTS,
+    "dt": (FLOW_CONTENT - {"header", "footer"} - SECTIONING_CONTENT - HEADING_CONTENT),  # noqa: E501
+    "em": PHRASING_CONTENT,
+    "embed": None,
+    "fieldset": {"legend"} | FLOW_CONTENT,
+    "figcaption": FLOW_CONTENT,
+    "figure": {"figcaption"} | FLOW_CONTENT,
+    "footer": FLOW_CONTENT - {"header", "footer"},
+    "form": FLOW_CONTENT - {"form"},
+    "h1": PHRASING_CONTENT,
+    "h2": PHRASING_CONTENT,
+    "h3": PHRASING_CONTENT,
+    "h4": PHRASING_CONTENT,
+    "h5": PHRASING_CONTENT,
+    "h6": PHRASING_CONTENT,
+    "header": FLOW_CONTENT - {"header", "footer"},
+    "hgroup": HEADING_ELEMENTS | {"p"},
+    "hr": None,
+    "i": PHRASING_CONTENT,
+    "iframe": None,
+    "img": None,
+    "input": None,
+    "kbd": PHRASING_CONTENT,
+    "label": PHRASING_CONTENT - {"label"},
+    "legend": PHRASING_CONTENT | HEADING_CONTENT,
+    "li": FLOW_CONTENT,
+    "link": None,
+    "main": FLOW_CONTENT,
+    "mark": PHRASING_CONTENT,
+    "menu": {"li"} | SCRIPT_SUPPORTING_ELEMENTS,
+    "meta": None,
+    "meter": PHRASING_CONTENT - {"meter"},
+    "nav": FLOW_CONTENT,
+    "ol": {"li"} | SCRIPT_SUPPORTING_ELEMENTS,
+    "optgroup": {"option"} | SCRIPT_SUPPORTING_ELEMENTS,
+    "option": None,
+    "output": PHRASING_CONTENT,
+    "p": PHRASING_CONTENT,
+    "picture": {"source", "img"},
+    "pre": PHRASING_CONTENT,
+    "progress": PHRASING_CONTENT - {"progress"},
+    "q": PHRASING_CONTENT,
+    "rp": None,
+    "rt": PHRASING_CONTENT,
+    "ruby": PHRASING_CONTENT | {"rt", "rp"},
+    "s": PHRASING_CONTENT,
+    "samp": PHRASING_CONTENT,
+    "script": None,
+    "search": FLOW_CONTENT,
+    "section": FLOW_CONTENT,
+    "select": {"option", "optgroup", "hr"},
+    "small": PHRASING_CONTENT,
+    "source": None,
+    "span": PHRASING_CONTENT,
+    "strong": PHRASING_CONTENT,
+    "style": None,
+    "sub": PHRASING_CONTENT,
+    "summary": PHRASING_CONTENT | HEADING_CONTENT,
+    "sup": PHRASING_CONTENT,
+    "table": {"tbody", "thead", "tfoot", "tr", "caption", "colgroup"} | SCRIPT_SUPPORTING_ELEMENTS,  # noqa: E501
+    "tbody": {"tr"},
+    "td": FLOW_CONTENT,
+    "textarea": None,
+    "tfoot": {"tr"},
+    "th": FLOW_CONTENT - {"header", "footer"} - SECTIONING_CONTENT - HEADING_CONTENT,  # noqa: E501
+    "thead": {"tr"},
+    "time": PHRASING_CONTENT,
+    "title": None,
+    "tr": {"th", "td"},
+    "track": None,
+    "u": PHRASING_CONTENT,
+    "ul": {"li"} | SCRIPT_SUPPORTING_ELEMENTS,
+    "var": PHRASING_CONTENT,
+    "wbr": None,
+}
+TAG_MATCH_PATTERN = re.compile(r"<(\w+)")
+ATTRIBUTE_MATCH_PATTERN = re.compile(r'(\w+)=("[^"]*"|\w+)')
+PARSE_TAG_PATTERN = re.compile(r"<(/?)(\w+)|([\w-]+)=('[^']*'|\"[^\"]*\"|\w+)")
+# fmt: on
+
+
+class HTMLNode:
+    ROOT_KEY = "__root__"
+    DOCTYPE_KEY = "__doctype__"
+    COMMENT_KEY = "__comment__"
+    DOCTYPE = "<!DOCTYPE html>"
+
+    __slots__ = (
+        "_name",
+        "_attributes",
+        "_content",
+        "_children",
+        "_endtag",
+        "_parent",
+        "_closed",
+    )
+
+    def __init__(self, name: str, attributes: dict[str, str]) -> None:
+        self._name = name
+        self._attributes = attributes
+        self._content = ""
+        self._children: List[HTMLNode] = []
+        self._endtag = name not in omit_endtag
+        self._parent: HTMLNode | None = None
+        self._closed = False
+
+    def __repr__(self) -> str:
+        display = f"{self._name!r}, {self._attributes!r}, {self._content!r}"
+        return f"HTMLNode({display})"
+
+    def __str__(self) -> str:
+        return f"<HTMLNode {self._name}>"
+
+    def add_child(self, child: HTMLNode) -> None:
+        if self._closed:
+            message = (
+                f"Tag {self._name!r} is already closed. "
+                f"Trying to add {child}"
+            )
+            raise ValueError(message)
+        self._children.append(child)
+
+    def add_text_content(self, text: str) -> None:
+        if self._closed:
+            message = (
+                f"Tag {self._name!r} is already closed. "
+                "Trying to add text content."
+            )
+            raise ValueError(message)
+        self._content += text
+
+    def can_contain(self, tag_name: str) -> bool:
+        limited = ALLOWED_CONTENT_MODEL.get(self._name, DEFAULT)
+        if limited is DEFAULT:
+            return True
+        if not limited:
+            return False
+        return tag_name in cast("set[str]", limited)  # str form: 3.7-safe
+
+    def __iter__(self):
+        return iter(self._children)
+
+    @property
+    def children(self):
+        return self._children
+
+    @property
+    def content(self):
+        return self._content
+
+    @property
+    def name(self):
+        return self._name
+
+    @property
+    def opening_tag(self):
+        tag = f"<{self._name}"
+        if self._attributes:
+            tag += " " + " ".join(
+                f"{key}={value}" for key, value in self._attributes.items()
+            )
+        tag += ">"
+        return tag
+
+    @property
+    def closing_tag(self):
+        if not self._endtag:
+            return ""
+        return f"</{self._name}>"
+
+    @property
+    def is_allowed_text_content(self) -> bool:
+        return True
+
+    def close(self) -> None:
+        for child in (child for child in self._children if not child._closed):
+            child.close()
+        self._closed = True
+
+
+class RootNode(HTMLNode):
+    def __init__(self) -> None:
+        super().__init__(HTMLNode.ROOT_KEY, {})
+        self._endtag = False
+
+    @property
+    def opening_tag(self):
+        return ""
+
+    def can_contain(self, tag_name: str) -> bool:
+        return True
+
+
+class DoctypeNode(HTMLNode):
+    def __init__(self) -> None:
+        super().__init__(HTMLNode.DOCTYPE_KEY, {})
+        self._endtag = False
+
+    @property
+    def opening_tag(self):
+        return self.DOCTYPE
+
+
+class CommentNode(HTMLNode):
+    def __init__(self) -> None:
+        super().__init__(HTMLNode.COMMENT_KEY, {})
+        self._endtag = False
+
+    @property
+    def opening_tag(self):
+        return f"<!-- {self._content} -->"
+
+    @property
+    def content(self):
+        return ""
+
+
+class NodeCreator:
+    def create(self, builder: Builder) -> HTMLNode:
+        root = self._create_root()
+        stack: list[HTMLNode] = [root]
+
+        for piece in [
+            p
+            for maybe_tag in builder._allpieces
+            for p in (
+                maybe_tag._allpieces
+                if isinstance(maybe_tag, Builder)
+                else [maybe_tag]
+            )
+        ]:
+            tag_name, is_closing, attributes = self._parse_tag(piece)
+
+            if piece == HTMLNode.DOCTYPE:
+                self._handle_doctype(stack)
+                continue
+            elif tag_name == HTMLNode.COMMENT_KEY:
+                self._handle_comment(piece[4:-3], stack)
+                continue
+            elif tag_name == "text":
+                self._handle_text(piece, stack)
+                continue
+
+            if is_closing:
+                self._close_node_by_name(tag_name, stack)
+                continue
+
+            self._handle_new_node(tag_name, attributes, stack)
+
+        self._close_node(root, stack)
+
+        return root
+
+    def _create_root(self) -> RootNode:
+        return RootNode()
+
+    def _handle_doctype(self, stack: list[HTMLNode]) -> None:
+        if len(stack) > 1:
+            raise ValueError("Doctype must be the first element")
+        doctype = DoctypeNode()
+        stack[0].add_child(doctype)
+
+    def _handle_comment(self, text: str, stack: list[HTMLNode]) -> None:
+        comment = CommentNode()
+        comment.add_text_content(text)
+        for node in reversed(stack):
+            if not node._closed:
+                node.add_child(comment)
+                break
+
+    def _handle_text(self, text: str, stack: list[HTMLNode]) -> None:
+        for node in reversed(stack):
+            if node.is_allowed_text_content:
+                node.add_text_content(text)
+                break
+
+    def _handle_new_node(
+        self, tag_name: str, attributes: dict[str, str], stack: list[HTMLNode]
+    ) -> None:
+        new_node = HTMLNode(tag_name, attributes)
+        for node in reversed(stack):
+            if node.can_contain(new_node.name):
+                node.add_child(new_node)
+                break
+            self._close_node(node, stack)
+        stack.append(new_node)
+
+    def _close_node(self, node: HTMLNode, stack: list[HTMLNode]) -> None:
+        node.close()
+        stack.pop()
+
+    def _close_node_by_name(self, name: str, stack: list[HTMLNode]) -> None:
+        for node in reversed(stack):
+            self._close_node(node, stack)
+            if node.name == name:
+                break
+
+    @staticmethod
+    def _parse_tag(tag: str) -> tuple[str, bool, dict[str, str]]:
+        if tag.startswith("<!"):
+            return HTMLNode.COMMENT_KEY, False, {}
+
+        matches = list(PARSE_TAG_PATTERN.finditer(tag))
+        if not matches:
+            return "text", False, {}
+
+        is_end_tag, tagname = matches[0].groups()[:2]
+        attrs = {m.group(3): m.group(4) for m in matches[1:]}
+
+        return tagname, bool(is_end_tag), attrs
+
+
+class HTMLSyntaxTree:
+    def __init__(self, root: HTMLNode) -> None:
+        self._root = root
+
+    def __iter__(self):
+        return iter(self._root)
+
+    @property
+    def root(self) -> HTMLNode:
+        return self._root
+
+    def to_html(self, indent: str = "    ") -> str:
+        return self._to_html(self.root, indent)
+
+    def _to_html(self, node: HTMLNode, indent: str, level: int = 0) -> str:
+        output = ""
+        output += indent * level + node.opening_tag + "\n"
+        if node.content:
+            output += indent * (level + 1) + node.content + "\n"
+        for child in node.children:
+            increment = 0 if isinstance(node, (RootNode,)) else 1
+            output += self._to_html(child, indent, level + increment)
+        if node.closing_tag:
+            output += indent * level + node.closing_tag + "\n"
+        return output
+
+    def display_tree(self, indent: str = "    "):
+        self._display_node(self.root, indent)
+
+    def _display_node(self, node: HTMLNode, indent: str, level: int = 0):
+        print(indent * level, node)
+        for child in node.children:
+            self._display_node(child, indent, level + 1)
+
+    @classmethod
+    def create(cls, builder: Builder) -> HTMLSyntaxTree:
+        root = NodeCreator().create(builder)
+        return cls(root)

From c24492d71b3efc3b52111ca25a3cce4ba35d14cd Mon Sep 17 00:00:00 2001
From: Adam Hopkins <adam@amhopkins.com>
Date: Mon, 28 Aug 2023 13:06:17 +0300
Subject: [PATCH 3/7] Both pretty and condensed HTML output

---
 html5tagger/parser.py | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/html5tagger/parser.py b/html5tagger/parser.py
index d1a4e20..ec5de2b 100644
--- a/html5tagger/parser.py
+++ b/html5tagger/parser.py
@@ -6,12 +6,6 @@
 from .builder import Builder
 from .html5 import omit_endtag
 
-try:
-    from rich import print
-except ImportError:
-    pass
-
-
 DEFAULT = object()
 # fmt: off
 BASIC_INLINE = {"a", "abbr", "area", "b", "bdi", "bdo", "br", "button", "cite",
@@ -388,8 +382,9 @@ def _parse_tag(tag: str) -> tuple[str, bool, dict[str, str]]:
 
 
 class HTMLSyntaxTree:
-    def __init__(self, root: HTMLNode) -> None:
+    def __init__(self, root: HTMLNode, builder: Builder) -> None:
         self._root = root
+        self._builder = builder
 
     def __iter__(self):
         return iter(self._root)
@@ -398,7 +393,9 @@ def __iter__(self):
     def root(self) -> HTMLNode:
         return self._root
 
-    def to_html(self, indent: str = "    ") -> str:
+    def to_html(self, pretty: bool = False, indent: str = "    ") -> str:
+        if not pretty:
+            return str(self._builder)
         return self._to_html(self.root, indent)
 
     def _to_html(self, node: HTMLNode, indent: str, level: int = 0) -> str:
@@ -424,4 +421,4 @@ def _display_node(self, node: HTMLNode, indent: str, level: int = 0):
     @classmethod
     def create(cls, builder: Builder) -> HTMLSyntaxTree:
         root = NodeCreator().create(builder)
-        return cls(root)
+        return cls(root, builder)

From 9189b03131d1e59680a505f5d9e69ef2f0e28623 Mon Sep 17 00:00:00 2001
From: Adam Hopkins <adam@amhopkins.com>
Date: Mon, 28 Aug 2023 13:06:58 +0300
Subject: [PATCH 4/7] squash: restore original import order in builder.py

---
 html5tagger/builder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/html5tagger/builder.py b/html5tagger/builder.py
index 93ab010..ee76d3b 100644
--- a/html5tagger/builder.py
+++ b/html5tagger/builder.py
@@ -1,5 +1,5 @@
 from .html5 import omit_endtag
-from .util import attributes, esc_script, esc_style, escape, escape_special, mangle
+from .util import mangle, escape, escape_special, esc_script, esc_style, attributes
 
 
 class Builder:

From 246af4c0dc001c6f1bac7ccc6baa5f824491b2c8 Mon Sep 17 00:00:00 2001
From: Adam Hopkins <adam@amhopkins.com>
Date: Mon, 28 Aug 2023 13:13:18 +0300
Subject: [PATCH 5/7] Text key and text element check

---
 html5tagger/parser.py | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/html5tagger/parser.py b/html5tagger/parser.py
index ec5de2b..9bdea89 100644
--- a/html5tagger/parser.py
+++ b/html5tagger/parser.py
@@ -29,6 +29,19 @@
                  "pre", "progress", "section", "table", "template", "time",
                  "ul"})
 ROOT_CONTENT = {"html", "body"}
+TEXT_ALLOWED_ELEMENTS = {"a", "abbr", "address", "article", "aside", "b",
+                         "bdi", "bdo", "blockquote", "button", "caption",
+                         "cite", "code", "data", "datalist", "dd", "del",
+                         "details", "dfn", "div", "dl", "dt", "em", "fieldset",
+                         "figcaption", "figure", "footer", "form", "h1", "h2",
+                         "h3", "h4", "h5", "h6", "header", "hgroup", "i",
+                         "ins", "kbd", "label", "legend", "li", "main", "mark",
+                         "menu", "meter", "nav", "noscript", "ol", "option",
+                         "output", "p", "pre", "progress", "q", "rb", "rp",
+                         "rt", "rtc", "ruby", "s", "samp", "section", "select",
+                         "small", "span", "strong", "sub", "summary", "sup",
+                         "table", "tbody", "td", "textarea", "tfoot", "th",
+                         "thead", "time", "tr", "u", "ul", "var"}
 ALLOWED_CONTENT_MODEL = {
     # Transparent items specifically are not included
     #     - a
@@ -154,6 +167,7 @@ class HTMLNode:
     ROOT_KEY = "__root__"
     DOCTYPE_KEY = "__doctype__"
     COMMENT_KEY = "__comment__"
+    TEXT_KEY = "__text__"
     DOCTYPE = "<!DOCTYPE html>"
 
     __slots__ = (
@@ -176,7 +190,7 @@ def __init__(self, name: str, attributes: dict[str, str]) -> None:
         self._closed = False
 
     def __repr__(self) -> str:
-        display = f"{self._name!r}, {self._attributes!r}, {self._content!r}"
+        display = f"{self._name!r}, {self._attributes!r}"
         return f"HTMLNode({display})"
 
     def __str__(self) -> str:
@@ -241,7 +255,7 @@ def closing_tag(self):
 
     @property
     def is_allowed_text_content(self) -> bool:
-        return True
+        return self._name in TEXT_ALLOWED_ELEMENTS
 
     def close(self) -> None:
         for child in (child for child in self._children if not child._closed):
@@ -308,7 +322,7 @@ def create(self, builder: Builder) -> HTMLNode:
             elif tag_name == HTMLNode.COMMENT_KEY:
                 self._handle_comment(piece[4:-3], stack)
                 continue
-            elif tag_name == "text":
+            elif tag_name == HTMLNode.TEXT_KEY:
                 self._handle_text(piece, stack)
                 continue
 
@@ -373,7 +387,7 @@ def _parse_tag(tag: str) -> tuple[str, bool, dict[str, str]]:
 
         matches = list(PARSE_TAG_PATTERN.finditer(tag))
         if not matches:
-            return "text", False, {}
+            return HTMLNode.TEXT_KEY, False, {}
 
         is_end_tag, tagname = matches[0].groups()[:2]
         attrs = {m.group(3): m.group(4) for m in matches[1:]}

From 43ef7ece2ffeba6c1bec0cfe7479b4ceff2ba9e7 Mon Sep 17 00:00:00 2001
From: Adam Hopkins <adam@amhopkins.com>
Date: Mon, 28 Aug 2023 13:14:23 +0300
Subject: [PATCH 6/7] Remove formatting comments

---
 html5tagger/parser.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/html5tagger/parser.py b/html5tagger/parser.py
index 9bdea89..de1149b 100644
--- a/html5tagger/parser.py
+++ b/html5tagger/parser.py
@@ -7,7 +7,6 @@
 from .html5 import omit_endtag
 
 DEFAULT = object()
-# fmt: off
 BASIC_INLINE = {"a", "abbr", "area", "b", "bdi", "bdo", "br", "button", "cite",
                 "code", "data", "datalist", "del", "dfn", "em", "i", "input",
                 "ins", "kbd", "label", "map", "mark", "meter", "noscript",
@@ -57,7 +56,7 @@
     # Also specifically left out are special elements
     #     - template
     "abbr": PHRASING_CONTENT,
-    "address": (FLOW_CONTENT - HEADING_CONTENT - SECTIONING_CONTENT - {"address", "header", "footer"}),  # noqa: E501
+    "address": (FLOW_CONTENT - HEADING_CONTENT - SECTIONING_CONTENT - {"address", "header", "footer"}),
     "area": None,
     "article": FLOW_CONTENT,
     "aside": FLOW_CONTENT,
@@ -83,7 +82,7 @@
     "dialog": FLOW_CONTENT,
     "div": FLOW_CONTENT | SCRIPT_SUPPORTING_ELEMENTS | {"dt", "dd"},
     "dl": {"dt", "dd", "div"} | SCRIPT_SUPPORTING_ELEMENTS,
-    "dt": (FLOW_CONTENT - {"header", "footer"} - SECTIONING_CONTENT - HEADING_CONTENT),  # noqa: E501
+    "dt": (FLOW_CONTENT - {"header", "footer"} - SECTIONING_CONTENT - HEADING_CONTENT),
     "em": PHRASING_CONTENT,
     "embed": None,
     "fieldset": {"legend"} | FLOW_CONTENT,
@@ -141,12 +140,12 @@
     "sub": PHRASING_CONTENT,
     "summary": PHRASING_CONTENT | HEADING_CONTENT,
     "sup": PHRASING_CONTENT,
-    "table": {"tbody", "thead", "tfoot", "tr", "caption", "colgroup"} | SCRIPT_SUPPORTING_ELEMENTS,  # noqa: E501
+    "table": {"tbody", "thead", "tfoot", "tr", "caption", "colgroup"} | SCRIPT_SUPPORTING_ELEMENTS,
     "tbody": {"tr"},
     "td": FLOW_CONTENT,
     "textarea": None,
     "tfoot": {"tr"},
-    "th": FLOW_CONTENT - {"header", "footer"} - SECTIONING_CONTENT - HEADING_CONTENT,  # noqa: E501
+    "th": FLOW_CONTENT - {"header", "footer"} - SECTIONING_CONTENT - HEADING_CONTENT,
     "thead": {"tr"},
     "time": PHRASING_CONTENT,
     "title": None,
@@ -160,7 +159,6 @@
 TAG_MATCH_PATTERN = re.compile(r"<(\w+)")
 ATTRIBUTE_MATCH_PATTERN = re.compile(r'(\w+)=("[^"]*"|\w+)')
 PARSE_TAG_PATTERN = re.compile(r"<(/?)(\w+)|([\w-]+)=('[^']*'|\"[^\"]*\"|\w+)")
-# fmt: on
 
 
 class HTMLNode:

From fcb2aab0b0df926a425a0bba97f36ad155a34b33 Mon Sep 17 00:00:00 2001
From: Adam Hopkins <adam@amhopkins.com>
Date: Mon, 28 Aug 2023 13:51:42 +0300
Subject: [PATCH 7/7] Nicer single line display

---
 html5tagger/parser.py | 47 ++++++++++++++++++++++++++++++++++---------
 1 file changed, 38 insertions(+), 9 deletions(-)

diff --git a/html5tagger/parser.py b/html5tagger/parser.py
index de1149b..1ef0b8e 100644
--- a/html5tagger/parser.py
+++ b/html5tagger/parser.py
@@ -405,22 +405,51 @@ def __iter__(self):
     def root(self) -> HTMLNode:
         return self._root
 
-    def to_html(self, pretty: bool = False, indent: str = "    ") -> str:
+    def to_html(
+        self,
+        pretty: bool = False,
+        indent: str = "    ",
+        single_line_max_length: int = 60,
+    ) -> str:
         if not pretty:
             return str(self._builder)
-        return self._to_html(self.root, indent)
-
-    def _to_html(self, node: HTMLNode, indent: str, level: int = 0) -> str:
+        return self._to_html(self.root, indent, single_line_max_length)
+
+    def _to_html(
+        self,
+        node: HTMLNode,
+        indent: str,
+        single_line_max_length: int,
+        level: int = 0
+    ) -> str:
         output = ""
-        output += indent * level + node.opening_tag + "\n"
+        new_line = "\n"
+        opening_prefix = indent * level
+        closing_prefix = opening_prefix
+        content_prefix = indent * (level + 1)
+        line_length = len(node.opening_tag) + len(node.content) + len(node.closing_tag) + len(opening_prefix)
+        single_line = False
+        if (
+            not node.children
+            and single_line_max_length
+            and line_length < single_line_max_length
+        ):
+            new_line = ""
+            content_prefix = ""
+            closing_prefix = ""
+            single_line = True
+        output += opening_prefix + node.opening_tag + new_line
         if node.content:
-            output += indent * (level + 1) + node.content + "\n"
+            output += content_prefix + node.content + new_line
+        ending = "\n" if node.closing_tag or single_line else ""
         for child in node.children:
             increment = 0 if isinstance(node, (RootNode,)) else 1
-            output += self._to_html(child, indent, level + increment)
+            output += self._to_html(
+                child, indent, single_line_max_length, level + increment
+            )
         if node.closing_tag:
-            output += indent * level + node.closing_tag + "\n"
-        return output
+            output += closing_prefix + node.closing_tag
+        return output + ending
 
     def display_tree(self, indent: str = "    "):
         self._display_node(self.root, indent)