From 1b73e1f4ee0bb6a3d9d8f8be59a562712412ace8 Mon Sep 17 00:00:00 2001 From: Adam Hopkins Date: Sun, 5 Feb 2023 14:49:09 +0200 Subject: [PATCH 1/7] Python 3.7 compat --- html5tagger/builder.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/html5tagger/builder.py b/html5tagger/builder.py index 7180b06..93ab010 100644 --- a/html5tagger/builder.py +++ b/html5tagger/builder.py @@ -1,5 +1,5 @@ from .html5 import omit_endtag -from .util import mangle, escape, escape_special, esc_script, esc_style, attributes +from .util import attributes, esc_script, esc_style, escape, escape_special, mangle class Builder: @@ -23,7 +23,11 @@ def _clear(self): @property def _allpieces(self): - return *self._pieces, self._endtag, *self._stack[::-1] + retval = [] + retval.extend(self._pieces) + retval.append(self._endtag) + retval.extend(self._stack[::-1]) + return tuple(retval) def _endtag_close(self): if self._endtag: From e68288537c4f8b99f1f1ec0b2892b7d23e345bf7 Mon Sep 17 00:00:00 2001 From: Adam Hopkins Date: Mon, 28 Aug 2023 12:57:17 +0300 Subject: [PATCH 2/7] Add HTML5 parser --- html5tagger/parser.py | 427 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 427 insertions(+) create mode 100644 html5tagger/parser.py diff --git a/html5tagger/parser.py b/html5tagger/parser.py new file mode 100644 index 0000000..d1a4e20 --- /dev/null +++ b/html5tagger/parser.py @@ -0,0 +1,427 @@ +from __future__ import annotations + +import re +from typing import List, cast + +from .builder import Builder +from .html5 import omit_endtag + +try: + from rich import print +except ImportError: + pass + + +DEFAULT = object() +# fmt: off +BASIC_INLINE = {"a", "abbr", "area", "b", "bdi", "bdo", "br", "button", "cite", + "code", "data", "datalist", "del", "dfn", "em", "i", "input", + "ins", "kbd", "label", "map", "mark", "meter", "noscript", + "output", "q", "ruby", "s", "samp", "select", "slot", "small", + "span", "strong", "sub", "sup", "u", "var", "wbr"} +MEDIA_ELEMENTS = {"audio", "canvas", "embed", "iframe", "img", "object", + "picture", "svg", "video"} +FORM_ELEMENTS = {"fieldset", "form", "option", "textarea"} +SCRIPT_SUPPORTING_ELEMENTS = {"script", "template"} +HEADING_ELEMENTS = {"h1", "h2", "h3", "h4", "h5", "h6"} +PHRASING_CONTENT = BASIC_INLINE | MEDIA_ELEMENTS | {"time", "template"} +HEADING_CONTENT = PHRASING_CONTENT | HEADING_ELEMENTS +SECTIONING_CONTENT = {"article", "aside", "nav", "section"} +FLOW_CONTENT = (BASIC_INLINE | MEDIA_ELEMENTS | FORM_ELEMENTS | + {"address", "article", "aside", "blockquote", "caption", + "details", "dialog", "div", "dl", "dt", "fieldset", "figure", + "footer", "h1", "h2", "h3", "h4", "h5", "h6", "header", + "hgroup", "hr", "main", "math", "menu", "nav", "ol", "p", + "pre", "progress", "section", "table", "template", "time", + "ul"}) +ROOT_CONTENT = {"html", "body"} +ALLOWED_CONTENT_MODEL = { + # Transparent items specifically are not included + # - a + # - ins + # - del + # - map + # - object + # - video + # - audio + # - noscript + # - slot + # - canvas + # Also specifically left out are special elements + # - template + "abbr": PHRASING_CONTENT, + "address": (FLOW_CONTENT - HEADING_CONTENT - SECTIONING_CONTENT - {"address", "header", "footer"}), # noqa: E501 + "area": None, + "article": FLOW_CONTENT, + "aside": FLOW_CONTENT, + "b": PHRASING_CONTENT, + "bdi": PHRASING_CONTENT, + "bdo": PHRASING_CONTENT, + "blockquote": FLOW_CONTENT, + "body": FLOW_CONTENT, + "br": None, + "br": None, + "button": PHRASING_CONTENT, + "canvas": None, + "caption": FLOW_CONTENT - {"table"}, + "cite": PHRASING_CONTENT, + "code": PHRASING_CONTENT, + "col": None, + "colgroup": {"col", "template"}, + "data": PHRASING_CONTENT, + "datalist": {"option"} | SCRIPT_SUPPORTING_ELEMENTS | PHRASING_CONTENT, + "dd": FLOW_CONTENT, + "details": FLOW_CONTENT - {"summary"}, + "dfn": PHRASING_CONTENT - {"dfn"}, + "dialog": FLOW_CONTENT, + "div": FLOW_CONTENT | SCRIPT_SUPPORTING_ELEMENTS | {"dt", "dd"}, + "dl": {"dt", "dd", "div"} | SCRIPT_SUPPORTING_ELEMENTS, + "dt": (FLOW_CONTENT - {"header", "footer"} - SECTIONING_CONTENT - HEADING_CONTENT), # noqa: E501 + "em": PHRASING_CONTENT, + "embed": None, + "fieldset": {"legend"} | FLOW_CONTENT, + "figcaption": FLOW_CONTENT, + "figure": {"figcaption"} | FLOW_CONTENT, + "footer": FLOW_CONTENT - {"header", "footer"}, + "form": FLOW_CONTENT - {"form"}, + "h1": PHRASING_CONTENT, + "h2": PHRASING_CONTENT, + "h3": PHRASING_CONTENT, + "h4": PHRASING_CONTENT, + "h5": PHRASING_CONTENT, + "h6": PHRASING_CONTENT, + "header": FLOW_CONTENT - {"header", "footer"}, + "hgroup": HEADING_ELEMENTS | {"p"}, + "hr": None, + "i": PHRASING_CONTENT, + "iframe": None, + "img": None, + "input": None, + "kbd": PHRASING_CONTENT, + "label": PHRASING_CONTENT - {"label"}, + "legend": PHRASING_CONTENT | HEADING_CONTENT, + "li": FLOW_CONTENT, + "link": None, + "main": FLOW_CONTENT, + "mark": PHRASING_CONTENT, + "menu": {"li"} | SCRIPT_SUPPORTING_ELEMENTS, + "meta": None, + "meter": PHRASING_CONTENT - {"meter"}, + "nav": FLOW_CONTENT, + "ol": {"li"} | SCRIPT_SUPPORTING_ELEMENTS, + "optgroup": {"option"} | SCRIPT_SUPPORTING_ELEMENTS, + "option": None, + "output": PHRASING_CONTENT, + "p": PHRASING_CONTENT, + "picture": {"source", "img"}, + "pre": PHRASING_CONTENT, + "progress": PHRASING_CONTENT - {"progress"}, + "q": PHRASING_CONTENT, + "rp": None, + "rt": PHRASING_CONTENT, + "ruby": PHRASING_CONTENT | {"rt", "rp"}, + "s": PHRASING_CONTENT, + "samp": PHRASING_CONTENT, + "script": None, + "search": FLOW_CONTENT, + "section": FLOW_CONTENT, + "select": {"option", "optgroup", "hr"}, + "small": PHRASING_CONTENT, + "source": None, + "span": PHRASING_CONTENT, + "strong": PHRASING_CONTENT, + "style": None, + "sub": PHRASING_CONTENT, + "summary": PHRASING_CONTENT | HEADING_CONTENT, + "sup": PHRASING_CONTENT, + "table": {"tbody", "thead", "tfoot", "tr", "caption", "colgroup"} | SCRIPT_SUPPORTING_ELEMENTS, # noqa: E501 + "tbody": {"tr"}, + "td": FLOW_CONTENT, + "textarea": None, + "tfoot": {"tr"}, + "th": FLOW_CONTENT - {"header", "footer"} - SECTIONING_CONTENT - HEADING_CONTENT, # noqa: E501 + "thead": {"tr"}, + "time": PHRASING_CONTENT, + "title": None, + "tr": {"th", "td"}, + "track": None, + "u": PHRASING_CONTENT, + "ul": {"li"} | SCRIPT_SUPPORTING_ELEMENTS, + "var": PHRASING_CONTENT, + "wbr": None, +} +TAG_MATCH_PATTERN = re.compile(r"<(\w+)") +ATTRIBUTE_MATCH_PATTERN = re.compile(r'(\w+)=("[^"]*"|\w+)') +PARSE_TAG_PATTERN = re.compile(r"<(/?)(\w+)|([\w-]+)=('[^']*'|\"[^\"]*\"|\w+)") +# fmt: on + + +class HTMLNode: + ROOT_KEY = "__root__" + DOCTYPE_KEY = "__doctype__" + COMMENT_KEY = "__comment__" + DOCTYPE = "" + + __slots__ = ( + "_name", + "_attributes", + "_content", + "_children", + "_endtag", + "_parent", + "_closed", + ) + + def __init__(self, name: str, attributes: dict[str, str]) -> None: + self._name = name + self._attributes = attributes + self._content = "" + self._children: List[HTMLNode] = [] + self._endtag = name not in omit_endtag + self._parent: HTMLNode | None = None + self._closed = False + + def __repr__(self) -> str: + display = f"{self._name!r}, {self._attributes!r}, {self._content!r}" + return f"HTMLNode({display})" + + def __str__(self) -> str: + return f"" + + def add_child(self, child: HTMLNode) -> None: + if self._closed: + message = ( + f"Tag {self._name!r} is already closed. " + f"Trying to add {child}" + ) + raise ValueError(message) + self._children.append(child) + + def add_text_content(self, text: str) -> None: + if self._closed: + message = ( + f"Tag {self._name!r} is already closed. " + "Trying to add text content." + ) + raise ValueError(message) + self._content += text + + def can_contain(self, tag_name: str) -> bool: + limited = ALLOWED_CONTENT_MODEL.get(self._name, DEFAULT) + if limited is DEFAULT: + return True + if not limited: + return False + return tag_name in cast(set[str], limited) + + def __iter__(self): + return iter(self._children) + + @property + def children(self): + return self._children + + @property + def content(self): + return self._content + + @property + def name(self): + return self._name + + @property + def opening_tag(self): + tag = f"<{self._name}" + if self._attributes: + tag += " " + " ".join( + f"{key}={value}" for key, value in self._attributes.items() + ) + tag += ">" + return tag + + @property + def closing_tag(self): + if not self._endtag: + return "" + return f"" + + @property + def is_allowed_text_content(self) -> bool: + return True + + def close(self) -> None: + for child in (child for child in self._children if not child._closed): + child.close() + self._closed = True + + +class RootNode(HTMLNode): + def __init__(self) -> None: + super().__init__(HTMLNode.ROOT_KEY, {}) + self._endtag = False + + @property + def opening_tag(self): + return "" + + def can_contain(self, tag_name: str) -> bool: + return True + + +class DoctypeNode(HTMLNode): + def __init__(self) -> None: + super().__init__(HTMLNode.DOCTYPE_KEY, {}) + self._endtag = False + + @property + def opening_tag(self): + return self.DOCTYPE + + +class CommentNode(HTMLNode): + def __init__(self) -> None: + super().__init__(HTMLNode.COMMENT_KEY, {}) + self._endtag = False + + @property + def opening_tag(self): + return f"" + + @property + def content(self): + return "" + + +class NodeCreator: + def create(self, builder: Builder) -> HTMLNode: + root = self._create_root() + stack: list[HTMLNode] = [root] + + for piece in [ + p + for maybe_tag in builder._allpieces + for p in ( + maybe_tag._allpieces + if isinstance(maybe_tag, Builder) + else [maybe_tag] + ) + ]: + tag_name, is_closing, attributes = self._parse_tag(piece) + + if piece == HTMLNode.DOCTYPE: + self._handle_doctype(stack) + continue + elif tag_name == HTMLNode.COMMENT_KEY: + self._handle_comment(piece[4:-3], stack) + continue + elif tag_name == "text": + self._handle_text(piece, stack) + continue + + if is_closing: + self._close_node_by_name(tag_name, stack) + continue + + self._handle_new_node(tag_name, attributes, stack) + + self._close_node(root, stack) + + return root + + def _create_root(self) -> RootNode: + return RootNode() + + def _handle_doctype(self, stack: list[HTMLNode]) -> None: + if len(stack) > 1: + raise ValueError("Doctype must be the first element") + doctype = DoctypeNode() + stack[0].add_child(doctype) + + def _handle_comment(self, text: str, stack: list[HTMLNode]) -> None: + comment = CommentNode() + comment.add_text_content(text) + for node in reversed(stack): + if not node._closed: + node.add_child(comment) + break + + def _handle_text(self, text: str, stack: list[HTMLNode]) -> None: + for node in reversed(stack): + if node.is_allowed_text_content: + node.add_text_content(text) + break + + def _handle_new_node( + self, tag_name: str, attributes: dict[str, str], stack: list[HTMLNode] + ) -> None: + new_node = HTMLNode(tag_name, attributes) + for node in reversed(stack): + if node.can_contain(new_node.name): + node.add_child(new_node) + break + self._close_node(node, stack) + stack.append(new_node) + + def _close_node(self, node: HTMLNode, stack: list[HTMLNode]) -> None: + node.close() + stack.pop() + + def _close_node_by_name(self, name: str, stack: list[HTMLNode]) -> None: + for node in reversed(stack): + self._close_node(node, stack) + if node.name == name: + break + + @staticmethod + def _parse_tag(tag: str) -> tuple[str, bool, dict[str, str]]: + if tag.startswith(" None: + self._root = root + + def __iter__(self): + return iter(self._root) + + @property + def root(self) -> HTMLNode: + return self._root + + def to_html(self, indent: str = " ") -> str: + return self._to_html(self.root, indent) + + def _to_html(self, node: HTMLNode, indent: str, level: int = 0) -> str: + output = "" + output += indent * level + node.opening_tag + "\n" + if node.content: + output += indent * (level + 1) + node.content + "\n" + for child in node.children: + increment = 0 if isinstance(node, (RootNode,)) else 1 + output += self._to_html(child, indent, level + increment) + if node.closing_tag: + output += indent * level + node.closing_tag + "\n" + return output + + def display_tree(self, indent: str = " "): + self._display_node(self.root, indent) + + def _display_node(self, node: HTMLNode, indent: str, level: int = 0): + print(indent * level, node) + for child in node.children: + self._display_node(child, indent, level + 1) + + @classmethod + def create(cls, builder: Builder) -> HTMLSyntaxTree: + root = NodeCreator().create(builder) + return cls(root) From c24492d71b3efc3b52111ca25a3cce4ba35d14cd Mon Sep 17 00:00:00 2001 From: Adam Hopkins Date: Mon, 28 Aug 2023 13:06:17 +0300 Subject: [PATCH 3/7] Both pretty and condensed HTML output --- html5tagger/parser.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/html5tagger/parser.py b/html5tagger/parser.py index d1a4e20..ec5de2b 100644 --- a/html5tagger/parser.py +++ b/html5tagger/parser.py @@ -6,12 +6,6 @@ from .builder import Builder from .html5 import omit_endtag -try: - from rich import print -except ImportError: - pass - - DEFAULT = object() # fmt: off BASIC_INLINE = {"a", "abbr", "area", "b", "bdi", "bdo", "br", "button", "cite", @@ -388,8 +382,9 @@ def _parse_tag(tag: str) -> tuple[str, bool, dict[str, str]]: class HTMLSyntaxTree: - def __init__(self, root: HTMLNode) -> None: + def __init__(self, root: HTMLNode, builder: Builder) -> None: self._root = root + self._builder = builder def __iter__(self): return iter(self._root) @@ -398,7 +393,9 @@ def __iter__(self): def root(self) -> HTMLNode: return self._root - def to_html(self, indent: str = " ") -> str: + def to_html(self, pretty: bool = False, indent: str = " ") -> str: + if not pretty: + return str(self._builder) return self._to_html(self.root, indent) def _to_html(self, node: HTMLNode, indent: str, level: int = 0) -> str: @@ -424,4 +421,4 @@ def _display_node(self, node: HTMLNode, indent: str, level: int = 0): @classmethod def create(cls, builder: Builder) -> HTMLSyntaxTree: root = NodeCreator().create(builder) - return cls(root) + return cls(root, builder) From 9189b03131d1e59680a505f5d9e69ef2f0e28623 Mon Sep 17 00:00:00 2001 From: Adam Hopkins Date: Mon, 28 Aug 2023 13:06:58 +0300 Subject: [PATCH 4/7] squash --- html5tagger/builder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html5tagger/builder.py b/html5tagger/builder.py index 93ab010..ee76d3b 100644 --- a/html5tagger/builder.py +++ b/html5tagger/builder.py @@ -1,5 +1,5 @@ from .html5 import omit_endtag -from .util import attributes, esc_script, esc_style, escape, escape_special, mangle +from .util import mangle, escape, escape_special, esc_script, esc_style, attributes class Builder: From 246af4c0dc001c6f1bac7ccc6baa5f824491b2c8 Mon Sep 17 00:00:00 2001 From: Adam Hopkins Date: Mon, 28 Aug 2023 13:13:18 +0300 Subject: [PATCH 5/7] Text key and text element check --- html5tagger/parser.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/html5tagger/parser.py b/html5tagger/parser.py index ec5de2b..9bdea89 100644 --- a/html5tagger/parser.py +++ b/html5tagger/parser.py @@ -29,6 +29,19 @@ "pre", "progress", "section", "table", "template", "time", "ul"}) ROOT_CONTENT = {"html", "body"} +TEXT_ALLOWED_ELEMENTS = {"a", "abbr", "address", "article", "aside", "b", + "bdi", "bdo", "blockquote", "button", "caption", + "cite", "code", "data", "datalist", "dd", "del", + "details", "dfn", "div", "dl", "dt", "em", "fieldset", + "figcaption", "figure", "footer", "form", "h1", "h2", + "h3", "h4", "h5", "h6", "header", "hgroup", "i", + "ins", "kbd", "label", "legend", "li", "main", "mark", + "menu", "meter", "nav", "noscript", "ol", "option", + "output", "p", "pre", "progress", "q", "rb", "rp", + "rt", "rtc", "ruby", "s", "samp", "section", "select", + "small", "span", "strong", "sub", "summary", "sup", + "table", "tbody", "td", "textarea", "tfoot", "th", + "thead", "time", "tr", "u", "ul", "var"} ALLOWED_CONTENT_MODEL = { # Transparent items specifically are not included # - a @@ -154,6 +167,7 @@ class HTMLNode: ROOT_KEY = "__root__" DOCTYPE_KEY = "__doctype__" COMMENT_KEY = "__comment__" + TEXT_KEY = "__text__" DOCTYPE = "" __slots__ = ( @@ -176,7 +190,7 @@ def __init__(self, name: str, attributes: dict[str, str]) -> None: self._closed = False def __repr__(self) -> str: - display = f"{self._name!r}, {self._attributes!r}, {self._content!r}" + display = f"{self._name!r}, {self._attributes!r}" return f"HTMLNode({display})" def __str__(self) -> str: @@ -241,7 +255,7 @@ def closing_tag(self): @property def is_allowed_text_content(self) -> bool: - return True + return self._name in TEXT_ALLOWED_ELEMENTS def close(self) -> None: for child in (child for child in self._children if not child._closed): @@ -308,7 +322,7 @@ def create(self, builder: Builder) -> HTMLNode: elif tag_name == HTMLNode.COMMENT_KEY: self._handle_comment(piece[4:-3], stack) continue - elif tag_name == "text": + elif tag_name == HTMLNode.TEXT_KEY: self._handle_text(piece, stack) continue @@ -373,7 +387,7 @@ def _parse_tag(tag: str) -> tuple[str, bool, dict[str, str]]: matches = list(PARSE_TAG_PATTERN.finditer(tag)) if not matches: - return "text", False, {} + return HTMLNode.TEXT_KEY, False, {} is_end_tag, tagname = matches[0].groups()[:2] attrs = {m.group(3): m.group(4) for m in matches[1:]} From 43ef7ece2ffeba6c1bec0cfe7479b4ceff2ba9e7 Mon Sep 17 00:00:00 2001 From: Adam Hopkins Date: Mon, 28 Aug 2023 13:14:23 +0300 Subject: [PATCH 6/7] Remove formatting comments --- html5tagger/parser.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/html5tagger/parser.py b/html5tagger/parser.py index 9bdea89..de1149b 100644 --- a/html5tagger/parser.py +++ b/html5tagger/parser.py @@ -7,7 +7,6 @@ from .html5 import omit_endtag DEFAULT = object() -# fmt: off BASIC_INLINE = {"a", "abbr", "area", "b", "bdi", "bdo", "br", "button", "cite", "code", "data", "datalist", "del", "dfn", "em", "i", "input", "ins", "kbd", "label", "map", "mark", "meter", "noscript", @@ -57,7 +56,7 @@ # Also specifically left out are special elements # - template "abbr": PHRASING_CONTENT, - "address": (FLOW_CONTENT - HEADING_CONTENT - SECTIONING_CONTENT - {"address", "header", "footer"}), # noqa: E501 + "address": (FLOW_CONTENT - HEADING_CONTENT - SECTIONING_CONTENT - {"address", "header", "footer"}), "area": None, "article": FLOW_CONTENT, "aside": FLOW_CONTENT, @@ -83,7 +82,7 @@ "dialog": FLOW_CONTENT, "div": FLOW_CONTENT | SCRIPT_SUPPORTING_ELEMENTS | {"dt", "dd"}, "dl": {"dt", "dd", "div"} | SCRIPT_SUPPORTING_ELEMENTS, - "dt": (FLOW_CONTENT - {"header", "footer"} - SECTIONING_CONTENT - HEADING_CONTENT), # noqa: E501 + "dt": (FLOW_CONTENT - {"header", "footer"} - SECTIONING_CONTENT - HEADING_CONTENT), "em": PHRASING_CONTENT, "embed": None, "fieldset": {"legend"} | FLOW_CONTENT, @@ -141,12 +140,12 @@ "sub": PHRASING_CONTENT, "summary": PHRASING_CONTENT | HEADING_CONTENT, "sup": PHRASING_CONTENT, - "table": {"tbody", "thead", "tfoot", "tr", "caption", "colgroup"} | SCRIPT_SUPPORTING_ELEMENTS, # noqa: E501 + "table": {"tbody", "thead", "tfoot", "tr", "caption", "colgroup"} | SCRIPT_SUPPORTING_ELEMENTS, "tbody": {"tr"}, "td": FLOW_CONTENT, "textarea": None, "tfoot": {"tr"}, - "th": FLOW_CONTENT - {"header", "footer"} - SECTIONING_CONTENT - HEADING_CONTENT, # noqa: E501 + "th": FLOW_CONTENT - {"header", "footer"} - SECTIONING_CONTENT - HEADING_CONTENT, "thead": {"tr"}, "time": PHRASING_CONTENT, "title": None, @@ -160,7 +159,6 @@ TAG_MATCH_PATTERN = re.compile(r"<(\w+)") ATTRIBUTE_MATCH_PATTERN = re.compile(r'(\w+)=("[^"]*"|\w+)') PARSE_TAG_PATTERN = re.compile(r"<(/?)(\w+)|([\w-]+)=('[^']*'|\"[^\"]*\"|\w+)") -# fmt: on class HTMLNode: From fcb2aab0b0df926a425a0bba97f36ad155a34b33 Mon Sep 17 00:00:00 2001 From: Adam Hopkins Date: Mon, 28 Aug 2023 13:51:42 +0300 Subject: [PATCH 7/7] Nicer single line display --- html5tagger/parser.py | 47 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 9 deletions(-) diff --git a/html5tagger/parser.py b/html5tagger/parser.py index de1149b..1ef0b8e 100644 --- a/html5tagger/parser.py +++ b/html5tagger/parser.py @@ -405,22 +405,51 @@ def __iter__(self): def root(self) -> HTMLNode: return self._root - def to_html(self, pretty: bool = False, indent: str = " ") -> str: + def to_html( + self, + pretty: bool = False, + indent: str = " ", + single_line_max_length: int = 60, + ) -> str: if not pretty: return str(self._builder) - return self._to_html(self.root, indent) - - def _to_html(self, node: HTMLNode, indent: str, level: int = 0) -> str: + return self._to_html(self.root, indent, single_line_max_length) + + def _to_html( + self, + node: HTMLNode, + indent: str, + single_line_max_length: int, + level: int = 0 + ) -> str: output = "" - output += indent * level + node.opening_tag + "\n" + new_line = "\n" + opening_prefix = indent * level + closing_prefix = opening_prefix + content_prefix = indent * (level + 1) + line_length = len(node.opening_tag) + len(node.content) + len(node.closing_tag) + len(opening_prefix) + single_line = False + if ( + not node.children + and single_line_max_length + and line_length < single_line_max_length + ): + new_line = "" + content_prefix = "" + closing_prefix = "" + single_line = True + output += opening_prefix + node.opening_tag + new_line if node.content: - output += indent * (level + 1) + node.content + "\n" + output += content_prefix + node.content + new_line + ending = "\n" if node.closing_tag or single_line else "" for child in node.children: increment = 0 if isinstance(node, (RootNode,)) else 1 - output += self._to_html(child, indent, level + increment) + output += self._to_html( + child, indent, single_line_max_length, level + increment + ) if node.closing_tag: - output += indent * level + node.closing_tag + "\n" - return output + output += closing_prefix + node.closing_tag + return output + ending def display_tree(self, indent: str = " "): self._display_node(self.root, indent)