diff --git a/html5tagger/parser.py b/html5tagger/parser.py new file mode 100644 index 0000000..1ef0b8e --- /dev/null +++ b/html5tagger/parser.py @@ -0,0 +1,465 @@ +from __future__ import annotations + +import re +from typing import List, cast + +from .builder import Builder +from .html5 import omit_endtag + +DEFAULT = object() +BASIC_INLINE = {"a", "abbr", "area", "b", "bdi", "bdo", "br", "button", "cite", + "code", "data", "datalist", "del", "dfn", "em", "i", "input", + "ins", "kbd", "label", "map", "mark", "meter", "noscript", + "output", "q", "ruby", "s", "samp", "select", "slot", "small", + "span", "strong", "sub", "sup", "u", "var", "wbr"} +MEDIA_ELEMENTS = {"audio", "canvas", "embed", "iframe", "img", "object", + "picture", "svg", "video"} +FORM_ELEMENTS = {"fieldset", "form", "option", "textarea"} +SCRIPT_SUPPORTING_ELEMENTS = {"script", "template"} +HEADING_ELEMENTS = {"h1", "h2", "h3", "h4", "h5", "h6"} +PHRASING_CONTENT = BASIC_INLINE | MEDIA_ELEMENTS | {"time", "template"} +HEADING_CONTENT = PHRASING_CONTENT | HEADING_ELEMENTS +SECTIONING_CONTENT = {"article", "aside", "nav", "section"} +FLOW_CONTENT = (BASIC_INLINE | MEDIA_ELEMENTS | FORM_ELEMENTS | + {"address", "article", "aside", "blockquote", "caption", + "details", "dialog", "div", "dl", "dt", "fieldset", "figure", + "footer", "h1", "h2", "h3", "h4", "h5", "h6", "header", + "hgroup", "hr", "main", "math", "menu", "nav", "ol", "p", + "pre", "progress", "section", "table", "template", "time", + "ul"}) +ROOT_CONTENT = {"html", "body"} +TEXT_ALLOWED_ELEMENTS = {"a", "abbr", "address", "article", "aside", "b", + "bdi", "bdo", "blockquote", "button", "caption", + "cite", "code", "data", "datalist", "dd", "del", + "details", "dfn", "div", "dl", "dt", "em", "fieldset", + "figcaption", "figure", "footer", "form", "h1", "h2", + "h3", "h4", "h5", "h6", "header", "hgroup", "i", + "ins", "kbd", "label", "legend", "li", "main", "mark", + "menu", "meter", "nav", "noscript", "ol", "option", + "output", "p", "pre", "progress", "q", "rb", "rp", + "rt", "rtc", "ruby", "s", "samp", "section", "select", + "small", "span", "strong", "sub", "summary", "sup", + "table", "tbody", "td", "textarea", "tfoot", "th", + "thead", "time", "tr", "u", "ul", "var"} +ALLOWED_CONTENT_MODEL = { + # Transparent items specifically are not included + # - a + # - ins + # - del + # - map + # - object + # - video + # - audio + # - noscript + # - slot + # - canvas + # Also specifically left out are special elements + # - template + "abbr": PHRASING_CONTENT, + "address": (FLOW_CONTENT - HEADING_CONTENT - SECTIONING_CONTENT - {"address", "header", "footer"}), + "area": None, + "article": FLOW_CONTENT, + "aside": FLOW_CONTENT, + "b": PHRASING_CONTENT, + "bdi": PHRASING_CONTENT, + "bdo": PHRASING_CONTENT, + "blockquote": FLOW_CONTENT, + "body": FLOW_CONTENT, + "br": None, + "br": None, + "button": PHRASING_CONTENT, + "canvas": None, + "caption": FLOW_CONTENT - {"table"}, + "cite": PHRASING_CONTENT, + "code": PHRASING_CONTENT, + "col": None, + "colgroup": {"col", "template"}, + "data": PHRASING_CONTENT, + "datalist": {"option"} | SCRIPT_SUPPORTING_ELEMENTS | PHRASING_CONTENT, + "dd": FLOW_CONTENT, + "details": FLOW_CONTENT - {"summary"}, + "dfn": PHRASING_CONTENT - {"dfn"}, + "dialog": FLOW_CONTENT, + "div": FLOW_CONTENT | SCRIPT_SUPPORTING_ELEMENTS | {"dt", "dd"}, + "dl": {"dt", "dd", "div"} | SCRIPT_SUPPORTING_ELEMENTS, + "dt": (FLOW_CONTENT - {"header", "footer"} - SECTIONING_CONTENT - HEADING_CONTENT), + "em": PHRASING_CONTENT, + "embed": None, + "fieldset": {"legend"} | FLOW_CONTENT, + "figcaption": FLOW_CONTENT, + "figure": {"figcaption"} | FLOW_CONTENT, + "footer": FLOW_CONTENT - {"header", "footer"}, + "form": FLOW_CONTENT - {"form"}, + "h1": PHRASING_CONTENT, + "h2": PHRASING_CONTENT, + "h3": PHRASING_CONTENT, + "h4": PHRASING_CONTENT, + "h5": PHRASING_CONTENT, + "h6": PHRASING_CONTENT, + "header": FLOW_CONTENT - {"header", "footer"}, + "hgroup": HEADING_ELEMENTS | {"p"}, + "hr": None, + "i": PHRASING_CONTENT, + "iframe": None, + "img": None, + "input": None, + "kbd": PHRASING_CONTENT, + "label": PHRASING_CONTENT - {"label"}, + "legend": PHRASING_CONTENT | HEADING_CONTENT, + "li": FLOW_CONTENT, + "link": None, + "main": FLOW_CONTENT, + "mark": PHRASING_CONTENT, + "menu": {"li"} | SCRIPT_SUPPORTING_ELEMENTS, + "meta": None, + "meter": PHRASING_CONTENT - {"meter"}, + "nav": FLOW_CONTENT, + "ol": {"li"} | SCRIPT_SUPPORTING_ELEMENTS, + "optgroup": {"option"} | SCRIPT_SUPPORTING_ELEMENTS, + "option": None, + "output": PHRASING_CONTENT, + "p": PHRASING_CONTENT, + "picture": {"source", "img"}, + "pre": PHRASING_CONTENT, + "progress": PHRASING_CONTENT - {"progress"}, + "q": PHRASING_CONTENT, + "rp": None, + "rt": PHRASING_CONTENT, + "ruby": PHRASING_CONTENT | {"rt", "rp"}, + "s": PHRASING_CONTENT, + "samp": PHRASING_CONTENT, + "script": None, + "search": FLOW_CONTENT, + "section": FLOW_CONTENT, + "select": {"option", "optgroup", "hr"}, + "small": PHRASING_CONTENT, + "source": None, + "span": PHRASING_CONTENT, + "strong": PHRASING_CONTENT, + "style": None, + "sub": PHRASING_CONTENT, + "summary": PHRASING_CONTENT | HEADING_CONTENT, + "sup": PHRASING_CONTENT, + "table": {"tbody", "thead", "tfoot", "tr", "caption", "colgroup"} | SCRIPT_SUPPORTING_ELEMENTS, + "tbody": {"tr"}, + "td": FLOW_CONTENT, + "textarea": None, + "tfoot": {"tr"}, + "th": FLOW_CONTENT - {"header", "footer"} - SECTIONING_CONTENT - HEADING_CONTENT, + "thead": {"tr"}, + "time": PHRASING_CONTENT, + "title": None, + "tr": {"th", "td"}, + "track": None, + "u": PHRASING_CONTENT, + "ul": {"li"} | SCRIPT_SUPPORTING_ELEMENTS, + "var": PHRASING_CONTENT, + "wbr": None, +} +TAG_MATCH_PATTERN = re.compile(r"<(\w+)") +ATTRIBUTE_MATCH_PATTERN = re.compile(r'(\w+)=("[^"]*"|\w+)') +PARSE_TAG_PATTERN = re.compile(r"<(/?)(\w+)|([\w-]+)=('[^']*'|\"[^\"]*\"|\w+)") + + +class HTMLNode: + ROOT_KEY = "__root__" + DOCTYPE_KEY = "__doctype__" + COMMENT_KEY = "__comment__" + TEXT_KEY = "__text__" + DOCTYPE = "" + + __slots__ = ( + "_name", + "_attributes", + "_content", + "_children", + "_endtag", + "_parent", + "_closed", + ) + + def __init__(self, name: str, attributes: dict[str, str]) -> None: + self._name = name + self._attributes = attributes + self._content = "" + self._children: List[HTMLNode] = [] + self._endtag = name not in omit_endtag + self._parent: HTMLNode | None = None + self._closed = False + + def __repr__(self) -> str: + display = f"{self._name!r}, {self._attributes!r}" + return f"HTMLNode({display})" + + def __str__(self) -> str: + return f"" + + def add_child(self, child: HTMLNode) -> None: + if self._closed: + message = ( + f"Tag {self._name!r} is already closed. " + f"Trying to add {child}" + ) + raise ValueError(message) + self._children.append(child) + + def add_text_content(self, text: str) -> None: + if self._closed: + message = ( + f"Tag {self._name!r} is already closed. " + "Trying to add text content." + ) + raise ValueError(message) + self._content += text + + def can_contain(self, tag_name: str) -> bool: + limited = ALLOWED_CONTENT_MODEL.get(self._name, DEFAULT) + if limited is DEFAULT: + return True + if not limited: + return False + return tag_name in cast(set[str], limited) + + def __iter__(self): + return iter(self._children) + + @property + def children(self): + return self._children + + @property + def content(self): + return self._content + + @property + def name(self): + return self._name + + @property + def opening_tag(self): + tag = f"<{self._name}" + if self._attributes: + tag += " " + " ".join( + f"{key}={value}" for key, value in self._attributes.items() + ) + tag += ">" + return tag + + @property + def closing_tag(self): + if not self._endtag: + return "" + return f"" + + @property + def is_allowed_text_content(self) -> bool: + return self._name in TEXT_ALLOWED_ELEMENTS + + def close(self) -> None: + for child in (child for child in self._children if not child._closed): + child.close() + self._closed = True + + +class RootNode(HTMLNode): + def __init__(self) -> None: + super().__init__(HTMLNode.ROOT_KEY, {}) + self._endtag = False + + @property + def opening_tag(self): + return "" + + def can_contain(self, tag_name: str) -> bool: + return True + + +class DoctypeNode(HTMLNode): + def __init__(self) -> None: + super().__init__(HTMLNode.DOCTYPE_KEY, {}) + self._endtag = False + + @property + def opening_tag(self): + return self.DOCTYPE + + +class CommentNode(HTMLNode): + def __init__(self) -> None: + super().__init__(HTMLNode.COMMENT_KEY, {}) + self._endtag = False + + @property + def opening_tag(self): + return f"" + + @property + def content(self): + return "" + + +class NodeCreator: + def create(self, builder: Builder) -> HTMLNode: + root = self._create_root() + stack: list[HTMLNode] = [root] + + for piece in [ + p + for maybe_tag in builder._allpieces + for p in ( + maybe_tag._allpieces + if isinstance(maybe_tag, Builder) + else [maybe_tag] + ) + ]: + tag_name, is_closing, attributes = self._parse_tag(piece) + + if piece == HTMLNode.DOCTYPE: + self._handle_doctype(stack) + continue + elif tag_name == HTMLNode.COMMENT_KEY: + self._handle_comment(piece[4:-3], stack) + continue + elif tag_name == HTMLNode.TEXT_KEY: + self._handle_text(piece, stack) + continue + + if is_closing: + self._close_node_by_name(tag_name, stack) + continue + + self._handle_new_node(tag_name, attributes, stack) + + self._close_node(root, stack) + + return root + + def _create_root(self) -> RootNode: + return RootNode() + + def _handle_doctype(self, stack: list[HTMLNode]) -> None: + if len(stack) > 1: + raise ValueError("Doctype must be the first element") + doctype = DoctypeNode() + stack[0].add_child(doctype) + + def _handle_comment(self, text: str, stack: list[HTMLNode]) -> None: + comment = CommentNode() + comment.add_text_content(text) + for node in reversed(stack): + if not node._closed: + node.add_child(comment) + break + + def _handle_text(self, text: str, stack: list[HTMLNode]) -> None: + for node in reversed(stack): + if node.is_allowed_text_content: + node.add_text_content(text) + break + + def _handle_new_node( + self, tag_name: str, attributes: dict[str, str], stack: list[HTMLNode] + ) -> None: + new_node = HTMLNode(tag_name, attributes) + for node in reversed(stack): + if node.can_contain(new_node.name): + node.add_child(new_node) + break + self._close_node(node, stack) + stack.append(new_node) + + def _close_node(self, node: HTMLNode, stack: list[HTMLNode]) -> None: + node.close() + stack.pop() + + def _close_node_by_name(self, name: str, stack: list[HTMLNode]) -> None: + for node in reversed(stack): + self._close_node(node, stack) + if node.name == name: + break + + @staticmethod + def _parse_tag(tag: str) -> tuple[str, bool, dict[str, str]]: + if tag.startswith(" None: + self._root = root + self._builder = builder + + def __iter__(self): + return iter(self._root) + + @property + def root(self) -> HTMLNode: + return self._root + + def to_html( + self, + pretty: bool = False, + indent: str = " ", + single_line_max_length: int = 60, + ) -> str: + if not pretty: + return str(self._builder) + return self._to_html(self.root, indent, single_line_max_length) + + def _to_html( + self, + node: HTMLNode, + indent: str, + single_line_max_length: int, + level: int = 0 + ) -> str: + output = "" + new_line = "\n" + opening_prefix = indent * level + closing_prefix = opening_prefix + content_prefix = indent * (level + 1) + line_length = len(node.opening_tag) + len(node.content) + len(node.closing_tag) + len(opening_prefix) + single_line = False + if ( + not node.children + and single_line_max_length + and line_length < single_line_max_length + ): + new_line = "" + content_prefix = "" + closing_prefix = "" + single_line = True + output += opening_prefix + node.opening_tag + new_line + if node.content: + output += content_prefix + node.content + new_line + ending = "\n" if node.closing_tag or single_line else "" + for child in node.children: + increment = 0 if isinstance(node, (RootNode,)) else 1 + output += self._to_html( + child, indent, single_line_max_length, level + increment + ) + if node.closing_tag: + output += closing_prefix + node.closing_tag + return output + ending + + def display_tree(self, indent: str = " "): + self._display_node(self.root, indent) + + def _display_node(self, node: HTMLNode, indent: str, level: int = 0): + print(indent * level, node) + for child in node.children: + self._display_node(child, indent, level + 1) + + @classmethod + def create(cls, builder: Builder) -> HTMLSyntaxTree: + root = NodeCreator().create(builder) + return cls(root, builder)