diff --git a/html5tagger/parser.py b/html5tagger/parser.py
new file mode 100644
index 0000000..1ef0b8e
--- /dev/null
+++ b/html5tagger/parser.py
@@ -0,0 +1,465 @@
+from __future__ import annotations
+
+import re
+from typing import List, cast
+
+from .builder import Builder
+from .html5 import omit_endtag
+
+DEFAULT = object()
+BASIC_INLINE = {"a", "abbr", "area", "b", "bdi", "bdo", "br", "button", "cite",
+ "code", "data", "datalist", "del", "dfn", "em", "i", "input",
+ "ins", "kbd", "label", "map", "mark", "meter", "noscript",
+ "output", "q", "ruby", "s", "samp", "select", "slot", "small",
+ "span", "strong", "sub", "sup", "u", "var", "wbr"}
+MEDIA_ELEMENTS = {"audio", "canvas", "embed", "iframe", "img", "object",
+ "picture", "svg", "video"}
+FORM_ELEMENTS = {"fieldset", "form", "option", "textarea"}
+SCRIPT_SUPPORTING_ELEMENTS = {"script", "template"}
+HEADING_ELEMENTS = {"h1", "h2", "h3", "h4", "h5", "h6"}
+PHRASING_CONTENT = BASIC_INLINE | MEDIA_ELEMENTS | {"time", "template"}
+HEADING_CONTENT = PHRASING_CONTENT | HEADING_ELEMENTS
+SECTIONING_CONTENT = {"article", "aside", "nav", "section"}
+FLOW_CONTENT = (BASIC_INLINE | MEDIA_ELEMENTS | FORM_ELEMENTS |
+ {"address", "article", "aside", "blockquote", "caption",
+ "details", "dialog", "div", "dl", "dt", "fieldset", "figure",
+ "footer", "h1", "h2", "h3", "h4", "h5", "h6", "header",
+ "hgroup", "hr", "main", "math", "menu", "nav", "ol", "p",
+ "pre", "progress", "section", "table", "template", "time",
+ "ul"})
+ROOT_CONTENT = {"html", "body"}
+TEXT_ALLOWED_ELEMENTS = {"a", "abbr", "address", "article", "aside", "b",
+ "bdi", "bdo", "blockquote", "button", "caption",
+ "cite", "code", "data", "datalist", "dd", "del",
+ "details", "dfn", "div", "dl", "dt", "em", "fieldset",
+ "figcaption", "figure", "footer", "form", "h1", "h2",
+ "h3", "h4", "h5", "h6", "header", "hgroup", "i",
+ "ins", "kbd", "label", "legend", "li", "main", "mark",
+ "menu", "meter", "nav", "noscript", "ol", "option",
+ "output", "p", "pre", "progress", "q", "rb", "rp",
+ "rt", "rtc", "ruby", "s", "samp", "section", "select",
+ "small", "span", "strong", "sub", "summary", "sup",
+ "table", "tbody", "td", "textarea", "tfoot", "th",
+ "thead", "time", "tr", "u", "ul", "var"}
+ALLOWED_CONTENT_MODEL = {
+ # Transparent items specifically are not included
+ # - a
+ # - ins
+ # - del
+ # - map
+ # - object
+ # - video
+ # - audio
+ # - noscript
+ # - slot
+ # - canvas
+ # Also specifically left out are special elements
+ # - template
+ "abbr": PHRASING_CONTENT,
+ "address": (FLOW_CONTENT - HEADING_CONTENT - SECTIONING_CONTENT - {"address", "header", "footer"}),
+ "area": None,
+ "article": FLOW_CONTENT,
+ "aside": FLOW_CONTENT,
+ "b": PHRASING_CONTENT,
+ "bdi": PHRASING_CONTENT,
+ "bdo": PHRASING_CONTENT,
+ "blockquote": FLOW_CONTENT,
+ "body": FLOW_CONTENT,
+ "br": None,
+ "br": None,
+ "button": PHRASING_CONTENT,
+ "canvas": None,
+ "caption": FLOW_CONTENT - {"table"},
+ "cite": PHRASING_CONTENT,
+ "code": PHRASING_CONTENT,
+ "col": None,
+ "colgroup": {"col", "template"},
+ "data": PHRASING_CONTENT,
+ "datalist": {"option"} | SCRIPT_SUPPORTING_ELEMENTS | PHRASING_CONTENT,
+ "dd": FLOW_CONTENT,
+ "details": FLOW_CONTENT - {"summary"},
+ "dfn": PHRASING_CONTENT - {"dfn"},
+ "dialog": FLOW_CONTENT,
+ "div": FLOW_CONTENT | SCRIPT_SUPPORTING_ELEMENTS | {"dt", "dd"},
+ "dl": {"dt", "dd", "div"} | SCRIPT_SUPPORTING_ELEMENTS,
+ "dt": (FLOW_CONTENT - {"header", "footer"} - SECTIONING_CONTENT - HEADING_CONTENT),
+ "em": PHRASING_CONTENT,
+ "embed": None,
+ "fieldset": {"legend"} | FLOW_CONTENT,
+ "figcaption": FLOW_CONTENT,
+ "figure": {"figcaption"} | FLOW_CONTENT,
+ "footer": FLOW_CONTENT - {"header", "footer"},
+ "form": FLOW_CONTENT - {"form"},
+ "h1": PHRASING_CONTENT,
+ "h2": PHRASING_CONTENT,
+ "h3": PHRASING_CONTENT,
+ "h4": PHRASING_CONTENT,
+ "h5": PHRASING_CONTENT,
+ "h6": PHRASING_CONTENT,
+ "header": FLOW_CONTENT - {"header", "footer"},
+ "hgroup": HEADING_ELEMENTS | {"p"},
+ "hr": None,
+ "i": PHRASING_CONTENT,
+ "iframe": None,
+ "img": None,
+ "input": None,
+ "kbd": PHRASING_CONTENT,
+ "label": PHRASING_CONTENT - {"label"},
+ "legend": PHRASING_CONTENT | HEADING_CONTENT,
+ "li": FLOW_CONTENT,
+ "link": None,
+ "main": FLOW_CONTENT,
+ "mark": PHRASING_CONTENT,
+ "menu": {"li"} | SCRIPT_SUPPORTING_ELEMENTS,
+ "meta": None,
+ "meter": PHRASING_CONTENT - {"meter"},
+ "nav": FLOW_CONTENT,
+ "ol": {"li"} | SCRIPT_SUPPORTING_ELEMENTS,
+ "optgroup": {"option"} | SCRIPT_SUPPORTING_ELEMENTS,
+ "option": None,
+ "output": PHRASING_CONTENT,
+ "p": PHRASING_CONTENT,
+ "picture": {"source", "img"},
+ "pre": PHRASING_CONTENT,
+ "progress": PHRASING_CONTENT - {"progress"},
+ "q": PHRASING_CONTENT,
+ "rp": None,
+ "rt": PHRASING_CONTENT,
+ "ruby": PHRASING_CONTENT | {"rt", "rp"},
+ "s": PHRASING_CONTENT,
+ "samp": PHRASING_CONTENT,
+ "script": None,
+ "search": FLOW_CONTENT,
+ "section": FLOW_CONTENT,
+ "select": {"option", "optgroup", "hr"},
+ "small": PHRASING_CONTENT,
+ "source": None,
+ "span": PHRASING_CONTENT,
+ "strong": PHRASING_CONTENT,
+ "style": None,
+ "sub": PHRASING_CONTENT,
+ "summary": PHRASING_CONTENT | HEADING_CONTENT,
+ "sup": PHRASING_CONTENT,
+ "table": {"tbody", "thead", "tfoot", "tr", "caption", "colgroup"} | SCRIPT_SUPPORTING_ELEMENTS,
+ "tbody": {"tr"},
+ "td": FLOW_CONTENT,
+ "textarea": None,
+ "tfoot": {"tr"},
+ "th": FLOW_CONTENT - {"header", "footer"} - SECTIONING_CONTENT - HEADING_CONTENT,
+ "thead": {"tr"},
+ "time": PHRASING_CONTENT,
+ "title": None,
+ "tr": {"th", "td"},
+ "track": None,
+ "u": PHRASING_CONTENT,
+ "ul": {"li"} | SCRIPT_SUPPORTING_ELEMENTS,
+ "var": PHRASING_CONTENT,
+ "wbr": None,
+}
+TAG_MATCH_PATTERN = re.compile(r"<(\w+)")
+ATTRIBUTE_MATCH_PATTERN = re.compile(r'(\w+)=("[^"]*"|\w+)')
+PARSE_TAG_PATTERN = re.compile(r"<(/?)(\w+)|([\w-]+)=('[^']*'|\"[^\"]*\"|\w+)")
+
+
+class HTMLNode:
+ ROOT_KEY = "__root__"
+ DOCTYPE_KEY = "__doctype__"
+ COMMENT_KEY = "__comment__"
+ TEXT_KEY = "__text__"
+ DOCTYPE = ""
+
+ __slots__ = (
+ "_name",
+ "_attributes",
+ "_content",
+ "_children",
+ "_endtag",
+ "_parent",
+ "_closed",
+ )
+
+ def __init__(self, name: str, attributes: dict[str, str]) -> None:
+ self._name = name
+ self._attributes = attributes
+ self._content = ""
+ self._children: List[HTMLNode] = []
+ self._endtag = name not in omit_endtag
+ self._parent: HTMLNode | None = None
+ self._closed = False
+
+ def __repr__(self) -> str:
+ display = f"{self._name!r}, {self._attributes!r}"
+ return f"HTMLNode({display})"
+
+ def __str__(self) -> str:
+ return f""
+
+ def add_child(self, child: HTMLNode) -> None:
+ if self._closed:
+ message = (
+ f"Tag {self._name!r} is already closed. "
+ f"Trying to add {child}"
+ )
+ raise ValueError(message)
+ self._children.append(child)
+
+ def add_text_content(self, text: str) -> None:
+ if self._closed:
+ message = (
+ f"Tag {self._name!r} is already closed. "
+ "Trying to add text content."
+ )
+ raise ValueError(message)
+ self._content += text
+
+ def can_contain(self, tag_name: str) -> bool:
+ limited = ALLOWED_CONTENT_MODEL.get(self._name, DEFAULT)
+ if limited is DEFAULT:
+ return True
+ if not limited:
+ return False
+ return tag_name in cast(set[str], limited)
+
+ def __iter__(self):
+ return iter(self._children)
+
+ @property
+ def children(self):
+ return self._children
+
+ @property
+ def content(self):
+ return self._content
+
+ @property
+ def name(self):
+ return self._name
+
+ @property
+ def opening_tag(self):
+ tag = f"<{self._name}"
+ if self._attributes:
+ tag += " " + " ".join(
+ f"{key}={value}" for key, value in self._attributes.items()
+ )
+ tag += ">"
+ return tag
+
+ @property
+ def closing_tag(self):
+ if not self._endtag:
+ return ""
+ return f"{self._name}>"
+
+ @property
+ def is_allowed_text_content(self) -> bool:
+ return self._name in TEXT_ALLOWED_ELEMENTS
+
+ def close(self) -> None:
+ for child in (child for child in self._children if not child._closed):
+ child.close()
+ self._closed = True
+
+
+class RootNode(HTMLNode):
+ def __init__(self) -> None:
+ super().__init__(HTMLNode.ROOT_KEY, {})
+ self._endtag = False
+
+ @property
+ def opening_tag(self):
+ return ""
+
+ def can_contain(self, tag_name: str) -> bool:
+ return True
+
+
+class DoctypeNode(HTMLNode):
+ def __init__(self) -> None:
+ super().__init__(HTMLNode.DOCTYPE_KEY, {})
+ self._endtag = False
+
+ @property
+ def opening_tag(self):
+ return self.DOCTYPE
+
+
+class CommentNode(HTMLNode):
+ def __init__(self) -> None:
+ super().__init__(HTMLNode.COMMENT_KEY, {})
+ self._endtag = False
+
+ @property
+ def opening_tag(self):
+ return f""
+
+ @property
+ def content(self):
+ return ""
+
+
+class NodeCreator:
+ def create(self, builder: Builder) -> HTMLNode:
+ root = self._create_root()
+ stack: list[HTMLNode] = [root]
+
+ for piece in [
+ p
+ for maybe_tag in builder._allpieces
+ for p in (
+ maybe_tag._allpieces
+ if isinstance(maybe_tag, Builder)
+ else [maybe_tag]
+ )
+ ]:
+ tag_name, is_closing, attributes = self._parse_tag(piece)
+
+ if piece == HTMLNode.DOCTYPE:
+ self._handle_doctype(stack)
+ continue
+ elif tag_name == HTMLNode.COMMENT_KEY:
+ self._handle_comment(piece[4:-3], stack)
+ continue
+ elif tag_name == HTMLNode.TEXT_KEY:
+ self._handle_text(piece, stack)
+ continue
+
+ if is_closing:
+ self._close_node_by_name(tag_name, stack)
+ continue
+
+ self._handle_new_node(tag_name, attributes, stack)
+
+ self._close_node(root, stack)
+
+ return root
+
+ def _create_root(self) -> RootNode:
+ return RootNode()
+
+ def _handle_doctype(self, stack: list[HTMLNode]) -> None:
+ if len(stack) > 1:
+ raise ValueError("Doctype must be the first element")
+ doctype = DoctypeNode()
+ stack[0].add_child(doctype)
+
+ def _handle_comment(self, text: str, stack: list[HTMLNode]) -> None:
+ comment = CommentNode()
+ comment.add_text_content(text)
+ for node in reversed(stack):
+ if not node._closed:
+ node.add_child(comment)
+ break
+
+ def _handle_text(self, text: str, stack: list[HTMLNode]) -> None:
+ for node in reversed(stack):
+ if node.is_allowed_text_content:
+ node.add_text_content(text)
+ break
+
+ def _handle_new_node(
+ self, tag_name: str, attributes: dict[str, str], stack: list[HTMLNode]
+ ) -> None:
+ new_node = HTMLNode(tag_name, attributes)
+ for node in reversed(stack):
+ if node.can_contain(new_node.name):
+ node.add_child(new_node)
+ break
+ self._close_node(node, stack)
+ stack.append(new_node)
+
+ def _close_node(self, node: HTMLNode, stack: list[HTMLNode]) -> None:
+ node.close()
+ stack.pop()
+
+ def _close_node_by_name(self, name: str, stack: list[HTMLNode]) -> None:
+ for node in reversed(stack):
+ self._close_node(node, stack)
+ if node.name == name:
+ break
+
+ @staticmethod
+ def _parse_tag(tag: str) -> tuple[str, bool, dict[str, str]]:
+ if tag.startswith(" None:
+ self._root = root
+ self._builder = builder
+
+ def __iter__(self):
+ return iter(self._root)
+
+ @property
+ def root(self) -> HTMLNode:
+ return self._root
+
+ def to_html(
+ self,
+ pretty: bool = False,
+ indent: str = " ",
+ single_line_max_length: int = 60,
+ ) -> str:
+ if not pretty:
+ return str(self._builder)
+ return self._to_html(self.root, indent, single_line_max_length)
+
+ def _to_html(
+ self,
+ node: HTMLNode,
+ indent: str,
+ single_line_max_length: int,
+ level: int = 0
+ ) -> str:
+ output = ""
+ new_line = "\n"
+ opening_prefix = indent * level
+ closing_prefix = opening_prefix
+ content_prefix = indent * (level + 1)
+ line_length = len(node.opening_tag) + len(node.content) + len(node.closing_tag) + len(opening_prefix)
+ single_line = False
+ if (
+ not node.children
+ and single_line_max_length
+ and line_length < single_line_max_length
+ ):
+ new_line = ""
+ content_prefix = ""
+ closing_prefix = ""
+ single_line = True
+ output += opening_prefix + node.opening_tag + new_line
+ if node.content:
+ output += content_prefix + node.content + new_line
+ ending = "\n" if node.closing_tag or single_line else ""
+ for child in node.children:
+ increment = 0 if isinstance(node, (RootNode,)) else 1
+ output += self._to_html(
+ child, indent, single_line_max_length, level + increment
+ )
+ if node.closing_tag:
+ output += closing_prefix + node.closing_tag
+ return output + ending
+
+ def display_tree(self, indent: str = " "):
+ self._display_node(self.root, indent)
+
+ def _display_node(self, node: HTMLNode, indent: str, level: int = 0):
+ print(indent * level, node)
+ for child in node.children:
+ self._display_node(child, indent, level + 1)
+
+ @classmethod
+ def create(cls, builder: Builder) -> HTMLSyntaxTree:
+ root = NodeCreator().create(builder)
+ return cls(root, builder)