From b9d633594aea7bb78457fa025723f1052fec61ae Mon Sep 17 00:00:00 2001
From: OleJoik <57186239+OleJoik@users.noreply.github.com>
Date: Thu, 13 Jun 2024 07:32:09 +0200
Subject: [PATCH] Add utilities to convert from html to htpy (#26)
* Add utilities to convert from html to htpy
* Refactor cli and python interface into html2htpy
* Minor refactor, use iterable instead of loop
* Refactor tests (hardcoded expected, no formatting)
* Undo unintentional formatting of __init__.py
* String content escaping
* simplify cli app, input from file or stdin
* Remove the import of html2htpy in init file
* Update import of html2htpy in test
* Bugfix: accept jinja style templates w/period
* Docs for html2htpy
* Removing select.select when reading stdin
* Update Formatting, -f auto/ruff/black/none from path
* Default shorthand syntax, --explicit -> kwargs id, class_
* Refactor, more descriptive function name
* fix failing test
* Avoid intermediate var in _get_formatter, immediate returns
* html -> HTML, HTPY -> htpy, python -> Python
* Get rid of __name__ == "__main__" in html2htpy
* Adding black as dev dependency
* Updates to docs after changes to --shorthand flag
* Update tests for html2htpy
* Ruff lint --fix and ruff format
* mypy lint
* Remove 'Self', use 'Any'. Python 3.10 compatible
* Ruff formatting
* -i: Flag to include import htpy elements in output
* Remove "expected formatting" from test
* Bugfix: correct handling of void elements without / in endtag
* Additional import options: notably --imports=h
* --no-shorthand instead of --explicit options
* Fix minor outdated info in docs
* Fix typo in docs
* Another typo in docs
* Remove rogue print()
---
docs/html2htpy.md | 207 ++++++++++++++++++++
htpy/html2htpy.py | 408 ++++++++++++++++++++++++++++++++++++++++
mkdocs.yml | 1 +
pyproject.toml | 4 +
tests/test_html2htpy.py | 366 +++++++++++++++++++++++++++++++++++
5 files changed, 986 insertions(+)
create mode 100644 docs/html2htpy.md
create mode 100644 htpy/html2htpy.py
create mode 100644 tests/test_html2htpy.py
diff --git a/docs/html2htpy.md b/docs/html2htpy.md
new file mode 100644
index 0000000..61d1992
--- /dev/null
+++ b/docs/html2htpy.md
@@ -0,0 +1,207 @@
+
+# Convert HTML to htpy code
+
+Maybe you already have a bunch of HTML, or templates that you would like to migrate to htpy.
+We've got you covered. The utility command `html2htpy` ships with `htpy`, and can be used to transform existing
+HTML into Python code (htpy!).
+
+```
+$ html2htpy -h
+usage: html2htpy [-h] [-f {auto,ruff,black,none}] [-i {yes,h,no}] [--no-shorthand] [input]
+
+positional arguments:
+ input input HTML from file or stdin
+
+options:
+ -h, --help show this help message and exit
+ -f {auto,ruff,black,none}, --format {auto,ruff,black,none}
+ Select one of the following formatting options: auto, ruff, black or none
+ -i {yes,h,no}, --imports {yes,h,no}
+ Output mode for imports of found htpy elements
+ --no-shorthand Use explicit `id` and `class_` kwargs instead of the shorthand #id.class syntax
+```
+
+
+Let's say you have an existing HTML file:
+
+```html title="index.html"
+
+
+
+
+
+ htpy Recipes
+
+
+
+
Welcome to the cooking site
+
Your go-to place for delicious recipes!
+
+
+
+
Recipe of the Day: Spaghetti Carbonara
+
This classic Italian dish is quick and easy to make.
+
+```
+
+#### Default shorthand yields `#id.class`
+```py title="$ html2htpy example.html"
+from htpy import p, section
+
+section("#main-section.hero.is-link")[
+ p(".subtitle.is-3.is-spaced")["Welcome"]
+]
+```
+
+#### No shorthand yields kwargs `id`, `class_`
+```py title="$ html2htpy --no-shorthand example.html"
+from htpy import p, section
+
+section(id="main-section", class_="hero is-link")[
+ p(class_="subtitle is-3 is-spaced")["Welcome"]
+]
+```
+
+
+## Template interpolation to f-strings
+
+`html2htpy` will try to convert template variables to pythonic f-strings:
+
+`template {{ variables }}` -> `f"template { variables }"`
+
+Note that other typical template syntax, such as loops `{% for x in y %}`, cannot be transformed this way,
+so you will often have to clean up a bit after `html2htpy` has done its thing.
+
+See the example below:
+
+```html title="jinja.html"
+<body>
+    <h1>{{ heading }}</h1>
+    <p>Welcome to our cooking site, {{ user.name }}!</p>
+
+    <h2>Recipe of the Day: {{ recipe.name }}</h2>
+    <p>{{ recipe.description }}</p>
+
+    <h3>Instructions:</h3>
+    <ol>
+        {% for step in recipe.steps %}
+        <li>{{ step }}</li>
+        {% endfor %}
+    </ol>
+</body>
+```
+
+```py title="$ html2htpy jinja.html"
+from htpy import body, h1, h2, h3, li, ol, p
+
+body[
+ h1[f"{ heading }"],
+ p[f"Welcome to our cooking site, { user.name }!"],
+ h2[f"Recipe of the Day: { recipe.name }"],
+ p[f"{ recipe.description }"],
+ h3["Instructions:"],
+ ol[
+ """ {% for step in recipe.steps %} """,
+ li[f"{ step }"],
+ """ {% endfor %} """,
+ ],
+]
+```
+
diff --git a/htpy/html2htpy.py b/htpy/html2htpy.py
new file mode 100644
index 0000000..aba138a
--- /dev/null
+++ b/htpy/html2htpy.py
@@ -0,0 +1,408 @@
+import argparse
+import re
+import shutil
+import subprocess
+import sys
+from abc import ABC, abstractmethod
+from html.parser import HTMLParser
+from typing import Any, Literal
+
+__all__ = ["html2htpy"]
+
+# Tag names that HTPYParser.handle_starttag records as a node but never makes
+# the "current" open element, so no closing tag is awaited for them.
+_void_elements = [
+ "area",
+ "base",
+ "br",
+ "col",
+ "embed",
+ "hr",
+ "img",
+ "input",
+ "link",
+ "meta",
+ "param",
+ "source",
+ "track",
+ "wbr",
+]
+
+
+class Tag:
+    """A parsed HTML element that can render itself as htpy source code.
+
+    ``attrs`` keeps the ``(name, value)`` pairs in source order; ``value`` is
+    ``None`` for valueless attributes such as ``checked``. ``children`` holds
+    nested ``Tag`` objects and text nodes that are already Python string
+    literals.
+    """
+
+    def __init__(
+        self,
+        type: str,
+        attrs: list[tuple[str, str | None]],
+        parent: Any | None = None,
+    ):
+        self.html_type = type
+        # Hyphens are not valid in Python identifiers: <my-el> becomes my_el.
+        self.python_type = type.replace("-", "_")
+
+        self.attrs = attrs
+        self.parent = parent
+        self.children: list[Any | str] = []
+
+    def serialize(self, shorthand_id_class: bool, use_h_prefix: bool) -> str:
+        """Return this element and its subtree as an htpy expression.
+
+        Args:
+            shorthand_id_class: When true, ``id`` and ``class`` are emitted as
+                a single ``"#id.class"`` positional argument; otherwise they
+                are emitted as ``id=``/``class_=`` keyword arguments.
+            use_h_prefix: When true, every element name is prefixed with
+                ``h.``.
+
+        Raises:
+            Exception: If shorthand is requested and ``id`` or ``class`` is
+                present without a value.
+        """
+        shorthand: dict[str, str | None] = {}
+        keyword_attrs: list[tuple[str, str | None]] = []
+
+        for name, value in self.attrs:
+            if shorthand_id_class and name in ("id", "class"):
+                shorthand[name] = value
+            else:
+                keyword_attrs.append((name, value))
+
+        args: list[str] = []
+
+        if shorthand:
+            selector = ""
+            if "id" in shorthand:
+                id_value = shorthand["id"]
+                if id_value is None:
+                    raise Exception("Id attribute cannot be none")
+                selector += "#" + id_value
+
+            if "class" in shorthand:
+                class_value = shorthand["class"]
+                if class_value is None:
+                    raise Exception("Class attribute cannot be none")
+                # split() without arguments collapses runs of whitespace, so
+                # 'a  b ' yields '.a.b' instead of '.a..b.'.
+                class_names = class_value.split()
+                if class_names:
+                    selector += "." + ".".join(class_names)
+
+            if selector:
+                args.append('"' + selector + '"')
+
+        for name, value in keyword_attrs:
+            key = name.replace("-", "_")
+            # Python keywords get htpy's trailing-underscore spelling.
+            if key == "class":
+                key = "class_"
+            elif key == "for":
+                key = "for_"
+
+            if not value:
+                args.append(f"{key}=True")
+            else:
+                # The value ends up inside a double-quoted Python literal, so
+                # backslashes, quotes and newlines must be escaped.
+                escaped = value.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")
+                args.append(f'{key}="{escaped}"')
+
+        attrs_src = "(" + ",".join(args) + ")" if args else ""
+
+        children_src = ""
+        if self.children:
+            rendered = [
+                child.serialize(shorthand_id_class, use_h_prefix)
+                if isinstance(child, Tag)
+                else str(child)
+                for child in self.children
+            ]
+            children_src = "[" + ",".join(rendered) + "]"
+
+        prefix = "h." if use_h_prefix else ""
+        return f"{prefix}{self.python_type}{attrs_src}{children_src}"
+
+
+class Formatter(ABC):
+    """Interface for objects that pretty-print generated Python source."""
+
+    @abstractmethod
+    def format(self, s: str) -> str:
+        """Return ``s`` reformatted; concrete subclasses must override this."""
+        raise NotImplementedError
+
+
+class BlackFormatter(Formatter):
+    """Formatter that pipes the source through the ``black`` executable."""
+
+    def format(self, s: str) -> str:
+        """Return ``s`` formatted by ``black``.
+
+        If ``black`` exits with a non-zero status, ``s`` is returned
+        unchanged instead of the (empty) captured output. stderr is not
+        captured, so whatever the tool writes there stays visible.
+        """
+        result = subprocess.run(
+            ["black", "-q", "-"],
+            input=s.encode("utf8"),
+            stdout=subprocess.PIPE,
+        )
+        if result.returncode != 0:
+            return s
+        return result.stdout.decode("utf8")
+
+
+class RuffFormatter(Formatter):
+    """Formatter that pipes the source through ``ruff format``."""
+
+    def format(self, s: str) -> str:
+        """Return ``s`` formatted by ``ruff format``.
+
+        If ``ruff`` exits with a non-zero status, ``s`` is returned
+        unchanged instead of the (empty) captured output. stderr is not
+        captured, so whatever the tool writes there stays visible.
+        """
+        result = subprocess.run(
+            ["ruff", "format", "-"],
+            input=s.encode("utf8"),
+            stdout=subprocess.PIPE,
+        )
+        if result.returncode != 0:
+            return s
+        return result.stdout.decode("utf8")
+
+
+class HTPYParser(HTMLParser):
+    """HTML parser that builds a tree of ``Tag`` nodes and renders it as htpy.
+
+    Top-level nodes are gathered in ``_collected``; ``_current`` points at the
+    innermost element that is still open (``None`` at document level).
+    """
+
+    def __init__(self) -> None:
+        self._collected: list[Tag | str] = []
+        self._current: Tag | None = None
+        super().__init__()
+
+    def _attach(self, node: Tag | str) -> None:
+        """Add ``node`` to the open element, or to the top level if none is open."""
+        if self._current is None:
+            self._collected.append(node)
+        else:
+            self._current.children.append(node)
+
+    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
+        """Record an opening tag; non-void elements become the open element."""
+        t = Tag(tag, attrs, parent=self._current)
+        self._attach(t)
+
+        if tag not in _void_elements:
+            self._current = t
+
+    def handle_startendtag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
+        """Record a self-closing tag (``<x />``) without descending into it."""
+        self._attach(Tag(tag, attrs, parent=self._current))
+
+    def handle_endtag(self, tag: str) -> None:
+        """Close the open element.
+
+        Raises:
+            Exception: If no element is open, or ``tag`` does not match the
+                element that is currently open.
+        """
+        if tag in _void_elements:
+            # Void elements never become the open element in handle_starttag,
+            # so an explicit closing tag (<input></input>) has nothing to close.
+            return
+
+        if self._current is None:
+            raise Exception(f"Error parsing html: Closing tag {tag} when not inside any other tag")
+
+        if self._current.html_type != tag:
+            raise Exception(
+                f"Error parsing html: Closing tag {tag} does not match the "
+                f"currently open tag ({self._current.html_type})"
+            )
+
+        self._current = self._current.parent
+
+    def handle_data(self, data: str) -> None:
+        """Record a text node as a Python string literal, skipping pure whitespace."""
+        if not data.isspace():
+            self._attach(_convert_data_to_string(data))
+
+    @staticmethod
+    def _tag_names(nodes: list[Any]) -> set[str]:
+        """Return the Python names of every ``Tag`` in ``nodes`` and below."""
+        names: set[str] = set()
+        pending = [n for n in nodes if isinstance(n, Tag)]
+        while pending:
+            tag = pending.pop()
+            names.add(tag.python_type)
+            pending.extend(c for c in tag.children if isinstance(c, Tag))
+        return names
+
+    def serialize_python(
+        self,
+        shorthand_id_class: bool = False,
+        import_mode: Literal["yes", "h", "no"] = "yes",
+        formatter: Formatter | None = None,
+    ) -> str:
+        """Render everything parsed so far as htpy source code.
+
+        Args:
+            shorthand_id_class: Passed through to ``Tag.serialize``.
+            import_mode: ``"yes"`` prepends ``from htpy import ...`` for the
+                elements used, ``"h"`` prepends ``import htpy as h`` and
+                prefixes elements with ``h.``, ``"no"`` emits no import.
+            formatter: Optional formatter applied to the final source.
+
+        Returns:
+            A single expression when exactly one top-level node was parsed,
+            otherwise a list expression (``[]`` for empty input).
+        """
+        o = ""
+        use_h_prefix = False
+
+        if import_mode == "yes":
+            sorted_tags = sorted(self._tag_names(self._collected))
+            # Text-only input uses no elements; a bare "from htpy import "
+            # line would be a syntax error.
+            if sorted_tags:
+                o += f'from htpy import {", ".join(sorted_tags)}\n'
+
+        elif import_mode == "h":
+            o += "import htpy as h\n"
+            use_h_prefix = True
+
+        if len(self._collected) == 1:
+            o += _serialize(self._collected[0], shorthand_id_class, use_h_prefix)
+        else:
+            items = (_serialize(t, shorthand_id_class, use_h_prefix) for t in self._collected)
+            o += "[" + ",".join(items) + "]"
+
+        if formatter:
+            return formatter.format(o)
+        return o
+
+
+def html2htpy(
+    html: str,
+    shorthand_id_class: bool = True,
+    import_mode: Literal["yes", "h", "no"] = "yes",
+    formatter: Formatter | None = None,
+) -> str:
+    """Convert an HTML document or fragment to htpy source code.
+
+    Args:
+        html: The HTML markup to convert.
+        shorthand_id_class: Emit ``"#id.class"`` shorthand instead of
+            ``id=``/``class_=`` keyword arguments.
+        import_mode: How imports of htpy elements are emitted
+            (``"yes"``, ``"h"`` or ``"no"``).
+        formatter: Optional formatter applied to the generated code.
+
+    Returns:
+        The generated Python source.
+    """
+    parser = HTPYParser()
+    parser.feed(html)
+    # feed() may hold back trailing data (e.g. text ending in "R&D") while it
+    # waits for more input; close() forces it to be processed.
+    parser.close()
+
+    return parser.serialize_python(shorthand_id_class, import_mode, formatter)
+
+
+def _convert_data_to_string(data: str) -> str:
+    """Return the source of a Python string literal holding an HTML text node.
+
+    Newlines are removed; text that contained a newline is wrapped in triple
+    quotes, other text in plain double quotes. If the text contains a
+    ``{{ name }}`` style template variable, the result is an f-string in
+    which the variable becomes ``{ name }`` and every other lone brace is
+    doubled so that it stays literal.
+    """
+    _data = str(data)
+
+    is_multiline = "\n" in _data
+
+    _data = _data.replace("\n", "")
+
+    # escape unescaped dblquote: " -> \"
+    _data = re.compile(r'(?<![\\])"').sub('\\"', _data)
+
+    template_string_pattern = re.compile(r"\{\{\s*[\w\.]+\s*\}\}")
+
+    has_jinja_pattern = re.search(template_string_pattern, _data)
+    if has_jinja_pattern:
+        # regex replaces these 3 cases:
+        # {{ var.xx }} -> { var.xx }
+        # { -> {{
+        # } -> }}
+        template_string_replace_pattern = re.compile(
+            r"(\{\{\s*[\w\.]+\s*\}\}|(?<![\{]){(?![\{])|(?<![\}])}(?![\}]))"
+        )
+
+        def replacer(match: re.Match[str]) -> str:
+            captured = match.group(1)
+
+            if captured.startswith("{{"):
+                return captured[1:-1]
+
+            if captured == "{":
+                return "{{"
+
+            return "}}"
+
+        _data = template_string_replace_pattern.sub(replacer, _data)
+        if is_multiline:
+            _data = '""' + _data + '""'
+
+        _data = 'f"' + _data + '"'
+    else:
+        if is_multiline:
+            _data = '""' + _data + '""'
+
+        _data = '"' + _data + '"'
+
+    return _data
+
+
+def _serialize(el: Tag | str, shorthand_id_class: bool, use_h_prefix: bool) -> str:
+    """Render a top-level node: ``Tag`` objects serialize themselves, text passes through."""
+    if not isinstance(el, Tag):
+        return str(el)
+    return el.serialize(shorthand_id_class, use_h_prefix)
+
+
+def _get_formatter(format: Literal["auto", "ruff", "black", "none"]) -> Formatter | None:
+    """Pick the formatter for the ``--format`` choice.
+
+    ``"auto"`` prefers ruff, then black, and falls back to no formatter.
+    An explicitly requested formatter that is not on ``PATH`` prints an
+    error to stderr and exits with status 1. Any other value gives ``None``.
+    """
+    if format == "auto":
+        if _is_command_available("ruff"):
+            return RuffFormatter()
+        if _is_command_available("black"):
+            return BlackFormatter()
+        return None
+
+    explicit: dict[str, tuple[type[Formatter], str]] = {
+        "ruff": (RuffFormatter, "Selected formatter (ruff) is not installed."),
+        "black": (BlackFormatter, "Selected formatter (black) is not installed."),
+    }
+    if format not in explicit:
+        return None
+
+    formatter_cls, missing_message = explicit[format]
+    if not _is_command_available(format):
+        _printerr(missing_message)
+        _printerr("Please install it or select another formatter.")
+        _printerr("`html2htpy -h` for help")
+        sys.exit(1)
+
+    return formatter_cls()
+
+
+def _is_command_available(command: str) -> bool:
+    """Return whether ``shutil.which`` can locate ``command``."""
+    located = shutil.which(command)
+    return located is not None
+
+
+def main() -> None:
+    """Entry point of the ``html2htpy`` command.
+
+    Reads HTML from the given file (or stdin when no file is given),
+    converts it and prints the resulting htpy code to stdout. Exits with
+    status 1 if reading is interrupted from the keyboard.
+    """
+    parser = argparse.ArgumentParser(prog="html2htpy")
+
+    parser.add_argument(
+        "-f",
+        "--format",
+        choices=["auto", "ruff", "black", "none"],
+        default="auto",
+        help="Select one of the following formatting options: auto, ruff, black or none",
+    )
+    parser.add_argument(
+        "-i",
+        "--imports",
+        choices=["yes", "h", "no"],
+        help="Output mode for imports of found htpy elements",
+        default="yes",
+    )
+    parser.add_argument(
+        "--no-shorthand",
+        help="Use explicit `id` and `class_` kwargs instead of the shorthand #id.class syntax",
+        action="store_true",
+    )
+    parser.add_argument(
+        "input",
+        type=argparse.FileType("r"),
+        nargs="?",
+        default=sys.stdin,
+        help="input HTML from file or stdin",
+    )
+
+    args = parser.parse_args()
+
+    # args.input is always an open stream (a file or stdin), so there is a
+    # single read path.
+    try:
+        html = args.input.read()
+    except KeyboardInterrupt:
+        _printerr(
+            "\nInterrupted",
+        )
+        sys.exit(1)
+    finally:
+        # Close files opened by argparse; leave the process-wide stdin alone.
+        if args.input is not sys.stdin:
+            args.input.close()
+
+    shorthand: bool = not args.no_shorthand
+    imports: Literal["yes", "h", "no"] = args.imports
+
+    formatter = _get_formatter(args.format)
+
+    print(html2htpy(html, shorthand, imports, formatter))
+
+
+def _printerr(value: str) -> None:
+    """Write ``value`` followed by a newline to standard error."""
+    sys.stderr.write(f"{value}\n")
diff --git a/mkdocs.yml b/mkdocs.yml
index fd1a2f4..fb2f2ad 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -16,6 +16,7 @@ nav:
- static-typing.md
- django.md
- streaming.md
+ - html2htpy.md
- faq.md
- references.md
markdown_extensions:
diff --git a/pyproject.toml b/pyproject.toml
index 890873a..40f4b01 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,6 +26,7 @@ optional-dependencies.dev = [
"mypy",
"pyright",
"pytest",
+ "black",
"ruff",
"django",
"django-stubs",
@@ -41,6 +42,9 @@ Repository = "https://github.com/pelme/htpy"
Documentation = "https://htpy.dev"
Issues = "https://github.com/pelme/htpy/issues"
+[project.scripts]
+html2htpy = "htpy.html2htpy:main"
+
[build-system]
requires = ["flit_core >=3.2,<4"]
build-backend = "flit_core.buildapi"
diff --git a/tests/test_html2htpy.py b/tests/test_html2htpy.py
new file mode 100644
index 0000000..cfd0737
--- /dev/null
+++ b/tests/test_html2htpy.py
@@ -0,0 +1,366 @@
+import textwrap
+
+import pytest
+
+from htpy.html2htpy import BlackFormatter, RuffFormatter, html2htpy
+
+
+def test_convert_default_shorthand_id_and_class() -> None:
+    # By default id and class are folded into the "#id.class" shorthand.
+    input = """
+        <div id="div-id" class="some-class other-class">
+            <p>This is a paragraph.</p>
+        </div>
+    """
+
+    actual = html2htpy(input, import_mode="no")
+    expected = 'div("#div-id.some-class.other-class")[p["This is a paragraph."]]'
+
+    assert actual == expected
+
+
+def test_convert_explicit_id_class_syntas() -> None:
+    # With shorthand disabled, id and class are emitted as keyword arguments.
+    input = """
+        <div id="div-id" class="some-class other-class">
+            <p>This is a paragraph.</p>
+        </div>
+    """
+
+    actual = html2htpy(input, shorthand_id_class=False, import_mode="no")
+    expected = 'div(id="div-id",class_="some-class other-class")[p["This is a paragraph."]]'
+
+    assert actual == expected
+
+
+nested_html = """
+
+ """
+
+ with pytest.raises(Exception) as e:
+ html2htpy(input)
+
+ assert "Closing tag p does not match the currently open tag (div)" in str(e.value)
+
+
+def test_convert_attributes_without_values() -> None:
+    # Valueless attributes (checked, selected) are rendered as name=True.
+    input = """
+        <input type="checkbox" checked />
+        <option selected>Option</option>
+    """
+    actual = html2htpy(input, import_mode="no")
+    assert actual == """[input(type="checkbox",checked=True),option(selected=True)["Option"]]"""
+
+
+def test_convert_complex_section() -> None:
+ input = """
+
+