OI-wiki · Enter-tainer · Jul 6, 2024 · Jul 1, 2024 · Jul 1, 2024 · Jul 1, 2024
diff --git a/.gitignore b/.gitignore
diff --git a/compiler-plugin/.gitignore b/compiler-plugin/.gitignore
@@ -0,0 +1,162 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
diff --git a/compiler-plugin/pyproject.toml b/compiler-plugin/pyproject.toml
@@ -0,0 +1,18 @@
+[build-system]
+requires = ["setuptools"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "python_markdown_mark_words_compiler_plugin"
+version = "0.0.1"
+authors = [
+    {name = "HikariLan", email = "[email protected]"},
+]
+description = "A Python-Markdown compiler plugin that puts the offsets of Markdown words into the output HTML."
+license = {text = "Apache-2.0"}
+dependencies = [
+  "markdown"  
+]
+
+[project.entry-points."markdown.extensions"]
+mark-words = "python_markdown_mark_words_compiler_plugin.extension:MarkWordsExtension"
diff --git a/compiler-plugin/src/python_markdown_mark_words_compiler_plugin/__init__.py b/compiler-plugin/src/python_markdown_mark_words_compiler_plugin/__init__.py
diff --git a/compiler-plugin/src/python_markdown_mark_words_compiler_plugin/extension.py b/compiler-plugin/src/python_markdown_mark_words_compiler_plugin/extension.py
@@ -0,0 +1,108 @@
+from markdown import Extension
+from markdown.preprocessors import Preprocessor
+from markdown.blockprocessors import BlockProcessor
+import xml.etree.ElementTree as etree
+
+# Sentinel text that MarkWordsBlockProcessor.run appends to a block before
+# re-parsing it, and strips again when it meets it, so the processor does not
+# handle the same block twice.
+MARK_PREVENT_RECURSION = "\t\t\t\r\r\rMARK_PREVENT_RECURSION\r\r\r\t\t\t"
+
+class MarkWordsExtension(Extension):
+    """Python-Markdown extension that installs the two mark-words processors.
+
+    Both processors are handed the same ``meta`` dict, so the line offsets
+    recorded by the preprocessor are the ones the block processor reads.
+    """
+
+    def extendMarkdown(self, md):
+        """Register the offset preprocessor and the marking block processor on ``md``."""
+        shared_state = {"document_offsets": [], "used_document_offsets": {}}
+
+        # Highest priority is required: word offsets must be calculated from
+        # the original document.
+        offset_collector = CalculateDocumentOffsetPreprocessor(md, shared_state)
+        md.preprocessors.register(offset_collector, 'capture_document', 1000)
+
+        # High priority, usually larger than every other block processor.
+        block_marker = MarkWordsBlockProcessor(md.parser, shared_state)
+        md.parser.blockprocessors.register(block_marker, 'mark_words', 100)
+
+class CalculateDocumentOffsetPreprocessor(Preprocessor):
+    """Preprocessor that records the offset of each non-empty line in the document.
+
+    Every non-empty line is stored in ``meta["document_offsets"]`` as a
+    ``(line, start, end)`` tuple, where ``end`` is ``start + len(line)``, and
+    is flagged as not yet used in ``meta["used_document_offsets"]``.
+    """
+
+    def __init__(self, md, meta):
+        """Keep a reference to the ``meta`` dict that ``run`` fills in."""
+        super().__init__(md)
+        self.meta = meta
+
+    def run(self, lines):
+        """Record the offsets of ``lines`` and return ``lines`` unchanged.
+
+        Offsets count one extra character per line for the newline separator;
+        behaviour with CRLF input is unknown.
+        """
+        document_offsets = self.meta["document_offsets"]
+        used_document_offsets = self.meta["used_document_offsets"]
+        # Forget whatever a previous run recorded, otherwise converting a
+        # second document with the same Markdown instance would match blocks
+        # against stale lines. The containers are emptied in place because
+        # other holders of ``meta`` keep references to these same objects.
+        document_offsets.clear()
+        used_document_offsets.clear()
+
+        offset = 0
+        for line in lines:
+            # Empty lines are not stored; they only advance the offset.
+            if line:
+                store = (line, offset, offset + len(line))
+                document_offsets.append(store)
+                used_document_offsets[store] = False
+            # Plus 1 for the newline character (\n) that ends the line.
+            offset += len(line) + 1
+        return lines
+
+class MarkWordsBlockProcessor(BlockProcessor):
+    """Block processor that injects a block's source offsets into its HTML element.
+
+    A block is matched against the lines recorded in
+    ``meta["document_offsets"]``. On a match the block is re-parsed and the
+    last child of ``parent`` receives ``data-original-document-start`` and
+    ``data-original-document-end`` attributes.
+    """
+
+    def __init__(self, parser, meta):
+        """Keep a reference to the ``meta`` dict holding the recorded line offsets."""
+        super(MarkWordsBlockProcessor, self).__init__(parser)
+        self.meta = meta
+
+    def test(self, parent, block):
+        """Return True when any recorded line occurs inside ``block``."""
+        recorded_lines = [text for (text, _, _) in self.meta["document_offsets"]]
+        return any(text in block for text in recorded_lines)
+
+    def run(self, parent: etree.Element, blocks):
+        """Tag the first block with its offsets, or decline by returning False.
+
+        Returns True after the block has been consumed, re-parsed and tagged;
+        returns False when the block carries the recursion marker or when no
+        unused recorded line is found in it.
+        """
+        block = blocks[0]
+
+        # A block carrying the marker was already handled here: strip the
+        # marker and decline so that other block processors can process it.
+        if MARK_PREVENT_RECURSION in block:
+            blocks[0] = block.replace(MARK_PREVENT_RECURSION, "")
+            return False
+
+        start = end = None
+        claimed = {}
+        # Walk the recorded lines looking for the run that makes up this block.
+        for entry in self.meta["document_offsets"]:
+            # Lines consumed by an earlier block are out of the picture.
+            if self.meta["used_document_offsets"][entry]:
+                continue
+            text, begin, finish = entry
+
+            if text not in block:
+                if end is not None:
+                    # A span was already found and the run of matching lines
+                    # has stopped: the search is over.
+                    break
+                # Nothing found yet: reset and carry on with the next line.
+                start = None
+                claimed = {}
+                continue
+
+            # Same text already scanned in this pass (usually lines with the
+            # same content at different places in the document): skip it.
+            if any(text == seen for (seen, _, _) in claimed):
+                continue
+
+            # The first hit fixes the start; every hit pushes the end forward.
+            if start is None:
+                start = begin
+            end = finish
+            claimed[entry] = True
+
+        if start is None or end is None:
+            return False
+
+        blocks.pop(0)
+        self.meta["used_document_offsets"].update(claimed)
+        # The marker is appended to the tail of the block to prevent
+        # recursion; a list of handled blocks is not used because the
+        # document may contain several identical blocks.
+        self.parser.parseBlocks(parent, [block + MARK_PREVENT_RECURSION])
+        tagged = parent[-1]
+        tagged.set("data-original-document-start", str(start))
+        tagged.set("data-original-document-end", str(end))
+        return True
diff --git a/compiler-plugin/test/__main__.py b/compiler-plugin/test/__main__.py
@@ -0,0 +1,51 @@
+import markdown
+from html.parser import HTMLParser
+
+# Expected results: maps the text of each paragraph in test_document to the
+# (start, end) offsets that its HTML element is expected to carry.
+test_cases = {
+    "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Praesent vel nulla ac diam dignissim congue ut sed ligula. Pellentesque aliquet ante sit amet risus iaculis, eget tincidunt nibh volutpat. Etiam non pulvinar enim. Mauris viverra augue urna, non aliquam ligula sodales in. Duis mattis ligula pretium dui bibendum, nec tincidunt neque placerat. Pellentesque eu est malesuada, dictum nulla quis, facilisis lectus. Fusce tempor mi ac tellus dictum porta. Cras venenatis pulvinar turpis. Suspendisse consequat nulla suscipit sagittis pretium.": (0, 544),
+    "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Proin sed lacus vitae neque vestibulum porttitor id et urna. Quisque nisl nisi, fermentum at justo quis, varius aliquet lorem. Ut fringilla vel purus et fermentum. Mauris ac lacinia nisi, sed ultricies dolor. Nunc ut augue quis eros iaculis tempor vel eu erat. Vestibulum efficitur porta justo. Fusce cursus magna dui, eget posuere neque tristique id. Suspendisse varius mauris arcu, nec congue metus efficitur in. Etiam ac pretium justo. Proin non ante faucibus, mattis mi et, consectetur sapien. Proin feugiat commodo euismod.": (546, 1131),
+    "Morbi neque lectus, faucibus a mattis at, aliquam quis est. Maecenas sed luctus elit. Nam vel consequat magna, ac dictum velit. Quisque non cursus enim, at ullamcorper massa. Integer quam mauris, scelerisque eu luctus et, facilisis nec ante. Proin feugiat vehicula felis at ornare. Maecenas est risus, tempus sit amet fermentum vel, sagittis in tellus. Integer ultrices velit at nulla tincidunt cursus. Curabitur non nunc in erat imperdiet imperdiet id sed felis. Quisque euismod velit a mi pellentesque, sit amet molestie eros dignissim. Morbi tincidunt dui vitae orci viverra, vitae gravida sapien semper. Pellentesque viverra a turpis blandit ornare. Quisque tincidunt quam a est facilisis, a fringilla augue sollicitudin. Pellentesque et eros sed arcu placerat sollicitudin. Donec diam eros, auctor non risus eu, interdum interdum mi.": (1133, 1971)
+}
+
+# Markdown input under test: three paragraphs separated by blank lines.
+test_document = """Lorem ipsum dolor sit amet, consectetur adipiscing elit. Praesent vel nulla ac diam dignissim congue ut sed ligula. Pellentesque aliquet ante sit amet risus iaculis, eget tincidunt nibh volutpat. Etiam non pulvinar enim. Mauris viverra augue urna, non aliquam ligula sodales in. Duis mattis ligula pretium dui bibendum, nec tincidunt neque placerat. Pellentesque eu est malesuada, dictum nulla quis, facilisis lectus. Fusce tempor mi ac tellus dictum porta. Cras venenatis pulvinar turpis. Suspendisse consequat nulla suscipit sagittis pretium.

Lorem ipsum dolor sit amet, consectetur adipiscing elit. Proin sed lacus vitae neque vestibulum porttitor id et urna. Quisque nisl nisi, fermentum at justo quis, varius aliquet lorem. Ut fringilla vel purus et fermentum. Mauris ac lacinia nisi, sed ultricies dolor. Nunc ut augue quis eros iaculis tempor vel eu erat. Vestibulum efficitur porta justo. Fusce cursus magna dui, eget posuere neque tristique id. Suspendisse varius mauris arcu, nec congue metus efficitur in. Etiam ac pretium justo. Proin non ante faucibus, mattis mi et, consectetur sapien. Proin feugiat commodo euismod.

Morbi neque lectus, faucibus a mattis at, aliquam quis est. Maecenas sed luctus elit. Nam vel consequat magna, ac dictum velit. Quisque non cursus enim, at ullamcorper massa. Integer quam mauris, scelerisque eu luctus et, facilisis nec ante. Proin feugiat vehicula felis at ornare. Maecenas est risus, tempus sit amet fermentum vel, sagittis in tellus. Integer ultrices velit at nulla tincidunt cursus. Curabitur non nunc in erat imperdiet imperdiet id sed felis. Quisque euismod velit a mi pellentesque, sit amet molestie eros dignissim. Morbi tincidunt dui vitae orci viverra, vitae gravida sapien semper. Pellentesque viverra a turpis blandit ornare. Quisque tincidunt quam a est facilisis, a fringilla augue sollicitudin. Pellentesque et eros sed arcu placerat sollicitudin. Donec diam eros, auctor non risus eu, interdum interdum mi."""
+
+# Render the test document with the 'mark-words' extension enabled; the
+# resulting HTML string is what the Tester parser below inspects.
+html = markdown.markdown(test_document, extensions=['mark-words'])
+
+class Tester(HTMLParser):
+    """HTML parser that verifies the offset attributes attached to each block.
+
+    The ``data-original-document-start`` / ``data-original-document-end``
+    attributes of a start tag are remembered, and the next piece of text data
+    is looked up in ``test_cases`` to compare against the expected offsets.
+    ``checked`` counts how many blocks were actually verified.
+    """
+
+    def __init__(self):
+        super().__init__()
+        # Number of blocks whose offsets were compared successfully.
+        self.checked = 0
+        self._reset()
+
+    def handle_starttag(self, tag, attrs):
+        """Remember the offset attributes found on the start tag, if any."""
+        for name, value in attrs:
+            if name == "data-original-document-start":
+                self.start = int(value)
+            elif name == "data-original-document-end":
+                self.end = int(value)
+
+    def handle_data(self, data):
+        """Check ``data`` against the offsets of the tag that preceded it."""
+        self.data = data
+        # Text that does not follow a tagged element is ignored.
+        if self.start is not None and self.end is not None:
+            self._test()
+            self._reset()
+
+    def _test(self):
+        """Raise AssertionError unless the pending offsets match the expected ones."""
+        if self.start is None or self.end is None or self.data is None:
+            raise AssertionError("Missing data")
+        case = test_cases.get(self.data)
+        if case is None:
+            # Fail with a readable message rather than a bare KeyError.
+            raise AssertionError(f"Unexpected block text: {self.data!r}")
+        print(f"Testing block offset ({self.start}, {self.end}) == {case}")
+        if self.start != case[0] or self.end != case[1]:
+            raise AssertionError(f"Block offset test failed, expected ({case[0]}, {case[1]}), got ({self.start}, {self.end})")
+        self.checked += 1
+
+    def _reset(self):
+        """Forget the pending offsets and text so the next block starts clean."""
+        self.start = None
+        self.end = None
+        self.data = None
+
+tester = Tester()
+tester.feed(html)
+
+# Without this check the script would report success even if no element
+# carried offset attributes at all.
+if tester.checked != len(test_cases):
+    raise AssertionError(f"Expected {len(test_cases)} blocks to be checked, but {tester.checked} were")
+
+print("All tests passed!")
diff --git a/index.html b/index.html