From a230b4ac5b5023c479e5ebe2edec95d90ca71c03 Mon Sep 17 00:00:00 2001 From: HikariLan Date: Mon, 1 Jul 2024 14:00:10 +0800 Subject: [PATCH 01/13] refactor: project --- .gitignore | 36 ----------------- index.html | 13 ------- package.json | 15 -------- public/vite.svg | 1 - src/counter.ts | 9 ----- src/main.ts | 24 ------------ src/style.css | 96 ---------------------------------------------- src/typescript.svg | 1 - src/vite-env.d.ts | 1 - tsconfig.json | 23 ----------- 10 files changed, 219 deletions(-) delete mode 100755 .gitignore delete mode 100644 index.html delete mode 100644 package.json delete mode 100644 public/vite.svg delete mode 100644 src/counter.ts delete mode 100644 src/main.ts delete mode 100644 src/style.css delete mode 100644 src/typescript.svg delete mode 100644 src/vite-env.d.ts delete mode 100644 tsconfig.json diff --git a/.gitignore b/.gitignore deleted file mode 100755 index 826b026a..00000000 --- a/.gitignore +++ /dev/null @@ -1,36 +0,0 @@ -public -.cache -node_modules -*DS_Store -*.env - -.idea/ - -yarn-error.log -.vscode - -__generated__/ -# Logs -logs -*.log -npm-debug.log* -yarn-debug.log* -yarn-error.log* -pnpm-debug.log* -lerna-debug.log* - -node_modules -dist -dist-ssr -*.local - -# Editor directories and files -.vscode/* -!.vscode/extensions.json -.idea -.DS_Store -*.suo -*.ntvs* -*.njsproj -*.sln -*.sw? diff --git a/index.html b/index.html deleted file mode 100644 index 44a93350..00000000 --- a/index.html +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - Vite + TS - - -
- - - diff --git a/package.json b/package.json deleted file mode 100644 index 07c6a13a..00000000 --- a/package.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "name": "feedback-sys", - "private": true, - "version": "0.0.0", - "type": "module", - "scripts": { - "dev": "vite", - "build": "tsc && vite build", - "preview": "vite preview" - }, - "devDependencies": { - "typescript": "^5.2.2", - "vite": "^5.2.0" - } -} diff --git a/public/vite.svg b/public/vite.svg deleted file mode 100644 index e7b8dfb1..00000000 --- a/public/vite.svg +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/src/counter.ts b/src/counter.ts deleted file mode 100644 index 09e5afd2..00000000 --- a/src/counter.ts +++ /dev/null @@ -1,9 +0,0 @@ -export function setupCounter(element: HTMLButtonElement) { - let counter = 0 - const setCounter = (count: number) => { - counter = count - element.innerHTML = `count is ${counter}` - } - element.addEventListener('click', () => setCounter(counter + 1)) - setCounter(0) -} diff --git a/src/main.ts b/src/main.ts deleted file mode 100644 index 791547b0..00000000 --- a/src/main.ts +++ /dev/null @@ -1,24 +0,0 @@ -import './style.css' -import typescriptLogo from './typescript.svg' -import viteLogo from '/vite.svg' -import { setupCounter } from './counter.ts' - -document.querySelector('#app')!.innerHTML = ` -
- - - - - - -

Vite + TypeScript

-
- -
-

- Click on the Vite and TypeScript logos to learn more -

-
-` - -setupCounter(document.querySelector('#counter')!) diff --git a/src/style.css b/src/style.css deleted file mode 100644 index f9c73502..00000000 --- a/src/style.css +++ /dev/null @@ -1,96 +0,0 @@ -:root { - font-family: Inter, system-ui, Avenir, Helvetica, Arial, sans-serif; - line-height: 1.5; - font-weight: 400; - - color-scheme: light dark; - color: rgba(255, 255, 255, 0.87); - background-color: #242424; - - font-synthesis: none; - text-rendering: optimizeLegibility; - -webkit-font-smoothing: antialiased; - -moz-osx-font-smoothing: grayscale; -} - -a { - font-weight: 500; - color: #646cff; - text-decoration: inherit; -} -a:hover { - color: #535bf2; -} - -body { - margin: 0; - display: flex; - place-items: center; - min-width: 320px; - min-height: 100vh; -} - -h1 { - font-size: 3.2em; - line-height: 1.1; -} - -#app { - max-width: 1280px; - margin: 0 auto; - padding: 2rem; - text-align: center; -} - -.logo { - height: 6em; - padding: 1.5em; - will-change: filter; - transition: filter 300ms; -} -.logo:hover { - filter: drop-shadow(0 0 2em #646cffaa); -} -.logo.vanilla:hover { - filter: drop-shadow(0 0 2em #3178c6aa); -} - -.card { - padding: 2em; -} - -.read-the-docs { - color: #888; -} - -button { - border-radius: 8px; - border: 1px solid transparent; - padding: 0.6em 1.2em; - font-size: 1em; - font-weight: 500; - font-family: inherit; - background-color: #1a1a1a; - cursor: pointer; - transition: border-color 0.25s; -} -button:hover { - border-color: #646cff; -} -button:focus, -button:focus-visible { - outline: 4px auto -webkit-focus-ring-color; -} - -@media (prefers-color-scheme: light) { - :root { - color: #213547; - background-color: #ffffff; - } - a:hover { - color: #747bff; - } - button { - background-color: #f9f9f9; - } -} diff --git a/src/typescript.svg b/src/typescript.svg deleted file mode 100644 index d91c910c..00000000 --- a/src/typescript.svg +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/src/vite-env.d.ts b/src/vite-env.d.ts deleted file mode 100644 index 11f02fe2..00000000 --- a/src/vite-env.d.ts +++ /dev/null @@ -1 +0,0 @@ -/// diff --git a/tsconfig.json b/tsconfig.json deleted file mode 100644 index 75abdef2..00000000 --- a/tsconfig.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "compilerOptions": { - "target": "ES2020", - "useDefineForClassFields": true, - "module": "ESNext", - "lib": ["ES2020", "DOM", "DOM.Iterable"], - "skipLibCheck": true, - - /* Bundler mode */ - "moduleResolution": "bundler", - "allowImportingTsExtensions": true, - "resolveJsonModule": true, - "isolatedModules": true, - "noEmit": true, - - /* Linting */ - "strict": true, - "noUnusedLocals": true, - "noUnusedParameters": true, - "noFallthroughCasesInSwitch": true - }, - "include": ["src"] -} From 8f6caebbb3033080830a3be1220d75d9071ab99e Mon Sep 17 00:00:00 2001 From: HikariLan Date: Tue, 2 Jul 2024 01:08:39 +0800 Subject: [PATCH 02/13] feat: the extension --- compiler-plugin/.gitignore | 162 ++++++++++++++++++ compiler-plugin/pyproject.toml | 18 ++ .../__init__.py | 0 .../extension.py | 87 ++++++++++ 4 files changed, 267 insertions(+) create mode 100644 compiler-plugin/.gitignore create mode 100644 compiler-plugin/pyproject.toml create mode 100644 compiler-plugin/src/python_markdown_mark_words_compiler_plugin/__init__.py create mode 100644 compiler-plugin/src/python_markdown_mark_words_compiler_plugin/extension.py diff --git a/compiler-plugin/.gitignore b/compiler-plugin/.gitignore new file mode 100644 index 00000000..82f92755 --- /dev/null +++ b/compiler-plugin/.gitignore @@ -0,0 +1,162 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ diff --git a/compiler-plugin/pyproject.toml b/compiler-plugin/pyproject.toml new file mode 100644 index 00000000..0b619304 --- /dev/null +++ b/compiler-plugin/pyproject.toml @@ -0,0 +1,18 @@ +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[project] +name = "python_markdown_mark_words_compiler_plugin" +version = "0.0.1" +authors = [ + {name = "HikariLan", email = "hikarilan@minecraft.kim"}, +] +description = "A Python-Markdown compiler plugin that put markdown words offset to the output HTML." +license = {text = "Apache-2.0"} +dependencies = [ + "markdown" +] + +[project.entry-points."markdown.extensions"] +mark-words = "python_markdown_mark_words_compiler_plugin.extension:MarkWordsExtension" \ No newline at end of file diff --git a/compiler-plugin/src/python_markdown_mark_words_compiler_plugin/__init__.py b/compiler-plugin/src/python_markdown_mark_words_compiler_plugin/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/compiler-plugin/src/python_markdown_mark_words_compiler_plugin/extension.py b/compiler-plugin/src/python_markdown_mark_words_compiler_plugin/extension.py new file mode 100644 index 00000000..7d670c79 --- /dev/null +++ b/compiler-plugin/src/python_markdown_mark_words_compiler_plugin/extension.py @@ -0,0 +1,87 @@ +from markdown import Extension +from markdown.preprocessors import Preprocessor +from markdown.blockprocessors import BlockProcessor +import xml.etree.ElementTree as etree + +MARK_PREVENT_RECURSION = "\t\t\t\r\r\rMARK_PREVENT_RECURSION\r\r\r\t\t\t" + +class MarkWordsExtension(Extension): + def extendMarkdown(self, md): + meta = { + "document_offsets": [] + } + md.preprocessors.register(CalculateDocumentOffsetPreprocessor(md, meta), + 'capture_document', + 1000) # Highest priority is required because we need to calc words offset from original document + md.parser.blockprocessors.register(MarkWordsBlockProcessor(md.parser, meta), + 'mark_words', + 100) # high priority, usually larger than every other block processor + +class CalculateDocumentOffsetPreprocessor(Preprocessor): + def __init__(self, md, meta): + super(CalculateDocumentOffsetPreprocessor, self).__init__(md) + self.meta = meta + + def run(self, lines): + offset = 0 + for line_num, line in enumerate(lines): + # Skip empty lines + if len(line) == 0: + offset += 1 + continue + # store the line and offset + self.meta["document_offsets"].append((line, offset, offset + len(line))) + ## plus 1 is for the newline character (\n), use the CRLF file is unknown behavior + offset += (len(line) + 1) + return lines + + +class MarkWordsBlockProcessor(BlockProcessor): + def __init__(self, parser, meta): + super(MarkWordsBlockProcessor, self).__init__(parser) + self.meta = meta + + def test(self, parent, block): + ## Test if there is any line in the block + for line in [line for (line, _, _) in self.meta["document_offsets"]]: + if line in block: + return True + return False + + def run(self, parent: etree.Element, blocks): + block = blocks[0] + + ## If the first block is handled, remove the marker and return, so that other block processors can process it + if MARK_PREVENT_RECURSION in blocks[0]: + blocks[0] = blocks[0].replace(MARK_PREVENT_RECURSION, "") + return False + + start = None + end = None + # Search for the block fragment in the document_offsets + for (line, offset, end_offset) in self.meta["document_offsets"]: + # If found one + if line in block: + # If none yet set, set the start offset + if start is None: + start = offset + end = end_offset + # Or, continuing searching for the end offset until the end of the block + else: + end = end_offset + # If end is not found but new line not in block, reset the search and restart from the next line + elif end is None: + start = None + continue + # If both start and end are both set and no continuously block found, break the loop + else: + break + # If both start and end are found, store the result + if start is not None and end is not None: + blocks.pop(0) + ## append MARK_PREVENT_RECURSION to tail of the block to prevent recursion, we don't use a handled flaglist because we don't know if there's some same block in the document + self.parser.parseBlocks(parent, [block + MARK_PREVENT_RECURSION]) + parent[-1].set("data-original-document-start", str(start)) + parent[-1].set("data-original-document-end", str(end)) + return True + return False \ No newline at end of file From 8554d797bd22acb74de74b8bf57254cd626e93ba Mon Sep 17 00:00:00 2001 From: HikariLan Date: Tue, 2 Jul 2024 01:09:36 +0800 Subject: [PATCH 03/13] test: basic test cases --- compiler-plugin/test/__main__.py | 51 ++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 compiler-plugin/test/__main__.py diff --git a/compiler-plugin/test/__main__.py b/compiler-plugin/test/__main__.py new file mode 100644 index 00000000..80bc2f3c --- /dev/null +++ b/compiler-plugin/test/__main__.py @@ -0,0 +1,51 @@ +import markdown +from html.parser import HTMLParser + +test_cases = { + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Praesent vel nulla ac diam dignissim congue ut sed ligula. Pellentesque aliquet ante sit amet risus iaculis, eget tincidunt nibh volutpat. Etiam non pulvinar enim. Mauris viverra augue urna, non aliquam ligula sodales in. Duis mattis ligula pretium dui bibendum, nec tincidunt neque placerat. Pellentesque eu est malesuada, dictum nulla quis, facilisis lectus. Fusce tempor mi ac tellus dictum porta. Cras venenatis pulvinar turpis. Suspendisse consequat nulla suscipit sagittis pretium.": (0, 544), + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Proin sed lacus vitae neque vestibulum porttitor id et urna. Quisque nisl nisi, fermentum at justo quis, varius aliquet lorem. Ut fringilla vel purus et fermentum. Mauris ac lacinia nisi, sed ultricies dolor. Nunc ut augue quis eros iaculis tempor vel eu erat. Vestibulum efficitur porta justo. Fusce cursus magna dui, eget posuere neque tristique id. Suspendisse varius mauris arcu, nec congue metus efficitur in. Etiam ac pretium justo. Proin non ante faucibus, mattis mi et, consectetur sapien. Proin feugiat commodo euismod.": (546, 1131), + "Morbi neque lectus, faucibus a mattis at, aliquam quis est. Maecenas sed luctus elit. Nam vel consequat magna, ac dictum velit. Quisque non cursus enim, at ullamcorper massa. Integer quam mauris, scelerisque eu luctus et, facilisis nec ante. Proin feugiat vehicula felis at ornare. Maecenas est risus, tempus sit amet fermentum vel, sagittis in tellus. Integer ultrices velit at nulla tincidunt cursus. Curabitur non nunc in erat imperdiet imperdiet id sed felis. Quisque euismod velit a mi pellentesque, sit amet molestie eros dignissim. Morbi tincidunt dui vitae orci viverra, vitae gravida sapien semper. Pellentesque viverra a turpis blandit ornare. Quisque tincidunt quam a est facilisis, a fringilla augue sollicitudin. Pellentesque et eros sed arcu placerat sollicitudin. Donec diam eros, auctor non risus eu, interdum interdum mi.": (1133, 1971) +} + +test_document = """Lorem ipsum dolor sit amet, consectetur adipiscing elit. Praesent vel nulla ac diam dignissim congue ut sed ligula. Pellentesque aliquet ante sit amet risus iaculis, eget tincidunt nibh volutpat. Etiam non pulvinar enim. Mauris viverra augue urna, non aliquam ligula sodales in. Duis mattis ligula pretium dui bibendum, nec tincidunt neque placerat. Pellentesque eu est malesuada, dictum nulla quis, facilisis lectus. Fusce tempor mi ac tellus dictum porta. Cras venenatis pulvinar turpis. Suspendisse consequat nulla suscipit sagittis pretium. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Proin sed lacus vitae neque vestibulum porttitor id et urna. Quisque nisl nisi, fermentum at justo quis, varius aliquet lorem. Ut fringilla vel purus et fermentum. Mauris ac lacinia nisi, sed ultricies dolor. Nunc ut augue quis eros iaculis tempor vel eu erat. Vestibulum efficitur porta justo. Fusce cursus magna dui, eget posuere neque tristique id. Suspendisse varius mauris arcu, nec congue metus efficitur in. Etiam ac pretium justo. Proin non ante faucibus, mattis mi et, consectetur sapien. Proin feugiat commodo euismod. + +Morbi neque lectus, faucibus a mattis at, aliquam quis est. Maecenas sed luctus elit. Nam vel consequat magna, ac dictum velit. Quisque non cursus enim, at ullamcorper massa. Integer quam mauris, scelerisque eu luctus et, facilisis nec ante. Proin feugiat vehicula felis at ornare. Maecenas est risus, tempus sit amet fermentum vel, sagittis in tellus. Integer ultrices velit at nulla tincidunt cursus. Curabitur non nunc in erat imperdiet imperdiet id sed felis. Quisque euismod velit a mi pellentesque, sit amet molestie eros dignissim. Morbi tincidunt dui vitae orci viverra, vitae gravida sapien semper. Pellentesque viverra a turpis blandit ornare. Quisque tincidunt quam a est facilisis, a fringilla augue sollicitudin. Pellentesque et eros sed arcu placerat sollicitudin. Donec diam eros, auctor non risus eu, interdum interdum mi.""" + +html = markdown.markdown(test_document, extensions=['mark-words']) + +class Tester(HTMLParser): + start = None + end = None + data = None + + def handle_starttag(self, tag, attrs): + for attr in attrs: + if attr[0] == "data-original-document-start": + self.start = int(attr[1]) + if attr[0] == "data-original-document-end": + self.end = int(attr[1]) + + def handle_data(self, data): + self.data = data + if(self.start is not None and self.end is not None and self.data is not None): + self._test() + self._reset() + + def _test(self): + if self.start is None or self.end is None or self.data is None: + raise AssertionError("Missing data") + case = test_cases[self.data] + print(f"Testing block offset ({self.start}, {self.end}) == {case}") + if self.start != case[0] or self.end != case[1]: + raise AssertionError(f"Block offset test failed, expected ({case[0]}, {case[1]}), got ({self.start}, {self.end})") + + def _reset(self): + self.start = None + self.end = None + self.data = None + +Tester().feed(html) + +print("All tests passed!") \ No newline at end of file From 68f175d195cd5e76fa6fb0538e37233992bb5fa8 Mon Sep 17 00:00:00 2001 From: HikariLan Date: Tue, 2 Jul 2024 15:39:07 +0800 Subject: [PATCH 04/13] fix: offset mismatch when continuously same fragement --- .../extension.py | 33 +++++++++++++++---- 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/compiler-plugin/src/python_markdown_mark_words_compiler_plugin/extension.py b/compiler-plugin/src/python_markdown_mark_words_compiler_plugin/extension.py index 7d670c79..8bf7d416 100644 --- a/compiler-plugin/src/python_markdown_mark_words_compiler_plugin/extension.py +++ b/compiler-plugin/src/python_markdown_mark_words_compiler_plugin/extension.py @@ -8,7 +8,8 @@ class MarkWordsExtension(Extension): def extendMarkdown(self, md): meta = { - "document_offsets": [] + "document_offsets": [], + "used_document_offsets": {} } md.preprocessors.register(CalculateDocumentOffsetPreprocessor(md, meta), 'capture_document', @@ -17,6 +18,9 @@ def extendMarkdown(self, md): 'mark_words', 100) # high priority, usually larger than every other block processor +''' +A preprocessor to calculate the offset of each line in the document +''' class CalculateDocumentOffsetPreprocessor(Preprocessor): def __init__(self, md, meta): super(CalculateDocumentOffsetPreprocessor, self).__init__(md) @@ -30,12 +34,16 @@ def run(self, lines): offset += 1 continue # store the line and offset - self.meta["document_offsets"].append((line, offset, offset + len(line))) + store = (line, offset, offset + len(line)) + self.meta["document_offsets"].append(store) + self.meta["used_document_offsets"][store] = False ## plus 1 is for the newline character (\n), use the CRLF file is unknown behavior offset += (len(line) + 1) return lines - +''' +A block processor to mark the words in the document and inject the offset of the block to the HTML element +''' class MarkWordsBlockProcessor(BlockProcessor): def __init__(self, parser, meta): super(MarkWordsBlockProcessor, self).__init__(parser) @@ -51,17 +59,25 @@ def test(self, parent, block): def run(self, parent: etree.Element, blocks): block = blocks[0] - ## If the first block is handled, remove the marker and return, so that other block processors can process it + # If the first block is handled, remove the marker and return, so that other block processors can process it if MARK_PREVENT_RECURSION in blocks[0]: blocks[0] = blocks[0].replace(MARK_PREVENT_RECURSION, "") return False start = None end = None + used = {} # Search for the block fragment in the document_offsets - for (line, offset, end_offset) in self.meta["document_offsets"]: + for store in self.meta["document_offsets"]: + # If already used, skip + if(self.meta["used_document_offsets"][store]): + continue + (line, offset, end_offset) = store # If found one if line in block: + # If the line already scanned (usually some lines with same content in different place), skip + if line in [line for (line, _, _) in used.keys()]: + continue # If none yet set, set the start offset if start is None: start = offset @@ -69,9 +85,13 @@ def run(self, parent: etree.Element, blocks): # Or, continuing searching for the end offset until the end of the block else: end = end_offset + # Mark the fragment as used + used[store] = True # If end is not found but new line not in block, reset the search and restart from the next line elif end is None: start = None + # Clear the used list + used = {} continue # If both start and end are both set and no continuously block found, break the loop else: @@ -79,7 +99,8 @@ def run(self, parent: etree.Element, blocks): # If both start and end are found, store the result if start is not None and end is not None: blocks.pop(0) - ## append MARK_PREVENT_RECURSION to tail of the block to prevent recursion, we don't use a handled flaglist because we don't know if there's some same block in the document + self.meta["used_document_offsets"].update(used) + # append MARK_PREVENT_RECURSION to tail of the block to prevent recursion, we don't use a handled flaglist because we don't know if there's some same block in the document self.parser.parseBlocks(parent, [block + MARK_PREVENT_RECURSION]) parent[-1].set("data-original-document-start", str(start)) parent[-1].set("data-original-document-end", str(end)) From fab6d144ddabde36790a0f245e983ffc4a675739 Mon Sep 17 00:00:00 2001 From: HikariLan Date: Tue, 2 Jul 2024 22:38:51 +0800 Subject: [PATCH 05/13] refactor: switch to rye as build system --- compiler-plugin/.gitignore | 160 +------------------------- compiler-plugin/.python-version | 1 + compiler-plugin/pyproject.toml | 34 ++++-- compiler-plugin/requirements-dev.lock | 13 +++ compiler-plugin/requirements.lock | 13 +++ 5 files changed, 53 insertions(+), 168 deletions(-) create mode 100644 compiler-plugin/.python-version create mode 100644 compiler-plugin/requirements-dev.lock create mode 100644 compiler-plugin/requirements.lock diff --git a/compiler-plugin/.gitignore b/compiler-plugin/.gitignore index 82f92755..ae8554de 100644 --- a/compiler-plugin/.gitignore +++ b/compiler-plugin/.gitignore @@ -1,162 +1,10 @@ -# Byte-compiled / optimized / DLL files +# python generated files __pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python +*.py[oc] build/ -develop-eggs/ dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ wheels/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version +*.egg-info -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# poetry -# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -#poetry.lock - -# pdm -# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. -#pdm.lock -# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it -# in version control. -# https://pdm.fming.dev/latest/usage/project/#working-with-version-control -.pdm.toml -.pdm-python -.pdm-build/ - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env +# venv .venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ - -# Cython debug symbols -cython_debug/ - -# PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ diff --git a/compiler-plugin/.python-version b/compiler-plugin/.python-version new file mode 100644 index 00000000..871f80a3 --- /dev/null +++ b/compiler-plugin/.python-version @@ -0,0 +1 @@ +3.12.3 diff --git a/compiler-plugin/pyproject.toml b/compiler-plugin/pyproject.toml index 0b619304..39b6ba65 100644 --- a/compiler-plugin/pyproject.toml +++ b/compiler-plugin/pyproject.toml @@ -1,18 +1,28 @@ -[build-system] -requires = ["setuptools"] -build-backend = "setuptools.build_meta" - [project] name = "python_markdown_mark_words_compiler_plugin" version = "0.0.1" -authors = [ - {name = "HikariLan", email = "hikarilan@minecraft.kim"}, -] description = "A Python-Markdown compiler plugin that put markdown words offset to the output HTML." -license = {text = "Apache-2.0"} -dependencies = [ - "markdown" -] +authors = [{ name = "HikariLan", email = "hikarilan@minecraft.kim" }] +license = { text = "Apache-2.0" } +dependencies = ["markdown>=3.6"] +requires-python = ">= 3.8" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.rye] +managed = true +dev-dependencies = [] + +[tool.hatch.metadata] +allow-direct-references = true + +[tool.hatch.build.targets.wheel] +packages = ["src/python_markdown_mark_words_compiler_plugin"] + +[tool.rye.scripts] +test = "python ./test" [project.entry-points."markdown.extensions"] -mark-words = "python_markdown_mark_words_compiler_plugin.extension:MarkWordsExtension" \ No newline at end of file +mark-words = "python_markdown_mark_words_compiler_plugin.extension:MarkWordsExtension" diff --git a/compiler-plugin/requirements-dev.lock b/compiler-plugin/requirements-dev.lock new file mode 100644 index 00000000..39d92f66 --- /dev/null +++ b/compiler-plugin/requirements-dev.lock @@ -0,0 +1,13 @@ +# generated by rye +# use `rye lock` or `rye sync` to update this lockfile +# +# last locked with the following flags: +# pre: false +# features: [] +# all-features: false +# with-sources: false +# generate-hashes: false + +-e file:. +markdown==3.6 + # via python-markdown-mark-words-compiler-plugin diff --git a/compiler-plugin/requirements.lock b/compiler-plugin/requirements.lock new file mode 100644 index 00000000..39d92f66 --- /dev/null +++ b/compiler-plugin/requirements.lock @@ -0,0 +1,13 @@ +# generated by rye +# use `rye lock` or `rye sync` to update this lockfile +# +# last locked with the following flags: +# pre: false +# features: [] +# all-features: false +# with-sources: false +# generate-hashes: false + +-e file:. +markdown==3.6 + # via python-markdown-mark-words-compiler-plugin From 460072be5172d3dc6f4eb55715ed78c02fb1609f Mon Sep 17 00:00:00 2001 From: HikariLan Date: Tue, 2 Jul 2024 22:39:20 +0800 Subject: [PATCH 06/13] fmt: format code --- .../extension.py | 50 ++++++++++--------- compiler-plugin/test/__main__.py | 35 +++++++++---- 2 files changed, 50 insertions(+), 35 deletions(-) diff --git a/compiler-plugin/src/python_markdown_mark_words_compiler_plugin/extension.py b/compiler-plugin/src/python_markdown_mark_words_compiler_plugin/extension.py index 8bf7d416..8495d7a3 100644 --- a/compiler-plugin/src/python_markdown_mark_words_compiler_plugin/extension.py +++ b/compiler-plugin/src/python_markdown_mark_words_compiler_plugin/extension.py @@ -5,27 +5,27 @@ MARK_PREVENT_RECURSION = "\t\t\t\r\r\rMARK_PREVENT_RECURSION\r\r\r\t\t\t" + class MarkWordsExtension(Extension): def extendMarkdown(self, md): - meta = { - "document_offsets": [], - "used_document_offsets": {} - } - md.preprocessors.register(CalculateDocumentOffsetPreprocessor(md, meta), - 'capture_document', - 1000) # Highest priority is required because we need to calc words offset from original document - md.parser.blockprocessors.register(MarkWordsBlockProcessor(md.parser, meta), - 'mark_words', - 100) # high priority, usually larger than every other block processor + meta = {"document_offsets": [], "used_document_offsets": {}} + md.preprocessors.register( + CalculateDocumentOffsetPreprocessor(md, meta), "capture_document", 1000 + ) # Highest priority is required because we need to calc words offset from original document + md.parser.blockprocessors.register( + MarkWordsBlockProcessor(md.parser, meta), "mark_words", 100 + ) # high priority, usually larger than every other block processor + -''' -A preprocessor to calculate the offset of each line in the document -''' class CalculateDocumentOffsetPreprocessor(Preprocessor): + """ + A preprocessor to calculate the offset of each line in the document + """ + def __init__(self, md, meta): super(CalculateDocumentOffsetPreprocessor, self).__init__(md) self.meta = meta - + def run(self, lines): offset = 0 for line_num, line in enumerate(lines): @@ -38,39 +38,41 @@ def run(self, lines): self.meta["document_offsets"].append(store) self.meta["used_document_offsets"][store] = False ## plus 1 is for the newline character (\n), use the CRLF file is unknown behavior - offset += (len(line) + 1) + offset += len(line) + 1 return lines -''' -A block processor to mark the words in the document and inject the offset of the block to the HTML element -''' + class MarkWordsBlockProcessor(BlockProcessor): + """ + A block processor to mark the words in the document and inject the offset of the block to the HTML element + """ + def __init__(self, parser, meta): super(MarkWordsBlockProcessor, self).__init__(parser) self.meta = meta - + def test(self, parent, block): ## Test if there is any line in the block for line in [line for (line, _, _) in self.meta["document_offsets"]]: if line in block: return True return False - + def run(self, parent: etree.Element, blocks): block = blocks[0] - + # If the first block is handled, remove the marker and return, so that other block processors can process it if MARK_PREVENT_RECURSION in blocks[0]: blocks[0] = blocks[0].replace(MARK_PREVENT_RECURSION, "") return False - + start = None end = None used = {} # Search for the block fragment in the document_offsets for store in self.meta["document_offsets"]: # If already used, skip - if(self.meta["used_document_offsets"][store]): + if self.meta["used_document_offsets"][store]: continue (line, offset, end_offset) = store # If found one @@ -105,4 +107,4 @@ def run(self, parent: etree.Element, blocks): parent[-1].set("data-original-document-start", str(start)) parent[-1].set("data-original-document-end", str(end)) return True - return False \ No newline at end of file + return False diff --git a/compiler-plugin/test/__main__.py b/compiler-plugin/test/__main__.py index 80bc2f3c..161e767a 100644 --- a/compiler-plugin/test/__main__.py +++ b/compiler-plugin/test/__main__.py @@ -2,9 +2,18 @@ from html.parser import HTMLParser test_cases = { - "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Praesent vel nulla ac diam dignissim congue ut sed ligula. Pellentesque aliquet ante sit amet risus iaculis, eget tincidunt nibh volutpat. Etiam non pulvinar enim. Mauris viverra augue urna, non aliquam ligula sodales in. Duis mattis ligula pretium dui bibendum, nec tincidunt neque placerat. Pellentesque eu est malesuada, dictum nulla quis, facilisis lectus. Fusce tempor mi ac tellus dictum porta. Cras venenatis pulvinar turpis. Suspendisse consequat nulla suscipit sagittis pretium.": (0, 544), - "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Proin sed lacus vitae neque vestibulum porttitor id et urna. Quisque nisl nisi, fermentum at justo quis, varius aliquet lorem. Ut fringilla vel purus et fermentum. Mauris ac lacinia nisi, sed ultricies dolor. Nunc ut augue quis eros iaculis tempor vel eu erat. Vestibulum efficitur porta justo. Fusce cursus magna dui, eget posuere neque tristique id. Suspendisse varius mauris arcu, nec congue metus efficitur in. Etiam ac pretium justo. Proin non ante faucibus, mattis mi et, consectetur sapien. Proin feugiat commodo euismod.": (546, 1131), - "Morbi neque lectus, faucibus a mattis at, aliquam quis est. Maecenas sed luctus elit. Nam vel consequat magna, ac dictum velit. Quisque non cursus enim, at ullamcorper massa. Integer quam mauris, scelerisque eu luctus et, facilisis nec ante. Proin feugiat vehicula felis at ornare. Maecenas est risus, tempus sit amet fermentum vel, sagittis in tellus. Integer ultrices velit at nulla tincidunt cursus. Curabitur non nunc in erat imperdiet imperdiet id sed felis. Quisque euismod velit a mi pellentesque, sit amet molestie eros dignissim. Morbi tincidunt dui vitae orci viverra, vitae gravida sapien semper. Pellentesque viverra a turpis blandit ornare. Quisque tincidunt quam a est facilisis, a fringilla augue sollicitudin. Pellentesque et eros sed arcu placerat sollicitudin. Donec diam eros, auctor non risus eu, interdum interdum mi.": (1133, 1971) + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Praesent vel nulla ac diam dignissim congue ut sed ligula. Pellentesque aliquet ante sit amet risus iaculis, eget tincidunt nibh volutpat. Etiam non pulvinar enim. Mauris viverra augue urna, non aliquam ligula sodales in. Duis mattis ligula pretium dui bibendum, nec tincidunt neque placerat. Pellentesque eu est malesuada, dictum nulla quis, facilisis lectus. Fusce tempor mi ac tellus dictum porta. Cras venenatis pulvinar turpis. Suspendisse consequat nulla suscipit sagittis pretium.": ( + 0, + 544, + ), + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Proin sed lacus vitae neque vestibulum porttitor id et urna. Quisque nisl nisi, fermentum at justo quis, varius aliquet lorem. Ut fringilla vel purus et fermentum. Mauris ac lacinia nisi, sed ultricies dolor. Nunc ut augue quis eros iaculis tempor vel eu erat. Vestibulum efficitur porta justo. Fusce cursus magna dui, eget posuere neque tristique id. Suspendisse varius mauris arcu, nec congue metus efficitur in. Etiam ac pretium justo. Proin non ante faucibus, mattis mi et, consectetur sapien. Proin feugiat commodo euismod.": ( + 546, + 1131, + ), + "Morbi neque lectus, faucibus a mattis at, aliquam quis est. Maecenas sed luctus elit. Nam vel consequat magna, ac dictum velit. Quisque non cursus enim, at ullamcorper massa. Integer quam mauris, scelerisque eu luctus et, facilisis nec ante. Proin feugiat vehicula felis at ornare. Maecenas est risus, tempus sit amet fermentum vel, sagittis in tellus. Integer ultrices velit at nulla tincidunt cursus. Curabitur non nunc in erat imperdiet imperdiet id sed felis. Quisque euismod velit a mi pellentesque, sit amet molestie eros dignissim. Morbi tincidunt dui vitae orci viverra, vitae gravida sapien semper. Pellentesque viverra a turpis blandit ornare. Quisque tincidunt quam a est facilisis, a fringilla augue sollicitudin. Pellentesque et eros sed arcu placerat sollicitudin. Donec diam eros, auctor non risus eu, interdum interdum mi.": ( + 1133, + 1971, + ), } test_document = """Lorem ipsum dolor sit amet, consectetur adipiscing elit. Praesent vel nulla ac diam dignissim congue ut sed ligula. Pellentesque aliquet ante sit amet risus iaculis, eget tincidunt nibh volutpat. Etiam non pulvinar enim. Mauris viverra augue urna, non aliquam ligula sodales in. Duis mattis ligula pretium dui bibendum, nec tincidunt neque placerat. Pellentesque eu est malesuada, dictum nulla quis, facilisis lectus. Fusce tempor mi ac tellus dictum porta. Cras venenatis pulvinar turpis. Suspendisse consequat nulla suscipit sagittis pretium. @@ -13,39 +22,43 @@ Morbi neque lectus, faucibus a mattis at, aliquam quis est. Maecenas sed luctus elit. Nam vel consequat magna, ac dictum velit. Quisque non cursus enim, at ullamcorper massa. Integer quam mauris, scelerisque eu luctus et, facilisis nec ante. Proin feugiat vehicula felis at ornare. Maecenas est risus, tempus sit amet fermentum vel, sagittis in tellus. Integer ultrices velit at nulla tincidunt cursus. Curabitur non nunc in erat imperdiet imperdiet id sed felis. Quisque euismod velit a mi pellentesque, sit amet molestie eros dignissim. Morbi tincidunt dui vitae orci viverra, vitae gravida sapien semper. Pellentesque viverra a turpis blandit ornare. Quisque tincidunt quam a est facilisis, a fringilla augue sollicitudin. Pellentesque et eros sed arcu placerat sollicitudin. Donec diam eros, auctor non risus eu, interdum interdum mi.""" -html = markdown.markdown(test_document, extensions=['mark-words']) +html = markdown.markdown(test_document, extensions=["mark-words"]) + class Tester(HTMLParser): start = None end = None data = None - + def handle_starttag(self, tag, attrs): for attr in attrs: if attr[0] == "data-original-document-start": self.start = int(attr[1]) if attr[0] == "data-original-document-end": self.end = int(attr[1]) - + def handle_data(self, data): self.data = data - if(self.start is not None and self.end is not None and self.data is not None): + if self.start is not None and self.end is not None and self.data is not None: self._test() self._reset() - + def _test(self): if self.start is None or self.end is None or self.data is None: raise AssertionError("Missing data") case = test_cases[self.data] print(f"Testing block offset ({self.start}, {self.end}) == {case}") if self.start != case[0] or self.end != case[1]: - raise AssertionError(f"Block offset test failed, expected ({case[0]}, {case[1]}), got ({self.start}, {self.end})") - + raise AssertionError( + f"Block offset test failed, expected ({case[0]}, {case[1]}), got ({self.start}, {self.end})" + ) + def _reset(self): self.start = None self.end = None self.data = None + Tester().feed(html) -print("All tests passed!") \ No newline at end of file +print("All tests passed!") From e37062d77975e9ab8ab2e9fdec53da091f857eae Mon Sep 17 00:00:00 2001 From: HikariLan Date: Wed, 3 Jul 2024 00:49:54 +0800 Subject: [PATCH 07/13] test: refactor test cases --- .../extension.py | 4 +- compiler-plugin/test/__main__.py | 142 ++++++++++++------ 2 files changed, 102 insertions(+), 44 deletions(-) diff --git a/compiler-plugin/src/python_markdown_mark_words_compiler_plugin/extension.py b/compiler-plugin/src/python_markdown_mark_words_compiler_plugin/extension.py index 8495d7a3..fa53f525 100644 --- a/compiler-plugin/src/python_markdown_mark_words_compiler_plugin/extension.py +++ b/compiler-plugin/src/python_markdown_mark_words_compiler_plugin/extension.py @@ -37,7 +37,7 @@ def run(self, lines): store = (line, offset, offset + len(line)) self.meta["document_offsets"].append(store) self.meta["used_document_offsets"][store] = False - ## plus 1 is for the newline character (\n), use the CRLF file is unknown behavior + # plus 1 is for the newline character (\n), use the CRLF file is unknown behavior offset += len(line) + 1 return lines @@ -52,7 +52,7 @@ def __init__(self, parser, meta): self.meta = meta def test(self, parent, block): - ## Test if there is any line in the block + # Test if there is any line in the block for line in [line for (line, _, _) in self.meta["document_offsets"]]: if line in block: return True diff --git a/compiler-plugin/test/__main__.py b/compiler-plugin/test/__main__.py index 161e767a..0c8c277b 100644 --- a/compiler-plugin/test/__main__.py +++ b/compiler-plugin/test/__main__.py @@ -1,64 +1,122 @@ +import textwrap +import unittest import markdown from html.parser import HTMLParser -test_cases = { - "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Praesent vel nulla ac diam dignissim congue ut sed ligula. Pellentesque aliquet ante sit amet risus iaculis, eget tincidunt nibh volutpat. Etiam non pulvinar enim. Mauris viverra augue urna, non aliquam ligula sodales in. Duis mattis ligula pretium dui bibendum, nec tincidunt neque placerat. Pellentesque eu est malesuada, dictum nulla quis, facilisis lectus. Fusce tempor mi ac tellus dictum porta. Cras venenatis pulvinar turpis. Suspendisse consequat nulla suscipit sagittis pretium.": ( - 0, - 544, - ), - "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Proin sed lacus vitae neque vestibulum porttitor id et urna. Quisque nisl nisi, fermentum at justo quis, varius aliquet lorem. Ut fringilla vel purus et fermentum. Mauris ac lacinia nisi, sed ultricies dolor. Nunc ut augue quis eros iaculis tempor vel eu erat. Vestibulum efficitur porta justo. Fusce cursus magna dui, eget posuere neque tristique id. Suspendisse varius mauris arcu, nec congue metus efficitur in. Etiam ac pretium justo. Proin non ante faucibus, mattis mi et, consectetur sapien. Proin feugiat commodo euismod.": ( - 546, - 1131, - ), - "Morbi neque lectus, faucibus a mattis at, aliquam quis est. Maecenas sed luctus elit. Nam vel consequat magna, ac dictum velit. Quisque non cursus enim, at ullamcorper massa. Integer quam mauris, scelerisque eu luctus et, facilisis nec ante. Proin feugiat vehicula felis at ornare. Maecenas est risus, tempus sit amet fermentum vel, sagittis in tellus. Integer ultrices velit at nulla tincidunt cursus. Curabitur non nunc in erat imperdiet imperdiet id sed felis. Quisque euismod velit a mi pellentesque, sit amet molestie eros dignissim. Morbi tincidunt dui vitae orci viverra, vitae gravida sapien semper. Pellentesque viverra a turpis blandit ornare. Quisque tincidunt quam a est facilisis, a fringilla augue sollicitudin. Pellentesque et eros sed arcu placerat sollicitudin. Donec diam eros, auctor non risus eu, interdum interdum mi.": ( - 1133, - 1971, - ), -} -test_document = """Lorem ipsum dolor sit amet, consectetur adipiscing elit. Praesent vel nulla ac diam dignissim congue ut sed ligula. Pellentesque aliquet ante sit amet risus iaculis, eget tincidunt nibh volutpat. Etiam non pulvinar enim. Mauris viverra augue urna, non aliquam ligula sodales in. Duis mattis ligula pretium dui bibendum, nec tincidunt neque placerat. Pellentesque eu est malesuada, dictum nulla quis, facilisis lectus. Fusce tempor mi ac tellus dictum porta. Cras venenatis pulvinar turpis. Suspendisse consequat nulla suscipit sagittis pretium. +class Tester: + def __init__(self, case, test_case: unittest.TestCase): + self.case = case + self.result = markdown.markdown( + self.case["document"], extensions=["mark-words"] + ) + self.test_case = test_case -Lorem ipsum dolor sit amet, consectetur adipiscing elit. Proin sed lacus vitae neque vestibulum porttitor id et urna. Quisque nisl nisi, fermentum at justo quis, varius aliquet lorem. Ut fringilla vel purus et fermentum. Mauris ac lacinia nisi, sed ultricies dolor. Nunc ut augue quis eros iaculis tempor vel eu erat. Vestibulum efficitur porta justo. Fusce cursus magna dui, eget posuere neque tristique id. Suspendisse varius mauris arcu, nec congue metus efficitur in. Etiam ac pretium justo. Proin non ante faucibus, mattis mi et, consectetur sapien. Proin feugiat commodo euismod. + def test(self): + ParserTester(self.case, self.test_case).feed(self.result) -Morbi neque lectus, faucibus a mattis at, aliquam quis est. Maecenas sed luctus elit. Nam vel consequat magna, ac dictum velit. Quisque non cursus enim, at ullamcorper massa. Integer quam mauris, scelerisque eu luctus et, facilisis nec ante. Proin feugiat vehicula felis at ornare. Maecenas est risus, tempus sit amet fermentum vel, sagittis in tellus. Integer ultrices velit at nulla tincidunt cursus. Curabitur non nunc in erat imperdiet imperdiet id sed felis. Quisque euismod velit a mi pellentesque, sit amet molestie eros dignissim. Morbi tincidunt dui vitae orci viverra, vitae gravida sapien semper. Pellentesque viverra a turpis blandit ornare. Quisque tincidunt quam a est facilisis, a fringilla augue sollicitudin. Pellentesque et eros sed arcu placerat sollicitudin. Donec diam eros, auctor non risus eu, interdum interdum mi.""" -html = markdown.markdown(test_document, extensions=["mark-words"]) +class ParserTester(HTMLParser): + tag = None + text = None + offset_start = None + offset_end = None - -class Tester(HTMLParser): - start = None - end = None - data = None + def __init__(self, case, test_case: unittest.TestCase): + super().__init__() + self.test_case = test_case + self.case = case + self.idx = 0 def handle_starttag(self, tag, attrs): + self.tag = tag for attr in attrs: if attr[0] == "data-original-document-start": - self.start = int(attr[1]) + self.offset_start = int(attr[1]) if attr[0] == "data-original-document-end": - self.end = int(attr[1]) + self.offset_end = int(attr[1]) def handle_data(self, data): - self.data = data - if self.start is not None and self.end is not None and self.data is not None: - self._test() - self._reset() + self.text = data + + def handle_endtag(self, tag): + self._test() + self._reset() def _test(self): - if self.start is None or self.end is None or self.data is None: - raise AssertionError("Missing data") - case = test_cases[self.data] - print(f"Testing block offset ({self.start}, {self.end}) == {case}") - if self.start != case[0] or self.end != case[1]: - raise AssertionError( - f"Block offset test failed, expected ({case[0]}, {case[1]}), got ({self.start}, {self.end})" - ) + self.test_case.assertEqual( + self.tag, + self.case["expected"][self.idx]["tag"], + msg="Tag mismatch in index " + str(self.idx), + ) + self.test_case.assertEqual( + self.text, + self.case["expected"][self.idx]["text"], + msg="Text mismatch in index " + str(self.idx), + ) + self.test_case.assertEqual( + self.offset_start, + self.case["expected"][self.idx]["offset"][0], + msg="Offset start mismatch in index " + str(self.idx), + ) + self.test_case.assertEqual( + self.offset_end, + self.case["expected"][self.idx]["offset"][1], + msg="Offset end mismatch in index " + str(self.idx), + ) + self.idx += 1 def _reset(self): - self.start = None - self.end = None - self.data = None + self.tag = None + self.text = None + self.offset_start = None + self.offset_end = None + + +class TestParser(unittest.TestCase): + def test_normal(self): + case = { + "document": textwrap.dedent("""\ + # Lorem ipsum + + Lorem ipsum dolor sit amet, consectetur adipiscing elit. Proin sed lacus vitae neque vestibulum porttitor id et urna. + + ## Morbi neque lectus + + Morbi neque lectus, faucibus a mattis at, aliquam quis est. Maecenas sed luctus elit."""), + "expected": [ + {"tag": "h1", "text": "Lorem ipsum", "offset": (0, 13)}, + { + "tag": "p", + "text": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Proin sed lacus vitae neque vestibulum porttitor id et urna.", + "offset": (15, 132), + }, + {"tag": "h2", "text": "Morbi neque lectus", "offset": (134, 155)}, + { + "tag": "p", + "text": "Morbi neque lectus, faucibus a mattis at, aliquam quis est. Maecenas sed luctus elit.", + "offset": (157, 242), + }, + ], + } + Tester(case, self).test() + + def test_empty(self): + case = { + "document": "", + "expected": [], + } + Tester(case, self).test() + def test_single(self): + case = { + "document": "Lorem ipsum", + "expected": [ + {"tag": "p", "text": "Lorem ipsum", "offset": (0, 11)}, + ], + } + Tester(case, self).test() -Tester().feed(html) -print("All tests passed!") +if __name__ == "__main__": + unittest.main() From 81a32f7b4247819c9721615440a67d337f6b9570 Mon Sep 17 00:00:00 2001 From: HikariLan Date: Wed, 3 Jul 2024 13:49:35 +0800 Subject: [PATCH 08/13] refactor: rename to python_markdown_document_offsets_injection_extension --- {compiler-plugin => python-markdown-extension}/.gitignore | 0 .../.python-version | 0 .../pyproject.toml | 6 +++--- .../requirements-dev.lock | 2 +- .../requirements.lock | 2 +- .../__init__.py | 0 .../extension.py | 8 ++++---- .../test/__main__.py | 2 +- 8 files changed, 10 insertions(+), 10 deletions(-) rename {compiler-plugin => python-markdown-extension}/.gitignore (100%) rename {compiler-plugin => python-markdown-extension}/.python-version (100%) rename {compiler-plugin => python-markdown-extension}/pyproject.toml (71%) rename {compiler-plugin => python-markdown-extension}/requirements-dev.lock (79%) rename {compiler-plugin => python-markdown-extension}/requirements.lock (79%) rename {compiler-plugin/src/python_markdown_mark_words_compiler_plugin => python-markdown-extension/src/python_markdown_document_offsets_injection_extension}/__init__.py (100%) rename {compiler-plugin/src/python_markdown_mark_words_compiler_plugin => python-markdown-extension/src/python_markdown_document_offsets_injection_extension}/extension.py (94%) rename {compiler-plugin => python-markdown-extension}/test/__main__.py (97%) diff --git a/compiler-plugin/.gitignore b/python-markdown-extension/.gitignore similarity index 100% rename from compiler-plugin/.gitignore rename to python-markdown-extension/.gitignore diff --git a/compiler-plugin/.python-version b/python-markdown-extension/.python-version similarity index 100% rename from compiler-plugin/.python-version rename to python-markdown-extension/.python-version diff --git a/compiler-plugin/pyproject.toml b/python-markdown-extension/pyproject.toml similarity index 71% rename from compiler-plugin/pyproject.toml rename to python-markdown-extension/pyproject.toml index 39b6ba65..d3bf8d22 100644 --- a/compiler-plugin/pyproject.toml +++ b/python-markdown-extension/pyproject.toml @@ -1,5 +1,5 @@ [project] -name = "python_markdown_mark_words_compiler_plugin" +name = "python_markdown_document_offsets_injection_extension" version = "0.0.1" description = "A Python-Markdown compiler plugin that put markdown words offset to the output HTML." authors = [{ name = "HikariLan", email = "hikarilan@minecraft.kim" }] @@ -19,10 +19,10 @@ dev-dependencies = [] allow-direct-references = true [tool.hatch.build.targets.wheel] -packages = ["src/python_markdown_mark_words_compiler_plugin"] +packages = ["src/python_markdown_document_offsets_injection_extension"] [tool.rye.scripts] test = "python ./test" [project.entry-points."markdown.extensions"] -mark-words = "python_markdown_mark_words_compiler_plugin.extension:MarkWordsExtension" +document-offsets-injection = "python_markdown_document_offsets_injection_extension.extension:MainExtension" diff --git a/compiler-plugin/requirements-dev.lock b/python-markdown-extension/requirements-dev.lock similarity index 79% rename from compiler-plugin/requirements-dev.lock rename to python-markdown-extension/requirements-dev.lock index 39d92f66..2e8b89cd 100644 --- a/compiler-plugin/requirements-dev.lock +++ b/python-markdown-extension/requirements-dev.lock @@ -10,4 +10,4 @@ -e file:. markdown==3.6 - # via python-markdown-mark-words-compiler-plugin + # via python-markdown-document-offsets-injection-extension diff --git a/compiler-plugin/requirements.lock b/python-markdown-extension/requirements.lock similarity index 79% rename from compiler-plugin/requirements.lock rename to python-markdown-extension/requirements.lock index 39d92f66..2e8b89cd 100644 --- a/compiler-plugin/requirements.lock +++ b/python-markdown-extension/requirements.lock @@ -10,4 +10,4 @@ -e file:. markdown==3.6 - # via python-markdown-mark-words-compiler-plugin + # via python-markdown-document-offsets-injection-extension diff --git a/compiler-plugin/src/python_markdown_mark_words_compiler_plugin/__init__.py b/python-markdown-extension/src/python_markdown_document_offsets_injection_extension/__init__.py similarity index 100% rename from compiler-plugin/src/python_markdown_mark_words_compiler_plugin/__init__.py rename to python-markdown-extension/src/python_markdown_document_offsets_injection_extension/__init__.py diff --git a/compiler-plugin/src/python_markdown_mark_words_compiler_plugin/extension.py b/python-markdown-extension/src/python_markdown_document_offsets_injection_extension/extension.py similarity index 94% rename from compiler-plugin/src/python_markdown_mark_words_compiler_plugin/extension.py rename to python-markdown-extension/src/python_markdown_document_offsets_injection_extension/extension.py index fa53f525..bd45c504 100644 --- a/compiler-plugin/src/python_markdown_mark_words_compiler_plugin/extension.py +++ b/python-markdown-extension/src/python_markdown_document_offsets_injection_extension/extension.py @@ -6,14 +6,14 @@ MARK_PREVENT_RECURSION = "\t\t\t\r\r\rMARK_PREVENT_RECURSION\r\r\r\t\t\t" -class MarkWordsExtension(Extension): +class MainExtension(Extension): def extendMarkdown(self, md): meta = {"document_offsets": [], "used_document_offsets": {}} md.preprocessors.register( CalculateDocumentOffsetPreprocessor(md, meta), "capture_document", 1000 ) # Highest priority is required because we need to calc words offset from original document md.parser.blockprocessors.register( - MarkWordsBlockProcessor(md.parser, meta), "mark_words", 100 + OffsetsInjectionBlockProcessor(md.parser, meta), "mark_words", 100 ) # high priority, usually larger than every other block processor @@ -42,13 +42,13 @@ def run(self, lines): return lines -class MarkWordsBlockProcessor(BlockProcessor): +class OffsetsInjectionBlockProcessor(BlockProcessor): """ A block processor to mark the words in the document and inject the offset of the block to the HTML element """ def __init__(self, parser, meta): - super(MarkWordsBlockProcessor, self).__init__(parser) + super(OffsetsInjectionBlockProcessor, self).__init__(parser) self.meta = meta def test(self, parent, block): diff --git a/compiler-plugin/test/__main__.py b/python-markdown-extension/test/__main__.py similarity index 97% rename from compiler-plugin/test/__main__.py rename to python-markdown-extension/test/__main__.py index 0c8c277b..da98c3de 100644 --- a/compiler-plugin/test/__main__.py +++ b/python-markdown-extension/test/__main__.py @@ -8,7 +8,7 @@ class Tester: def __init__(self, case, test_case: unittest.TestCase): self.case = case self.result = markdown.markdown( - self.case["document"], extensions=["mark-words"] + self.case["document"], extensions=["document-offsets-injection"] ) self.test_case = test_case From dac617697d9d9c85fb623e476c57c0be3be44eb0 Mon Sep 17 00:00:00 2001 From: HikariLan Date: Wed, 3 Jul 2024 16:48:54 +0800 Subject: [PATCH 09/13] ci: format and test pr ci: update env ci: refactor ci: working dir ci: install deps ci: set working dir --- ...mat-and-test-python-markdown-extension.yml | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 .github/workflows/check-format-and-test-python-markdown-extension.yml diff --git a/.github/workflows/check-format-and-test-python-markdown-extension.yml b/.github/workflows/check-format-and-test-python-markdown-extension.yml new file mode 100644 index 00000000..df6737de --- /dev/null +++ b/.github/workflows/check-format-and-test-python-markdown-extension.yml @@ -0,0 +1,46 @@ +name: Check PR Format and Test for python-markdown-extension + +on: + pull_request: + branches: + - master + paths: + - python-markdown-extension/** + +jobs: + check-format: + name: Check PR Format + runs-on: ubuntu-latest + defaults: + run: + working-directory: ./python-markdown-extension + steps: + - uses: actions/checkout@v4 + name: Checkout Repo + - uses: eifinger/setup-rye@v3 + name: Setup Rye + with: + enable-cache: true + working-directory: python-markdown-extension + - run: rye sync + name: Install Dependencies + - run: rye fmt --check + name: Check Format + test: + name: Test PR + runs-on: ubuntu-latest + defaults: + run: + working-directory: ./python-markdown-extension + steps: + - uses: actions/checkout@v4 + name: Checkout Repo + - uses: eifinger/setup-rye@v3 + name: Setup Rye + with: + enable-cache: true + working-directory: python-markdown-extension + - run: rye sync + name: Install Dependencies + - run: rye run test + name: Run Tests From cc1b2347fd06fc0b201202b3c662f8916af418b6 Mon Sep 17 00:00:00 2001 From: HikariLan Date: Wed, 3 Jul 2024 23:28:14 +0800 Subject: [PATCH 10/13] chore: type hint and more comment --- .../extension.py | 41 +++++++++++-------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/python-markdown-extension/src/python_markdown_document_offsets_injection_extension/extension.py b/python-markdown-extension/src/python_markdown_document_offsets_injection_extension/extension.py index bd45c504..1885e110 100644 --- a/python-markdown-extension/src/python_markdown_document_offsets_injection_extension/extension.py +++ b/python-markdown-extension/src/python_markdown_document_offsets_injection_extension/extension.py @@ -1,14 +1,15 @@ -from markdown import Extension +from markdown import Extension, Markdown from markdown.preprocessors import Preprocessor from markdown.blockprocessors import BlockProcessor +from markdown.blockparser import BlockParser import xml.etree.ElementTree as etree -MARK_PREVENT_RECURSION = "\t\t\t\r\r\rMARK_PREVENT_RECURSION\r\r\r\t\t\t" +MARK_PREVENT_RECURSION: str = "\t\t\t\r\r\rMARK_PREVENT_RECURSION\r\r\r\t\t\t" class MainExtension(Extension): - def extendMarkdown(self, md): - meta = {"document_offsets": [], "used_document_offsets": {}} + def extendMarkdown(self, md: Markdown): + meta: dict = {"document_offsets": [], "used_document_offsets": {}} md.preprocessors.register( CalculateDocumentOffsetPreprocessor(md, meta), "capture_document", 1000 ) # Highest priority is required because we need to calc words offset from original document @@ -22,19 +23,19 @@ class CalculateDocumentOffsetPreprocessor(Preprocessor): A preprocessor to calculate the offset of each line in the document """ - def __init__(self, md, meta): + def __init__(self, md: Markdown, meta: dict): super(CalculateDocumentOffsetPreprocessor, self).__init__(md) self.meta = meta - def run(self, lines): - offset = 0 - for line_num, line in enumerate(lines): + def run(self, lines: list[str]) -> list[str]: + offset: int = 0 + for line in lines: # Skip empty lines if len(line) == 0: offset += 1 continue # store the line and offset - store = (line, offset, offset + len(line)) + store: tuple[str, int, int] = (line, offset, offset + len(line)) self.meta["document_offsets"].append(store) self.meta["used_document_offsets"][store] = False # plus 1 is for the newline character (\n), use the CRLF file is unknown behavior @@ -47,28 +48,36 @@ class OffsetsInjectionBlockProcessor(BlockProcessor): A block processor to mark the words in the document and inject the offset of the block to the HTML element """ - def __init__(self, parser, meta): + def __init__(self, parser: BlockParser, meta: dict): super(OffsetsInjectionBlockProcessor, self).__init__(parser) self.meta = meta - def test(self, parent, block): + def test(self, _, block) -> bool: # Test if there is any line in the block for line in [line for (line, _, _) in self.meta["document_offsets"]]: if line in block: return True return False - def run(self, parent: etree.Element, blocks): - block = blocks[0] + def run(self, parent: etree.Element, blocks: list[str]) -> bool: + """ + 注入文档中的偏移量到HTML元素中,以便在后续的处理中可以使用这些偏移量来定位文档中的位置。目前的算法如下: + 1. 从文档中查找第一个包含文本的块 + 2. 查找这个块在文档中的位置,这通过遍历文档中的每一行,以找到所有被包含在该块中的行,通过获取这些行的起始和结束位置,来确定这个块在文档中的位置 + 3. 注入这个块的起始和结束位置到HTML元素中,这会先递归的解析这个块,然后再注入这个块的起始和结束位置注入到最后一个被生成的HTML元素中 + 由于递归解析块时该块仍会被本处理器捕获,为了避免循环递归,我们在块的末尾添加了MARK_PREVENT_RECURSION标记,当本处理器再次捕获到这个块时,会直接跳过这个块,并清除这个标记。 + """ + + block: str = blocks[0] # If the first block is handled, remove the marker and return, so that other block processors can process it if MARK_PREVENT_RECURSION in blocks[0]: blocks[0] = blocks[0].replace(MARK_PREVENT_RECURSION, "") return False - start = None - end = None - used = {} + start: int | None = None + end: int | None = None + used: dict[tuple[str, int, int], bool] = {} # Search for the block fragment in the document_offsets for store in self.meta["document_offsets"]: # If already used, skip From 70c02485d45b12b84bd71ac679012a1d18e0c20a Mon Sep 17 00:00:00 2001 From: HikariLan Date: Thu, 4 Jul 2024 01:44:08 +0800 Subject: [PATCH 11/13] test: complete test environment --- python-markdown-extension/pyproject.toml | 9 +- .../requirements-dev.lock | 5 + python-markdown-extension/test/__main__.py | 291 ++++++++++++++++-- 3 files changed, 283 insertions(+), 22 deletions(-) diff --git a/python-markdown-extension/pyproject.toml b/python-markdown-extension/pyproject.toml index d3bf8d22..b641660a 100644 --- a/python-markdown-extension/pyproject.toml +++ b/python-markdown-extension/pyproject.toml @@ -4,7 +4,9 @@ version = "0.0.1" description = "A Python-Markdown compiler plugin that put markdown words offset to the output HTML." authors = [{ name = "HikariLan", email = "hikarilan@minecraft.kim" }] license = { text = "Apache-2.0" } -dependencies = ["markdown>=3.6"] +dependencies = [ + "markdown>=3.6", +] requires-python = ">= 3.8" [build-system] @@ -13,7 +15,10 @@ build-backend = "hatchling.build" [tool.rye] managed = true -dev-dependencies = [] +dev-dependencies = [ + "pygments>=2.18.0", + "pymdown-extensions>=10.8.1", +] [tool.hatch.metadata] allow-direct-references = true diff --git a/python-markdown-extension/requirements-dev.lock b/python-markdown-extension/requirements-dev.lock index 2e8b89cd..a33eba1b 100644 --- a/python-markdown-extension/requirements-dev.lock +++ b/python-markdown-extension/requirements-dev.lock @@ -10,4 +10,9 @@ -e file:. markdown==3.6 + # via pymdown-extensions # via python-markdown-document-offsets-injection-extension +pygments==2.18.0 +pymdown-extensions==10.8.1 +pyyaml==6.0.1 + # via pymdown-extensions diff --git a/python-markdown-extension/test/__main__.py b/python-markdown-extension/test/__main__.py index da98c3de..eb15716d 100644 --- a/python-markdown-extension/test/__main__.py +++ b/python-markdown-extension/test/__main__.py @@ -3,22 +3,127 @@ import markdown from html.parser import HTMLParser +from pymdownx.emoji import to_svg +from pymdownx.slugs import uslugify +from pymdownx.arithmatex import fence_mathjax_format + class Tester: def __init__(self, case, test_case: unittest.TestCase): self.case = case + """ + @see: https://github.com/OI-wiki/OI-wiki/blob/65983038c40716dd0644778fe7875e91c9043618/mkdocs.yml#L586 + + # Extensions + markdown_extensions: + - admonition + - def_list + - footnotes + - meta + - toc: + permalink: "" + slugify: !!python/name:pymdownx.slugs.uslugify + - pymdownx.arithmatex: + generic: true + - pymdownx.caret + - pymdownx.critic + - pymdownx.details + - pymdownx.emoji: + emoji_generator: !!python/name:pymdownx.emoji.to_svg + - pymdownx.highlight: + linenums: true + - pymdownx.inlinehilite + - pymdownx.keys + - pymdownx.magiclink + - pymdownx.mark + - pymdownx.snippets: + check_paths: true + - pymdownx.progressbar + - pymdownx.smartsymbols + - pymdownx.superfences: + custom_fences: + - name: math + class: arithmatex + format: !!python/name:pymdownx.arithmatex.fence_mathjax_format + - pymdownx.tasklist: + custom_checkbox: true + - pymdownx.tilde + - pymdownx.tabbed: + alternate_style: true + """ self.result = markdown.markdown( - self.case["document"], extensions=["document-offsets-injection"] + self.case["document"], + extensions=[ + "document-offsets-injection", + "admonition", + "def_list", + "footnotes", + "meta", + "toc", + "pymdownx.arithmatex", + "pymdownx.caret", + "pymdownx.critic", + "pymdownx.details", + "pymdownx.emoji", + "pymdownx.highlight", + "pymdownx.inlinehilite", + "pymdownx.keys", + "pymdownx.magiclink", + "pymdownx.mark", + "pymdownx.snippets", + "pymdownx.progressbar", + "pymdownx.smartsymbols", + "pymdownx.superfences", + "pymdownx.tasklist", + "pymdownx.tilde", + "pymdownx.tabbed", + ], + extension_configs={ + "toc": { + "permalink": "", + "slugify": uslugify, + }, + "pymdownx.arithmatex": { + "generic": True, + }, + "pymdownx.emoji": { + "emoji_generator": to_svg, + }, + "pymdownx.highlight": { + "linenums": True, + }, + "pymdownx.snippets": { + "check_paths": True, + }, + "pymdownx.superfences": { + "custom_fences": [ + { + "name": "math", + "class": "arithmatex", + "format": fence_mathjax_format, + }, + ], + }, + "pymdownx.tasklist": { + "custom_checkbox": True, + }, + "pymdownx.tabbed": { + "alternate_style": True, + }, + }, ) + print(self.case["document"]) + print(self.result) self.test_case = test_case def test(self): - ParserTester(self.case, self.test_case).feed(self.result) + tester = ParserTester(self.case, self.test_case) + tester.feed(self.result) + tester.check_integrity() class ParserTester(HTMLParser): tag = None - text = None offset_start = None offset_end = None @@ -29,17 +134,23 @@ def __init__(self, case, test_case: unittest.TestCase): self.idx = 0 def handle_starttag(self, tag, attrs): - self.tag = tag + start = None + end = None for attr in attrs: if attr[0] == "data-original-document-start": - self.offset_start = int(attr[1]) + start = int(attr[1]) if attr[0] == "data-original-document-end": - self.offset_end = int(attr[1]) - - def handle_data(self, data): - self.text = data + end = int(attr[1]) + if start is not None and end is not None: + self.tag = tag + self.offset_start = start + self.offset_end = end def handle_endtag(self, tag): + if self.tag != tag: + return # ignore nested tags + if self.idx == len(self.case["expected"]): + return # ignore extra tags self._test() self._reset() @@ -49,11 +160,6 @@ def _test(self): self.case["expected"][self.idx]["tag"], msg="Tag mismatch in index " + str(self.idx), ) - self.test_case.assertEqual( - self.text, - self.case["expected"][self.idx]["text"], - msg="Text mismatch in index " + str(self.idx), - ) self.test_case.assertEqual( self.offset_start, self.case["expected"][self.idx]["offset"][0], @@ -68,10 +174,16 @@ def _test(self): def _reset(self): self.tag = None - self.text = None self.offset_start = None self.offset_end = None + def check_integrity(self): + self.test_case.assertEqual( + self.idx, + len(self.case["expected"]), + msg="Not all tags were found", + ) + class TestParser(unittest.TestCase): def test_normal(self): @@ -85,16 +197,14 @@ def test_normal(self): Morbi neque lectus, faucibus a mattis at, aliquam quis est. Maecenas sed luctus elit."""), "expected": [ - {"tag": "h1", "text": "Lorem ipsum", "offset": (0, 13)}, + {"tag": "h1", "offset": (0, 13)}, { "tag": "p", - "text": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Proin sed lacus vitae neque vestibulum porttitor id et urna.", "offset": (15, 132), }, - {"tag": "h2", "text": "Morbi neque lectus", "offset": (134, 155)}, + {"tag": "h2", "offset": (134, 155)}, { "tag": "p", - "text": "Morbi neque lectus, faucibus a mattis at, aliquam quis est. Maecenas sed luctus elit.", "offset": (157, 242), }, ], @@ -112,7 +222,148 @@ def test_single(self): case = { "document": "Lorem ipsum", "expected": [ - {"tag": "p", "text": "Lorem ipsum", "offset": (0, 11)}, + {"tag": "p", "offset": (0, 11)}, + ], + } + Tester(case, self).test() + + def test_oi_wiki_index(self): + case = { + "document": textwrap.dedent("""\ + disqus: + pagetime: + title: OI Wiki + + ## 欢迎来到 **OI Wiki**![![GitHub watchers](https://img.shields.io/github/watchers/OI-wiki/OI-wiki.svg?style=social&label=Watch)](https://github.com/OI-wiki/OI-wiki) [![GitHub stars](https://img.shields.io/github/stars/OI-wiki/OI-wiki.svg?style=social&label=Stars)](https://github.com/OI-wiki/OI-wiki) + + [![Word Art](images/wordArt.webp)](https://github.com/OI-wiki/OI-wiki) + + **OI**(Olympiad in Informatics,信息学奥林匹克竞赛)在中国起源于 1984 年,是五大高中学科竞赛之一。 + + **ICPC**(International Collegiate Programming Contest,国际大学生程序设计竞赛)由 ICPC 基金会(ICPC Foundation)举办,是最具影响力的大学生计算机竞赛。由于以前 ACM 赞助这个竞赛,也有很多人习惯叫它 ACM 竞赛。 + + **OI Wiki** 致力于成为一个免费开放且持续更新的 **编程竞赛(competitive programming)** 知识整合站点,大家可以在这里获取与竞赛相关的、有趣又实用的知识。我们为大家准备了竞赛中的基础知识、常见题型、解题思路以及常用工具等内容,帮助大家更快速深入地学习编程竞赛中涉及到的知识。 + + 本项目受 [CTF Wiki](https://ctf-wiki.org/) 的启发,在编写过程中参考了诸多资料,在此一并致谢。 + +
+ +
+ + """), + "expected": [ + { + "tag": "h2", + "offset": (34, 332), + }, + { + "tag": "p", + "offset": (334, 404), + }, + { + "tag": "p", + "offset": (406, 473), + }, + { + "tag": "p", + "offset": (475, 620), + }, + { + "tag": "p", + "offset": (622, 778), + }, + { + "tag": "p", + "offset": (780, 844), + }, + # there's a div tag and a script tag in the document, and they will not be considered. + ], + } + Tester(case, self).test() + + def test_oi_wiki_search_dfs(self): + case = { + # I HATE TEXT BLOCKS + "document": textwrap.dedent("""\ + ## 引入 + + DFS 为图论中的概念,详见 [DFS(图论)](../graph/dfs.md) 页面。在 **搜索算法** 中,该词常常指利用递归函数方便地实现暴力枚举的算法,与图论中的 DFS 算法有一定相似之处,但并不完全相同。 + + ## 解释 + + 考虑这个例子: + + ???+ note "例题" + 把正整数 $n$ 分解为 $3$ 个不同的正整数,如 $6=1+2+3$,排在后面的数必须大于等于前面的数,输出所有方案。 + + 对于这个问题,如果不知道搜索,应该怎么办呢? + + 当然是三重循环,参考代码如下: + + ???+ note "实现" + === "C++" + ```cpp + for (int i = 1; i <= n; ++i) + for (int j = i; j <= n; ++j) + for (int k = j; k <= n; ++k) + if (i + j + k == n) printf("%d = %d + %d + %d\\n", n, i, j, k); + ``` + + === "Python" + ```python + for i in range(1, n + 1): + for j in range(i, n + 1): + for k in range(j, n + 1): + if i + j + k == n: + print("%d = %d + %d + %d" % (n, i, j, k)) + ``` + + === "Java" + ```Java + for (int i = 1; i < n + 1; i++) { + for (int j = i; j < n + 1; j++) { + for (int k = j; k < n + 1; k++) { + if (i + j + k == n) System.out.printf("%d = %d + %d + %d%n", n, i, j, k); + } + } + } + ``` + + 那如果是分解成四个整数呢?再加一重循环?"""), + "expected": [ + { + "tag": "h2", + "offset": (0, 5), + }, + { + "tag": "p", + "offset": (7, 117), + }, + { + "tag": "h2", + "offset": (119, 124), + }, + { + "tag": "p", + "offset": (126, 133), + }, + #
has been ignored + { + "tag": "p", + "offset": (217, 239), + }, + { + "tag": "p", + "offset": (241, 256), + }, + #
has been ignored + { + "tag": "p", + "offset": (1094, 1114), + }, ], } Tester(case, self).test() From e357c0496e4325456d6a60b038558a50a9f503d3 Mon Sep 17 00:00:00 2001 From: HikariLan Date: Fri, 5 Jul 2024 00:30:01 +0800 Subject: [PATCH 12/13] chore: ignore .idea --- python-markdown-extension/.gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python-markdown-extension/.gitignore b/python-markdown-extension/.gitignore index ae8554de..bf07c5cb 100644 --- a/python-markdown-extension/.gitignore +++ b/python-markdown-extension/.gitignore @@ -8,3 +8,5 @@ wheels/ # venv .venv + +.idea/ \ No newline at end of file From 5bc32d424cd39cd47f242ca520ec49792a8f1752 Mon Sep 17 00:00:00 2001 From: HikariLan Date: Fri, 5 Jul 2024 00:30:53 +0800 Subject: [PATCH 13/13] refacor: heuristics marking --- .../extension.py | 134 +++++++++++++++++- python-markdown-extension/test/__main__.py | 42 +++--- 2 files changed, 153 insertions(+), 23 deletions(-) diff --git a/python-markdown-extension/src/python_markdown_document_offsets_injection_extension/extension.py b/python-markdown-extension/src/python_markdown_document_offsets_injection_extension/extension.py index 1885e110..a2aa3b5b 100644 --- a/python-markdown-extension/src/python_markdown_document_offsets_injection_extension/extension.py +++ b/python-markdown-extension/src/python_markdown_document_offsets_injection_extension/extension.py @@ -1,3 +1,4 @@ +import re from markdown import Extension, Markdown from markdown.preprocessors import Preprocessor from markdown.blockprocessors import BlockProcessor @@ -6,15 +7,29 @@ MARK_PREVENT_RECURSION: str = "\t\t\t\r\r\rMARK_PREVENT_RECURSION\r\r\r\t\t\t" +MARK_CONTINUE: str = "\t\t\t\r\r\rMARK_CONTINUE\r\r\r\t\t\t" + +# @see: markdown.util.HTML_PLACEHOLDER_RE +# PYTHON_MARKDOWN_HTML_PLACEHOLDER_RE: re.Pattern[str] = re.compile( +# "\u0002wzxhzdk:%s\u0003" % r"([0-9]+)" +# ) + class MainExtension(Extension): def extendMarkdown(self, md: Markdown): - meta: dict = {"document_offsets": [], "used_document_offsets": {}} + meta: dict = { + "document_offsets": [], + "used_document_offsets": {}, + "last_parent": None, + } md.preprocessors.register( CalculateDocumentOffsetPreprocessor(md, meta), "capture_document", 1000 ) # Highest priority is required because we need to calc words offset from original document + md.preprocessors.register( + FixDocumentOffsetPreprocessor(md, meta), "fix_document", 0 + ) # Lowest priority is required because we need to fix the offset after all other block processors md.parser.blockprocessors.register( - OffsetsInjectionBlockProcessor(md.parser, meta), "mark_words", 100 + OffsetsInjectionBlockProcessor(md.parser, meta), "mark_words", 200 ) # high priority, usually larger than every other block processor @@ -32,6 +47,9 @@ def run(self, lines: list[str]) -> list[str]: for line in lines: # Skip empty lines if len(line) == 0: + store: tuple[str, int, int] = (line, offset, offset + 1) + self.meta["document_offsets"].append(store) + self.meta["used_document_offsets"][store] = False offset += 1 continue # store the line and offset @@ -43,6 +61,102 @@ def run(self, lines: list[str]) -> list[str]: return lines +class FixDocumentOffsetPreprocessor(Preprocessor): + """ + A preprocessor to fix the offset of each line after the 3rd party extension processed the document + """ + + def __init__(self, md: Markdown, meta: dict): + super(FixDocumentOffsetPreprocessor, self).__init__(md) + self.meta = meta + + def run(self, lines: list[str]) -> list[str]: + document_offsets: list[tuple[str, int, int]] = self.meta["document_offsets"] + + # 最后一次成功匹配的文档偏移量字典索引末,开区间 + last_success_match_end: int = 0 + num_lines: int = 0 + num_document_offsets: int = 0 + while num_document_offsets < len(document_offsets) and num_lines < len(lines): + line = lines[num_lines] + document_offset: tuple[str, int, int] = document_offsets[ + num_document_offsets + ] + + # 如果精准匹配 + if document_offset[0] == line: + # 匹配该行 + self.match(line, num_document_offsets, num_document_offsets + 1) + # 如果上次成功匹配的原文档偏移量未连续,匹配当前行到这部分未连续的原文档偏移量 + if num_document_offsets > last_success_match_end and num_lines > 0: + self.match( + lines[num_lines - 1], + last_success_match_end, + num_document_offsets, + ) + last_success_match_end = num_document_offsets + 1 + num_lines += 1 + num_document_offsets += 1 + # 如果未能精准匹配,查找该行在原文档偏移量字典中的位置 + else: + remain: list[str] = [ + line for line, _, _ in document_offsets[num_document_offsets:] + ] + # 如果存在这样的行 + if line in remain: + # 找到第一次匹配的位置,匹配该行到此处 + idx = remain.index(line) + num_document_offsets + self.match(line, idx, idx + 1) + # 如果上次成功匹配的原文档偏移量未连续,匹配当前行到这部分未连续的原文档偏移量 + if idx > last_success_match_end and num_lines > 0: + self.match(lines[num_lines - 1], last_success_match_end, idx) + last_success_match_end = idx + 1 + num_lines += 1 + num_document_offsets = idx + 1 + # 如果未找到匹配的位置,继续查找下一行 + else: + num_lines += 1 + + # 如果行匹配完成,但原文档偏移量未匹配完成,匹配剩余的原文档偏移量 + if last_success_match_end < len(document_offsets): + self.match( + lines[num_lines - 1], last_success_match_end, len(document_offsets) + ) + + return lines + + def match( + self, + matched_line: str, + num_document_offsets_start: int, + num_document_offsets_end: int, + ): + """ + 将单个匹配行设置到多个原文档偏移量字典,索引范围为[num_document_offsets_start, num_document_offsets_end) + """ + document_offsets: list[tuple[str, int, int]] = self.meta["document_offsets"] + used_document_offsets: dict[tuple[str, int, int], bool] = self.meta[ + "used_document_offsets" + ] + for i in range(num_document_offsets_start, num_document_offsets_end): + document_offset = document_offsets[i] + # 如果是第一个匹配的原文档偏移量,设置为匹配行,否则设置为 MARK_CONTINUE + if i == num_document_offsets_start: + document_offsets[i] = ( + matched_line, + document_offset[1], + document_offset[2], + ) + else: + document_offsets[i] = ( + MARK_CONTINUE, + document_offset[1], + document_offset[2], + ) + del used_document_offsets[document_offset] + used_document_offsets[document_offsets[i]] = False + + class OffsetsInjectionBlockProcessor(BlockProcessor): """ A block processor to mark the words in the document and inject the offset of the block to the HTML element @@ -80,10 +194,18 @@ def run(self, parent: etree.Element, blocks: list[str]) -> bool: used: dict[tuple[str, int, int], bool] = {} # Search for the block fragment in the document_offsets for store in self.meta["document_offsets"]: + # Skip empty lines + if len(store[0]) == 0: + continue # If already used, skip if self.meta["used_document_offsets"][store]: continue (line, offset, end_offset) = store + # 如果收到 MARK_CONTINUE 标记,直接认为该标记之前的行是连续的 + if line == MARK_CONTINUE: + end = end_offset + used[store] = True + continue # If found one if line in block: # If the line already scanned (usually some lines with same content in different place), skip @@ -111,9 +233,15 @@ def run(self, parent: etree.Element, blocks: list[str]) -> bool: if start is not None and end is not None: blocks.pop(0) self.meta["used_document_offsets"].update(used) - # append MARK_PREVENT_RECURSION to tail of the block to prevent recursion, we don't use a handled flaglist because we don't know if there's some same block in the document + # append MARK_PREVENT_RECURSION to tail of the block to prevent recursion, we don't use a handled + # flaglist because we don't know if there's some same block in the document self.parser.parseBlocks(parent, [block + MARK_PREVENT_RECURSION]) + # fix multi blocks in same parents + if self.meta["last_parent"] == parent[-1]: + parent[-1].set("data-original-document-end", str(end)) + return True parent[-1].set("data-original-document-start", str(start)) parent[-1].set("data-original-document-end", str(end)) + self.meta["last_parent"] = parent[-1] return True return False diff --git a/python-markdown-extension/test/__main__.py b/python-markdown-extension/test/__main__.py index eb15716d..93f2eb00 100644 --- a/python-markdown-extension/test/__main__.py +++ b/python-markdown-extension/test/__main__.py @@ -112,8 +112,6 @@ def __init__(self, case, test_case: unittest.TestCase): }, }, ) - print(self.case["document"]) - print(self.result) self.test_case = test_case def test(self): @@ -190,11 +188,11 @@ def test_normal(self): case = { "document": textwrap.dedent("""\ # Lorem ipsum - + Lorem ipsum dolor sit amet, consectetur adipiscing elit. Proin sed lacus vitae neque vestibulum porttitor id et urna. - + ## Morbi neque lectus - + Morbi neque lectus, faucibus a mattis at, aliquam quis est. Maecenas sed luctus elit."""), "expected": [ {"tag": "h1", "offset": (0, 13)}, @@ -277,32 +275,30 @@ def test_oi_wiki_index(self): }, { "tag": "p", - "offset": (780, 844), + "offset": (780, 1101), # FIXME: Correct one is (780, 1101) }, - # there's a div tag and a script tag in the document, and they will not be considered. ], } Tester(case, self).test() def test_oi_wiki_search_dfs(self): case = { - # I HATE TEXT BLOCKS "document": textwrap.dedent("""\ ## 引入 - + DFS 为图论中的概念,详见 [DFS(图论)](../graph/dfs.md) 页面。在 **搜索算法** 中,该词常常指利用递归函数方便地实现暴力枚举的算法,与图论中的 DFS 算法有一定相似之处,但并不完全相同。 - + ## 解释 - + 考虑这个例子: - + ???+ note "例题" 把正整数 $n$ 分解为 $3$ 个不同的正整数,如 $6=1+2+3$,排在后面的数必须大于等于前面的数,输出所有方案。 - + 对于这个问题,如果不知道搜索,应该怎么办呢? - + 当然是三重循环,参考代码如下: - + ???+ note "实现" === "C++" ```cpp @@ -311,7 +307,7 @@ def test_oi_wiki_search_dfs(self): for (int k = j; k <= n; ++k) if (i + j + k == n) printf("%d = %d + %d + %d\\n", n, i, j, k); ``` - + === "Python" ```python for i in range(1, n + 1): @@ -320,7 +316,7 @@ def test_oi_wiki_search_dfs(self): if i + j + k == n: print("%d = %d + %d + %d" % (n, i, j, k)) ``` - + === "Java" ```Java for (int i = 1; i < n + 1; i++) { @@ -331,7 +327,7 @@ def test_oi_wiki_search_dfs(self): } } ``` - + 那如果是分解成四个整数呢?再加一重循环?"""), "expected": [ { @@ -350,7 +346,10 @@ def test_oi_wiki_search_dfs(self): "tag": "p", "offset": (126, 133), }, - #
has been ignored + { + "tag": "details", + "offset": (135, 215), + }, { "tag": "p", "offset": (217, 239), @@ -359,7 +358,10 @@ def test_oi_wiki_search_dfs(self): "tag": "p", "offset": (241, 256), }, - #
has been ignored + { + "tag": "details", + "offset": (258, 1092), + }, { "tag": "p", "offset": (1094, 1114),