Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add compiler plugin for python-markdown #1

Merged
merged 13 commits into from
Jul 6, 2024
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# CI workflow for the python-markdown-extension sub-project:
# checks formatting and runs the test script on pull requests.
name: Check PR Format and Test for python-markdown-extension

# Only run for pull requests that target master and touch files under
# python-markdown-extension/.
on:
  pull_request:
    branches:
      - master
    paths:
      - python-markdown-extension/**

jobs:
  # Job 1: fail the PR when `rye fmt --check` reports formatting differences.
  check-format:
    name: Check PR Format
    runs-on: ubuntu-latest
    defaults:
      run:
        # Every `run` step executes inside the sub-project directory.
        working-directory: ./python-markdown-extension
    steps:
      - uses: actions/checkout@v4
        name: Checkout Repo
      - uses: eifinger/setup-rye@v3
        name: Setup Rye
        with:
          enable-cache: true
          working-directory: python-markdown-extension
      - run: rye sync
        name: Install Dependencies
      - run: rye fmt --check
        name: Check Format
  # Job 2: run the project's `test` rye script.
  test:
    name: Test PR
    runs-on: ubuntu-latest
    defaults:
      run:
        # Every `run` step executes inside the sub-project directory.
        working-directory: ./python-markdown-extension
    steps:
      - uses: actions/checkout@v4
        name: Checkout Repo
      - uses: eifinger/setup-rye@v3
        name: Setup Rye
        with:
          enable-cache: true
          working-directory: python-markdown-extension
      - run: rye sync
        name: Install Dependencies
      - run: rye run test
        name: Run Tests
36 changes: 0 additions & 36 deletions .gitignore

This file was deleted.

13 changes: 0 additions & 13 deletions index.html

This file was deleted.

15 changes: 0 additions & 15 deletions package.json

This file was deleted.

1 change: 0 additions & 1 deletion public/vite.svg

This file was deleted.

10 changes: 10 additions & 0 deletions python-markdown-extension/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# python generated files
__pycache__/
*.py[oc]
build/
dist/
wheels/
*.egg-info

# venv
.venv
1 change: 1 addition & 0 deletions python-markdown-extension/.python-version
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
3.12.3
33 changes: 33 additions & 0 deletions python-markdown-extension/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
[project]
name = "python_markdown_document_offsets_injection_extension"
version = "0.0.1"
description = "A Python-Markdown compiler plugin that puts markdown word offsets into the output HTML."
authors = [{ name = "HikariLan", email = "[email protected]" }]
license = { text = "Apache-2.0" }
dependencies = [
"markdown>=3.6",
]
requires-python = ">= 3.8"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.rye]
managed = true
dev-dependencies = [
"pygments>=2.18.0",
"pymdown-extensions>=10.8.1",
]

[tool.hatch.metadata]
allow-direct-references = true

[tool.hatch.build.targets.wheel]
packages = ["src/python_markdown_document_offsets_injection_extension"]

[tool.rye.scripts]
test = "python ./test"

[project.entry-points."markdown.extensions"]
document-offsets-injection = "python_markdown_document_offsets_injection_extension.extension:MainExtension"
18 changes: 18 additions & 0 deletions python-markdown-extension/requirements-dev.lock
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# generated by rye
# use `rye lock` or `rye sync` to update this lockfile
#
# last locked with the following flags:
# pre: false
# features: []
# all-features: false
# with-sources: false
# generate-hashes: false

-e file:.
markdown==3.6
# via pymdown-extensions
# via python-markdown-document-offsets-injection-extension
pygments==2.18.0
pymdown-extensions==10.8.1
pyyaml==6.0.1
# via pymdown-extensions
13 changes: 13 additions & 0 deletions python-markdown-extension/requirements.lock
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# generated by rye
# use `rye lock` or `rye sync` to update this lockfile
#
# last locked with the following flags:
# pre: false
# features: []
# all-features: false
# with-sources: false
# generate-hashes: false

-e file:.
markdown==3.6
# via python-markdown-document-offsets-injection-extension
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
from markdown import Extension, Markdown
from markdown.preprocessors import Preprocessor
from markdown.blockprocessors import BlockProcessor
from markdown.blockparser import BlockParser
import xml.etree.ElementTree as etree

# Sentinel that OffsetsInjectionBlockProcessor.run appends to a block before
# re-parsing it. When the processor meets a block that contains the sentinel,
# it strips the sentinel and declines the block, which stops it from
# recursing into its own re-parse forever.
MARK_PREVENT_RECURSION: str = "\t\t\t\r\r\rMARK_PREVENT_RECURSION\r\r\r\t\t\t"


class MainExtension(Extension):
    """Python-Markdown extension that wires up the offset-injection pipeline.

    Two cooperating processors are installed; they communicate through one
    shared state dictionary created per ``extendMarkdown`` call.
    """

    def extendMarkdown(self, md: Markdown):
        """Register the offset preprocessor and the injecting block processor.

        Args:
            md: The ``Markdown`` instance being configured.
        """
        # State shared by both processors: the recorded line offsets and a
        # map telling which of them have already been consumed.
        shared_state: dict = {"document_offsets": [], "used_document_offsets": {}}

        offset_collector = CalculateDocumentOffsetPreprocessor(md, shared_state)
        offset_injector = OffsetsInjectionBlockProcessor(md.parser, shared_state)

        # Priority 1000: the collector must see the original document, before
        # any other preprocessor rewrites it, so the offsets it computes refer
        # to the source text.
        md.preprocessors.register(offset_collector, "capture_document", 1000)

        # Priority 100: high, intended to be at least as large as the other
        # block processors so this one gets to look at blocks early.
        md.parser.blockprocessors.register(offset_injector, "mark_words", 100)


class CalculateDocumentOffsetPreprocessor(Preprocessor):
    """Preprocessor that records the character offsets of each non-empty line.

    For every non-empty line a ``(line, start, end)`` tuple is appended to
    ``meta["document_offsets"]`` and registered as unused in
    ``meta["used_document_offsets"]``. ``start``/``end`` are offsets into the
    document, counting one character for the newline after each line.
    """

    def __init__(self, md: Markdown, meta: dict):
        """Store the shared state dictionary.

        Args:
            md: The ``Markdown`` instance this preprocessor belongs to.
            meta: Shared state; must contain the ``"document_offsets"`` list
                and the ``"used_document_offsets"`` dict.
        """
        super().__init__(md)
        self.meta = meta

    def run(self, lines: list[str]) -> list[str]:
        """Record line offsets for ``lines`` and return ``lines`` unchanged.

        Args:
            lines: The document split into lines (without line terminators).

        Returns:
            The same ``lines`` list, unmodified.
        """
        document_offsets = self.meta["document_offsets"]
        used_document_offsets = self.meta["used_document_offsets"]

        # Start from a clean slate on every run. Without this, converting a
        # second document with the same Markdown instance would still see
        # the lines and offsets recorded for the previous document.
        # Cleared in place so every holder of ``meta`` observes the reset.
        document_offsets.clear()
        used_document_offsets.clear()

        offset: int = 0
        for line in lines:
            # Empty lines are not recorded; they only advance the offset.
            if line:
                store: tuple[str, int, int] = (line, offset, offset + len(line))
                document_offsets.append(store)
                used_document_offsets[store] = False
            # The extra 1 accounts for the newline character (\n). Behavior
            # with CRLF line endings is not defined.
            offset += len(line) + 1
        return lines


class OffsetsInjectionBlockProcessor(BlockProcessor):
    """Block processor that stamps source offsets onto generated HTML elements.

    It matches each block against the line offsets stored in
    ``meta["document_offsets"]``, lets the remaining block processors parse
    the block, and then writes the block's start/end offsets as
    ``data-original-document-start`` / ``data-original-document-end``
    attributes on the last child of the parent element.
    """

    def __init__(self, parser: BlockParser, meta: dict):
        """Store the shared state dictionary.

        Args:
            parser: The block parser this processor is registered on.
            meta: Shared state; must contain the ``"document_offsets"`` list
                and the ``"used_document_offsets"`` dict.
        """
        super().__init__(parser)
        self.meta = meta

    def test(self, _, block) -> bool:
        """Return True when any recorded source line occurs in ``block``."""
        return any(
            line in block for line, _start, _end in self.meta["document_offsets"]
        )

    def run(self, parent: etree.Element, blocks: list[str]) -> bool:
        """Inject the source offsets of the first block into its HTML element.

        The offsets let later processing map an HTML element back to a
        position in the original document. The algorithm:

        1. Take the first block.
        2. Locate it in the document by walking the recorded lines and
           collecting the run of not-yet-used lines contained in the block;
           the start of the first and the end of the last matching line
           give the block's position.
        3. Re-parse the block through the parser, then write the start and
           end offsets onto the last element generated under ``parent``.

        Because the re-parsed block would be captured by this processor
        again, ``MARK_PREVENT_RECURSION`` is appended to it first. When this
        processor meets a block carrying the marker, it removes the marker
        and declines the block so the other processors can handle it.

        Args:
            parent: Element that receives the parsed block.
            blocks: Remaining blocks; ``blocks[0]`` is the one examined.

        Returns:
            True when the block was consumed, False to let other block
            processors handle ``blocks[0]``.
        """
        block: str = blocks[0]

        # Already handled once: strip the marker and hand the block over to
        # the other block processors.
        if MARK_PREVENT_RECURSION in block:
            blocks[0] = block.replace(MARK_PREVENT_RECURSION, "")
            return False

        start: int | None = None
        end: int | None = None
        used: dict[tuple[str, int, int], bool] = {}
        # Text of the lines matched so far; a set keeps the duplicate check
        # cheap instead of rebuilding a list on every iteration.
        seen_lines: set[str] = set()
        used_document_offsets = self.meta["used_document_offsets"]

        for store in self.meta["document_offsets"]:
            # Lines consumed by an earlier block are never matched again.
            if used_document_offsets[store]:
                continue
            line, line_start, line_end = store
            if line not in block:
                if end is None:
                    # Nothing matched yet: keep looking for the first line.
                    continue
                # The contiguous run of matching lines has ended.
                break
            # Identical text at a different position: ignore the repeat.
            if line in seen_lines:
                continue
            if start is None:
                start = line_start
            end = line_end
            used[store] = True
            seen_lines.add(line)

        if start is None or end is None:
            return False

        blocks.pop(0)
        used_document_offsets.update(used)
        # A marker is used instead of a "handled" list because the same
        # block text may legitimately appear several times in the document.
        self.parser.parseBlocks(parent, [block + MARK_PREVENT_RECURSION])
        # The re-parse may produce no element at all (for example a block
        # that only defines a link reference); with an empty parent there is
        # nothing to annotate and indexing parent[-1] would raise IndexError.
        # NOTE(review): when the re-parse adds nothing but parent already has
        # children, the previous sibling is still annotated, as before.
        if len(parent) > 0:
            target = parent[-1]
            target.set("data-original-document-start", str(start))
            target.set("data-original-document-end", str(end))
        return True
Loading