From 060f600f2da9ceda353ebf6ccf2d99a08f5c404b Mon Sep 17 00:00:00 2001 From: xxyzz Date: Fri, 13 Oct 2023 10:06:23 +0800 Subject: [PATCH 1/2] Fix "A-z" regex range overlaps with "a-z" error --- src/wiktextract/clean.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/wiktextract/clean.py b/src/wiktextract/clean.py index 1556eec0..5b3409aa 100644 --- a/src/wiktextract/clean.py +++ b/src/wiktextract/clean.py @@ -1438,7 +1438,7 @@ def repl_1_syntaxhighlight(m): ) title = re.sub(r"(?s)\[\[\s*:?([^]|#<>]+?)\s*(#[^][|<>]*?)?\]\]", repl_1, title) - title = re.sub(r"(?s)\[\[\s*(([a-zA-z0-9]+)\s*:)?\s*([^][#|<>]+?)" + title = re.sub(r"(?s)\[\[\s*(([a-zA-Z0-9]+)\s*:)?\s*([^][#|<>]+?)" r"\s*(#[^][|]*?)?\|?\]\]", repl_link, title) title = re.sub(r"(?s)\[\[\s*([^][|<>]+?)\s*\|" From 01ba40afc08ab0275551ec84b7c1f213defce755 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Fri, 13 Oct 2023 14:28:40 +0800 Subject: [PATCH 2/2] Fix regex backtracking alert --- src/wiktextract/clean.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/wiktextract/clean.py b/src/wiktextract/clean.py index 5b3409aa..8b1e9fec 100644 --- a/src/wiktextract/clean.py +++ b/src/wiktextract/clean.py @@ -1376,11 +1376,11 @@ def repl_1_syntaxhighlight(m): # Remove <span> with previewonly class (generated e.g. by {{taxlink|...}}) title = re.sub(r'(?si)<span\b[^>]*?\bclass="[^"<>]*?' r'\bpreviewonly\b[^>]*?>' - r'((<[^<>]>[^<>]*</[^<>]*>)|.)*?</span>', + r'.+?</span>', "", title) # Remove <strong class="error">...</strong> title = re.sub(r'(?si)<strong\b[^>]*?\bclass="[^"]*?\berror\b[^>]*?>' - r'((<.*?</.[^>]>)|.)*?</strong>', + r'.+?</strong>', "", title) # Change
<div> and </div> to newlines. Ditto for tr, li, table, dl, ul, ol title = re.sub(r"(?si)</?(div|tr|li|table|dl|ul|ol)\b[^>]*>",