diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 3482843b..cac2781e 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -20,7 +20,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ['3.10', '3.11']
+        python-version: ['3.9', '3.10', '3.11', '3.12']
     steps:
       - uses: actions/checkout@v4
       - uses: actions/setup-python@v4
@@ -31,10 +31,12 @@ jobs:
           python -m pip install -U pip
           python -m pip install --use-pep517 '.[dev]'
       - run: make test
+        env:
+          PYTHONWARNINGS: default
       - name: Remove huge file taxondata_py.html
         run: rm -f htmlcov/*_taxondata_py.html
       - uses: actions/upload-pages-artifact@v2
-        if: github.ref_name == 'master' && matrix.python-version == '3.11'
+        if: github.ref_name == 'master' && matrix.python-version == '3.12'
         with:
           path: htmlcov
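
Note on the new `env` block: CPython hides `DeprecationWarning` raised outside `__main__` by default, so without `PYTHONWARNINGS: default` the `make test` step could silently swallow deprecation noise from the newly added 3.9 and 3.12 interpreters. A minimal sketch of the in-process equivalent (the helper is hypothetical, not part of this repo):

```python
import warnings


def deprecated_helper() -> None:
    # Hypothetical library call that warns on use.
    warnings.warn("use new_helper() instead", DeprecationWarning, stacklevel=2)


if __name__ == "__main__":
    # Same effect as PYTHONWARNINGS=default, for this process only:
    # print each unique warning once instead of hiding it.
    warnings.simplefilter("default")
    deprecated_helper()
```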
diff --git a/tests/test_de_page.py b/tests/test_de_page.py
index 4451898c..a6b40eac 100644
--- a/tests/test_de_page.py
+++ b/tests/test_de_page.py
@@ -1,16 +1,15 @@
 # Tests for parsing a page from the German Wiktionary
 import unittest
-
 from collections import defaultdict
 from wikitextprocessor import Wtp
 from wiktextract.config import WiktionaryConfig
 from wiktextract.extractor.de.page import (
+    fix_level_hierarchy_of_subsections,
     parse_page,
     parse_section,
-    fix_level_hierarchy_of_subsections,
 )
 from wiktextract.thesaurus import close_thesaurus_db
 from wiktextract.wxr_context import WiktextractContext
diff --git a/wiktextract/extractor/de/gloss.py b/wiktextract/extractor/de/gloss.py
index b209f455..ffce2c82 100644
--- a/wiktextract/extractor/de/gloss.py
+++ b/wiktextract/extractor/de/gloss.py
@@ -1,8 +1,7 @@
+import re
 from collections import defaultdict
 from typing import Dict, List
-import re
-
 from wikitextprocessor import NodeKind, WikiNode
 from wiktextract.page import clean_node
diff --git a/wiktextract/extractor/de/page.py b/wiktextract/extractor/de/page.py
index 57537790..39d67da4 100644
--- a/wiktextract/extractor/de/page.py
+++ b/wiktextract/extractor/de/page.py
@@ -1,11 +1,9 @@
 import copy
 import logging
-
 from collections import defaultdict
 from typing import Dict, List, Union
 from wikitextprocessor import NodeKind, WikiNode
-
 from wikitextprocessor.parser import LevelNode
 from wiktextract.datautils import append_base_data
@@ -322,50 +320,8 @@ def parse_page(
     )

     page_data = []
-    for node in filter(lambda n: isinstance(n, WikiNode), tree.children):
-        # ignore certain top level templates
-        if node.kind == NodeKind.TEMPLATE:
-            template_name = node.template_name
-
-            # Mostly meta-templates at the top of the page that do not carry
-            # any semantic information
-            IGNORE_TOP_LEVEL_TEMPLATES = {
-                "Wort der Woche",
-                "Siehe auch",
-                "erweitern",
-                "Abschnitte fehlen",
-                "überarbeiten",
-                "Zeichen",
-                "Wortart fehlt",
-                "TOC limit",
-                "Neuer Eintrag",
-                "Löschantrag/Vorlage",
-                "keine Belegstelle/Vorlage",
-                "Anmerkung Keilschrift",
-                "In Arbeit",
-                "Halbgeschützte Seite",
-                "anpassen",
-            }
-
-            if template_name in IGNORE_TOP_LEVEL_TEMPLATES:
-                continue
-
-        # ignore certain top level magic words
-        if node.kind == NodeKind.MAGIC_WORD and node.sarg in {
-            "__TOC__",
-            "__NOTOC__",
-            "__NOEDITSECTION__",
-        }:
-            continue
-
-        if node.kind != NodeKind.LEVEL2:
-            wxr.wtp.warning(
-                f"Unexpected top-level node: {node}",
-                sortid="extractor/de/page/parse_page/61",
-            )
-            continue
-
-        for subtitle_template in node.find_content(NodeKind.TEMPLATE):
+    for level2_node in tree.find_child(NodeKind.LEVEL2):
+        for subtitle_template in level2_node.find_content(NodeKind.TEMPLATE):
             # The language sections are marked with
             # == <title> ({{Sprache|<lang_name>}}) ==
             # where <title> is the title of the page and <lang_name> is the
@@ -389,6 +345,6 @@ def parse_page(
                     },
                 )
                 page_data.append(copy.deepcopy(base_data))
-                parse_section(wxr, page_data, base_data, node.children)
+                parse_section(wxr, page_data, base_data, level2_node.children)

     return page_data
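
The refactor above is the heart of this change: `tree.find_child(NodeKind.LEVEL2)` yields only direct children of the requested kind, so the hand-rolled skipping of top-level templates, magic words, and other unexpected nodes becomes unnecessary. A minimal sketch of the pattern, assuming wikitextprocessor's `Wtp.start_page`/`Wtp.parse` entry points and that an undefined template still parses as a TEMPLATE node:

```python
from wikitextprocessor import NodeKind, Wtp

wtp = Wtp()
wtp.start_page("Beispiel")
tree = wtp.parse("{{Siehe auch|beispiel}}\n== Beispiel ({{Sprache|Deutsch}}) ==\n")

# The top-level {{Siehe auch}} template is skipped implicitly: find_child()
# yields only direct children matching the requested node kind.
for level2_node in tree.find_child(NodeKind.LEVEL2):
    # find_content() searches the heading itself (not the section body)
    # for the language-marking template.
    for subtitle_template in level2_node.find_content(NodeKind.TEMPLATE):
        print(subtitle_template.template_name)  # "Sprache"
```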
diff --git a/wiktextract/extractor/fr/page.py b/wiktextract/extractor/fr/page.py
index d968e3f2..0f797e4a 100644
--- a/wiktextract/extractor/fr/page.py
+++ b/wiktextract/extractor/fr/page.py
@@ -161,22 +161,8 @@ def parse_page(
     )

     page_data = []
-    for node in filter(lambda n: isinstance(n, WikiNode), tree.children):
-        # ignore link created by `voir` template at the page top
-        if node.kind == NodeKind.TEMPLATE:
-            template_name = node.template_name
-            if template_name in {"voir", "voir2"} or template_name.startswith(
-                "voir/"
-            ):
-                continue
-        if node.kind != NodeKind.LEVEL2:
-            wxr.wtp.warning(
-                f"Unexpected top-level node: {node}",
-                sortid="extractor/fr/page/parse_page/94",
-            )
-            continue
-
-        for subtitle_template in node.find_content(NodeKind.TEMPLATE):
+    for level2_node in tree.find_child(NodeKind.LEVEL2):
+        for subtitle_template in level2_node.find_content(NodeKind.TEMPLATE):
             # https://fr.wiktionary.org/wiki/Modèle:langue
             # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_langues
             if subtitle_template.template_name == "langue":
@@ -197,9 +183,9 @@ def parse_page(
                 base_data.update(categories_and_links)
                 page_data.append(copy.deepcopy(base_data))
                 etymology_data: Optional[EtymologyData] = None
-                for level_three_node in node.find_child(NodeKind.LEVEL3):
+                for level3_node in level2_node.find_child(NodeKind.LEVEL3):
                     new_etymology_data = parse_section(
-                        wxr, page_data, base_data, level_three_node
+                        wxr, page_data, base_data, level3_node
                     )
                     if new_etymology_data is not None:
                         etymology_data = new_etymology_data
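
One level down, the fr extractor visits the LEVEL3 subsections in order and keeps the most recent non-`None` value that `parse_section` returns, since only the etymology section produces `EtymologyData`. A sketch of that keep-last-result pattern in isolation (the callable and the plain-`dict` payload are stand-ins for `parse_section` and `EtymologyData`):

```python
from typing import Callable, Iterable, Optional


def last_etymology(
    sections: Iterable[object],
    parse: Callable[[object], Optional[dict]],
) -> Optional[dict]:
    # Mirror of the fr loop: most sections return None, so remember only
    # the latest section that actually produced etymology data.
    etymology_data: Optional[dict] = None
    for section in sections:
        new_etymology_data = parse(section)
        if new_etymology_data is not None:
            etymology_data = new_etymology_data
    return etymology_data
```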
diff --git a/wiktextract/extractor/zh/headword_line.py b/wiktextract/extractor/zh/headword_line.py
index 24e6516c..223e8630 100644
--- a/wiktextract/extractor/zh/headword_line.py
+++ b/wiktextract/extractor/zh/headword_line.py
@@ -104,9 +104,7 @@ def extract_headword_line(
             for span_child in child.find_html(
                 "strong", attr_name="class", attr_value="headword"
             ):
-                ruby_data, node_without_ruby = extract_ruby(
-                    wxr, span_child
-                )
+                ruby_data, node_without_ruby = extract_ruby(wxr, span_child)
                 page_data[-1]["forms"].append(
                     {
                         "form": clean_node(
@@ -114,7 +112,7 @@ def extract_headword_line(
                         ),
                         "ruby": ruby_data,
                         "tags": ["canonical"],
-                    }
+                    },
                 )
         elif child.tag == "b":
             # this is a form tag, already inside form parentheses
diff --git a/wiktextract/extractor/zh/page.py b/wiktextract/extractor/zh/page.py
index 3f2866fc..3d726d0e 100644
--- a/wiktextract/extractor/zh/page.py
+++ b/wiktextract/extractor/zh/page.py
@@ -264,30 +264,12 @@ def parse_page(
     )

     page_data = []
-    for node in filter(lambda n: isinstance(n, WikiNode), tree.children):
-        # ignore link created by `also` template at the page top
-        # also ignore "character info" templates
-        if node.kind == NodeKind.TEMPLATE and node.template_name.lower() in {
-            "also",
-            "see also",
-            "亦",
-            "character info",
-            "character info/new",
-            "character info/var",
-        }:
-            continue
-        if node.kind != NodeKind.LEVEL2:
-            wxr.wtp.warning(
-                f"Unexpected top-level node: {node}",
-                sortid="extractor/zh/page/parse_page/503",
-            )
-            continue
-
+    for level2_node in tree.find_child(NodeKind.LEVEL2):
         categories_and_links = defaultdict(list)
-        lang_name = clean_node(wxr, categories_and_links, node.largs)
+        lang_name = clean_node(wxr, categories_and_links, level2_node.largs)
         if lang_name not in wxr.config.LANGUAGES_BY_NAME:
             wxr.wtp.warning(
-                f"Unrecognized language name at top-level {lang_name}",
+                f"Unrecognized language name: {lang_name}",
                 sortid="extractor/zh/page/parse_page/509",
             )
         lang_code = wxr.config.LANGUAGES_BY_NAME.get(lang_name)
@@ -304,6 +286,6 @@ def parse_page(
         )
         base_data.update(categories_and_links)
         page_data.append(copy.deepcopy(base_data))
-        parse_section(wxr, page_data, base_data, node.children)
+        parse_section(wxr, page_data, base_data, level2_node.children)

     return page_data
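
For the zh extractor, the per-language loop now reads as: render the LEVEL2 heading with `clean_node` (which also collects category/link side effects into the passed `defaultdict`), validate the name against `wxr.config.LANGUAGES_BY_NAME`, and carry on with a `None` code rather than aborting. A sketch of that lookup step, assuming `wxr` is a `WiktextractContext` as in the tests above:

```python
from collections import defaultdict

from wiktextract.page import clean_node


def resolve_language(wxr, level2_node):
    categories_and_links = defaultdict(list)
    # clean_node() renders the heading arguments to plain text and records
    # any categories or links it encounters into categories_and_links.
    lang_name = clean_node(wxr, categories_and_links, level2_node.largs)
    if lang_name not in wxr.config.LANGUAGES_BY_NAME:
        # Warn but keep going; lang_code below simply ends up None.
        wxr.wtp.warning(
            f"Unrecognized language name: {lang_name}",
            sortid="extractor/zh/page/parse_page/509",
        )
    lang_code = wxr.config.LANGUAGES_BY_NAME.get(lang_name)
    return lang_name, lang_code, categories_and_links
```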