diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 3482843b..cac2781e 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -20,7 +20,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ['3.10', '3.11']
+        python-version: ['3.9', '3.10', '3.11', '3.12']
     steps:
       - uses: actions/checkout@v4
       - uses: actions/setup-python@v4
@@ -31,10 +31,12 @@ jobs:
           python -m pip install -U pip
           python -m pip install --use-pep517 '.[dev]'
       - run: make test
+        env:
+          PYTHONWARNINGS: default
       - name: Remove huge file taxondata_py.html
         run: rm -f htmlcov/*_taxondata_py.html
       - uses: actions/upload-pages-artifact@v2
-        if: github.ref_name == 'master' && matrix.python-version == '3.11'
+        if: github.ref_name == 'master' && matrix.python-version == '3.12'
         with:
           path: htmlcov
 
diff --git a/tests/test_de_page.py b/tests/test_de_page.py
index 4451898c..a6b40eac 100644
--- a/tests/test_de_page.py
+++ b/tests/test_de_page.py
@@ -1,16 +1,15 @@
 # Tests for parsing a page from the German Wiktionary
 
 import unittest
-
 from collections import defaultdict
 
 from wikitextprocessor import Wtp
 
 from wiktextract.config import WiktionaryConfig
 from wiktextract.extractor.de.page import (
+    fix_level_hierarchy_of_subsections,
     parse_page,
     parse_section,
-    fix_level_hierarchy_of_subsections,
 )
 from wiktextract.thesaurus import close_thesaurus_db
 from wiktextract.wxr_context import WiktextractContext
diff --git a/wiktextract/extractor/de/gloss.py b/wiktextract/extractor/de/gloss.py
index b209f455..ffce2c82 100644
--- a/wiktextract/extractor/de/gloss.py
+++ b/wiktextract/extractor/de/gloss.py
@@ -1,8 +1,7 @@
+import re
 from collections import defaultdict
 from typing import Dict, List
 
-import re
-
 from wikitextprocessor import NodeKind, WikiNode
 
 from wiktextract.page import clean_node
diff --git a/wiktextract/extractor/de/page.py b/wiktextract/extractor/de/page.py
index 57537790..39d67da4 100644
--- a/wiktextract/extractor/de/page.py
+++ b/wiktextract/extractor/de/page.py
@@ -1,11 +1,9 @@
 import copy
 import logging
-
 from collections import defaultdict
 from typing import Dict, List, Union
 
 from wikitextprocessor import NodeKind, WikiNode
-
 from wikitextprocessor.parser import LevelNode
 
 from wiktextract.datautils import append_base_data
@@ -322,50 +320,8 @@ def parse_page(
     )
 
     page_data = []
-    for node in filter(lambda n: isinstance(n, WikiNode), tree.children):
-        # ignore certain top level templates
-        if node.kind == NodeKind.TEMPLATE:
-            template_name = node.template_name
-
-            # Mostly meta-templates at the top of the page that do not carry
-            # any semantic information
-            IGNORE_TOP_LEVEL_TEMPLATES = {
-                "Wort der Woche",
-                "Siehe auch",
-                "erweitern",
-                "Abschnitte fehlen",
-                "überarbeiten",
-                "Zeichen",
-                "Wortart fehlt",
-                "TOC limit",
-                "Neuer Eintrag",
-                "Löschantrag/Vorlage",
-                "keine Belegstelle/Vorlage",
-                "Anmerkung Keilschrift",
-                "In Arbeit",
-                "Halbgeschützte Seite",
-                "anpassen",
-            }
-
-            if template_name in IGNORE_TOP_LEVEL_TEMPLATES:
-                continue
-
-        # ignore certain top level magic words
-        if node.kind == NodeKind.MAGIC_WORD and node.sarg in {
-            "__TOC__",
-            "__NOTOC__",
-            "__NOEDITSECTION__",
-        }:
-            continue
-
-        if node.kind != NodeKind.LEVEL2:
-            wxr.wtp.warning(
-                f"Unexpected top-level node: {node}",
-                sortid="extractor/de/page/parse_page/61",
-            )
-            continue
-
-        for subtitle_template in node.find_content(NodeKind.TEMPLATE):
+    for level2_node in tree.find_child(NodeKind.LEVEL2):
+        for subtitle_template in level2_node.find_content(NodeKind.TEMPLATE):
             # The language sections are marked with
             # == <title> ({{Sprache|<lang_name>}}) ==
             # where <title> is the title of the page and <lang_name> is the
@@ -389,6 +345,6 @@
                 },
             )
             page_data.append(copy.deepcopy(base_data))
-            parse_section(wxr, page_data, base_data, node.children)
+            parse_section(wxr, page_data, base_data, level2_node.children)
 
     return page_data
diff --git a/wiktextract/extractor/fr/page.py b/wiktextract/extractor/fr/page.py
index d968e3f2..0f797e4a 100644
--- a/wiktextract/extractor/fr/page.py
+++ b/wiktextract/extractor/fr/page.py
@@ -161,22 +161,8 @@
     )
 
     page_data = []
-    for node in filter(lambda n: isinstance(n, WikiNode), tree.children):
-        # ignore link created by `voir` template at the page top
-        if node.kind == NodeKind.TEMPLATE:
-            template_name = node.template_name
-            if template_name in {"voir", "voir2"} or template_name.startswith(
-                "voir/"
-            ):
-                continue
-        if node.kind != NodeKind.LEVEL2:
-            wxr.wtp.warning(
-                f"Unexpected top-level node: {node}",
-                sortid="extractor/fr/page/parse_page/94",
-            )
-            continue
-
-        for subtitle_template in node.find_content(NodeKind.TEMPLATE):
+    for level2_node in tree.find_child(NodeKind.LEVEL2):
+        for subtitle_template in level2_node.find_content(NodeKind.TEMPLATE):
             # https://fr.wiktionary.org/wiki/Modèle:langue
             # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_langues
             if subtitle_template.template_name == "langue":
@@ -197,9 +183,9 @@
                 base_data.update(categories_and_links)
                 page_data.append(copy.deepcopy(base_data))
                 etymology_data: Optional[EtymologyData] = None
-                for level_three_node in node.find_child(NodeKind.LEVEL3):
+                for level3_node in level2_node.find_child(NodeKind.LEVEL3):
                     new_etymology_data = parse_section(
-                        wxr, page_data, base_data, level_three_node
+                        wxr, page_data, base_data, level3_node
                     )
                     if new_etymology_data is not None:
                         etymology_data = new_etymology_data
diff --git a/wiktextract/extractor/zh/headword_line.py b/wiktextract/extractor/zh/headword_line.py
index 24e6516c..223e8630 100644
--- a/wiktextract/extractor/zh/headword_line.py
+++ b/wiktextract/extractor/zh/headword_line.py
@@ -104,9 +104,7 @@ def extract_headword_line(
             for span_child in child.find_html(
                 "strong", attr_name="class", attr_value="headword"
             ):
-                ruby_data, node_without_ruby = extract_ruby(
-                    wxr, span_child
-                )
+                ruby_data, node_without_ruby = extract_ruby(wxr, span_child)
                 page_data[-1]["forms"].append(
                     {
                         "form": clean_node(
@@ -114,7 +112,7 @@
                         ),
                         "ruby": ruby_data,
                         "tags": ["canonical"],
-                        }
+                    }
                 )
             elif child.tag == "b":
                 # this is a form <b> tag, already inside form parentheses
diff --git a/wiktextract/extractor/zh/page.py b/wiktextract/extractor/zh/page.py
index 3f2866fc..3d726d0e 100644
--- a/wiktextract/extractor/zh/page.py
+++ b/wiktextract/extractor/zh/page.py
@@ -264,30 +264,12 @@
     )
 
     page_data = []
-    for node in filter(lambda n: isinstance(n, WikiNode), tree.children):
-        # ignore link created by `also` template at the page top
-        # also ignore "character info" templates
-        if node.kind == NodeKind.TEMPLATE and node.template_name.lower() in {
-            "also",
-            "see also",
-            "亦",
-            "character info",
-            "character info/new",
-            "character info/var",
-        }:
-            continue
-        if node.kind != NodeKind.LEVEL2:
-            wxr.wtp.warning(
-                f"Unexpected top-level node: {node}",
-                sortid="extractor/zh/page/parse_page/503",
-            )
-            continue
-
+    for level2_node in tree.find_child(NodeKind.LEVEL2):
         categories_and_links = defaultdict(list)
-        lang_name = clean_node(wxr, categories_and_links, node.largs)
+        lang_name = clean_node(wxr, categories_and_links, level2_node.largs)
         if lang_name not in wxr.config.LANGUAGES_BY_NAME:
             wxr.wtp.warning(
-                f"Unrecognized language name at top-level {lang_name}",
+                f"Unrecognized language name: {lang_name}",
                 sortid="extractor/zh/page/parse_page/509",
             )
         lang_code = wxr.config.LANGUAGES_BY_NAME.get(lang_name)
@@ -304,6 +286,6 @@
         )
         base_data.update(categories_and_links)
         page_data.append(copy.deepcopy(base_data))
-        parse_section(wxr, page_data, base_data, node.children)
+        parse_section(wxr, page_data, base_data, level2_node.children)
 
     return page_data
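
The common thread in the three `parse_page` changes above is one refactor applied to the de, fr, and zh extractors: instead of walking `tree.children`, filtering out known top-level templates and magic words by hand, and warning about anything that is not a level-2 heading, each extractor now asks the tree for exactly the nodes it wants via `find_child(NodeKind.LEVEL2)`. Below is a minimal sketch of the before/after pattern, assuming a `wikitextprocessor` version whose `WikiNode` provides `find_child` and `find_content` (as this diff uses); the page title and wikitext fixture are invented for illustration:

```python
from wikitextprocessor import NodeKind, WikiNode, Wtp

wtp = Wtp()  # assumption: a bare in-memory instance suffices for parsing
wtp.start_page("Beispiel")  # hypothetical page title
tree = wtp.parse(
    "{{Siehe auch|beispiel}}\n"  # top-level template, formerly skipped by name
    "__TOC__\n"                  # top-level magic word, formerly skipped
    "== Beispiel ({{Sprache|Deutsch}}) ==\n"
    "Section body.\n"
)

# Old pattern: inspect every top-level child and filter by hand.
for node in filter(lambda n: isinstance(n, WikiNode), tree.children):
    if node.kind != NodeKind.LEVEL2:
        # The real extractors also kept per-wiki template skip lists here
        # and emitted a warning for anything unexpected.
        continue
    ...  # handle the language section

# New pattern: the tree yields only LEVEL2 children, so the skip lists,
# magic-word checks, and "unexpected node" warnings become unnecessary.
for level2_node in tree.find_child(NodeKind.LEVEL2):
    for subtitle_template in level2_node.find_content(NodeKind.TEMPLATE):
        print(subtitle_template.template_name)  # -> "Sprache"
```

The tradeoff is observability: the old loops surfaced malformed pages through `wxr.wtp.warning`, while the new ones drop unrecognized top-level nodes silently.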