Skip to content

Commit

Permalink
Merge pull request #353 from xxyzz/fr
Browse files: browse the repository at this point in the history
Simplify `parse_page()` in extractor code
  • Loading branch information
xxyzz authored Oct 7, 2023
2 parents (ca9e913 + bcb8525), commit ffab519
Show file tree
Hide file tree
Showing 7 changed files with 19 additions and 97 deletions.
6 changes: 4 additions & 2 deletions .github/workflows/test.yml
Original file line number | Diff line number | Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.10', '3.11']
python-version: ['3.9', '3.10', '3.11', '3.12']
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v4
Expand All @@ -31,10 +31,12 @@ jobs:
python -m pip install -U pip
python -m pip install --use-pep517 '.[dev]'
- run: make test
env:
PYTHONWARNINGS: default
- name: Remove huge file taxondata_py.html
run: rm -f htmlcov/*_taxondata_py.html
- uses: actions/upload-pages-artifact@v2
if: github.ref_name == 'master' && matrix.python-version == '3.11'
if: github.ref_name == 'master' && matrix.python-version == '3.12'
with:
path: htmlcov

Expand Down
3 changes: 1 addition & 2 deletions tests/test_de_page.py
Original file line number | Diff line number | Diff line change
@@ -1,16 +1,15 @@
# Tests for parsing a page from the German Wiktionary

import unittest

from collections import defaultdict

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.de.page import (
fix_level_hierarchy_of_subsections,
parse_page,
parse_section,
fix_level_hierarchy_of_subsections,
)
from wiktextract.thesaurus import close_thesaurus_db
from wiktextract.wxr_context import WiktextractContext
Expand Down
3 changes: 1 addition & 2 deletions wiktextract/extractor/de/gloss.py
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,7 @@
import re
from collections import defaultdict
from typing import Dict, List

import re

from wikitextprocessor import NodeKind, WikiNode

from wiktextract.page import clean_node
Expand Down
50 changes: 3 additions & 47 deletions wiktextract/extractor/de/page.py
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,9 @@
import copy
import logging

from collections import defaultdict
from typing import Dict, List, Union

from wikitextprocessor import NodeKind, WikiNode

from wikitextprocessor.parser import LevelNode

from wiktextract.datautils import append_base_data
Expand Down Expand Up @@ -322,50 +320,8 @@ def parse_page(
)

page_data = []
for node in filter(lambda n: isinstance(n, WikiNode), tree.children):
# ignore certain top level templates
if node.kind == NodeKind.TEMPLATE:
template_name = node.template_name

# Mostly meta-templates at the top of the page that do not carry
# any semantic information
IGNORE_TOP_LEVEL_TEMPLATES = {
"Wort der Woche",
"Siehe auch",
"erweitern",
"Abschnitte fehlen",
"überarbeiten",
"Zeichen",
"Wortart fehlt",
"TOC limit",
"Neuer Eintrag",
"Löschantrag/Vorlage",
"keine Belegstelle/Vorlage",
"Anmerkung Keilschrift",
"In Arbeit",
"Halbgeschützte Seite",
"anpassen",
}

if template_name in IGNORE_TOP_LEVEL_TEMPLATES:
continue

# ignore certain top level magic words
if node.kind == NodeKind.MAGIC_WORD and node.sarg in {
"__TOC__",
"__NOTOC__",
"__NOEDITSECTION__",
}:
continue

if node.kind != NodeKind.LEVEL2:
wxr.wtp.warning(
f"Unexpected top-level node: {node}",
sortid="extractor/de/page/parse_page/61",
)
continue

for subtitle_template in node.find_content(NodeKind.TEMPLATE):
for level2_node in tree.find_child(NodeKind.LEVEL2):
for subtitle_template in level2_node.find_content(NodeKind.TEMPLATE):
# The language sections are marked with
# == <title> ({{Sprache|<lang_name>}}) ==
# where <title> is the title of the page and <lang_name> is the
Expand All @@ -389,6 +345,6 @@ def parse_page(
},
)
page_data.append(copy.deepcopy(base_data))
parse_section(wxr, page_data, base_data, node.children)
parse_section(wxr, page_data, base_data, level2_node.children)

return page_data
22 changes: 4 additions & 18 deletions wiktextract/extractor/fr/page.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -161,22 +161,8 @@ def parse_page(
)

page_data = []
for node in filter(lambda n: isinstance(n, WikiNode), tree.children):
# ignore link created by `voir` template at the page top
if node.kind == NodeKind.TEMPLATE:
template_name = node.template_name
if template_name in {"voir", "voir2"} or template_name.startswith(
"voir/"
):
continue
if node.kind != NodeKind.LEVEL2:
wxr.wtp.warning(
f"Unexpected top-level node: {node}",
sortid="extractor/fr/page/parse_page/94",
)
continue

for subtitle_template in node.find_content(NodeKind.TEMPLATE):
for level2_node in tree.find_child(NodeKind.LEVEL2):
for subtitle_template in level2_node.find_content(NodeKind.TEMPLATE):
# https://fr.wiktionary.org/wiki/Modèle:langue
# https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_langues
if subtitle_template.template_name == "langue":
Expand All @@ -197,9 +183,9 @@ def parse_page(
base_data.update(categories_and_links)
page_data.append(copy.deepcopy(base_data))
etymology_data: Optional[EtymologyData] = None
for level_three_node in node.find_child(NodeKind.LEVEL3):
for level3_node in level2_node.find_child(NodeKind.LEVEL3):
new_etymology_data = parse_section(
wxr, page_data, base_data, level_three_node
wxr, page_data, base_data, level3_node
)
if new_etymology_data is not None:
etymology_data = new_etymology_data
Expand Down
6 changes: 2 additions & 4 deletions wiktextract/extractor/zh/headword_line.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -104,17 +104,15 @@ def extract_headword_line(
for span_child in child.find_html(
"strong", attr_name="class", attr_value="headword"
):
ruby_data, node_without_ruby = extract_ruby(
wxr, span_child
)
ruby_data, node_without_ruby = extract_ruby(wxr, span_child)
page_data[-1]["forms"].append(
{
"form": clean_node(
wxr, page_data[-1], node_without_ruby
),
"ruby": ruby_data,
"tags": ["canonical"],
}
}
)
elif child.tag == "b":
# this is a form <b> tag, already inside form parentheses
Expand Down
26 changes: 4 additions & 22 deletions wiktextract/extractor/zh/page.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -264,30 +264,12 @@ def parse_page(
)

page_data = []
for node in filter(lambda n: isinstance(n, WikiNode), tree.children):
# ignore link created by `also` template at the page top
# also ignore "character info" templates
if node.kind == NodeKind.TEMPLATE and node.template_name.lower() in {
"also",
"see also",
"亦",
"character info",
"character info/new",
"character info/var",
}:
continue
if node.kind != NodeKind.LEVEL2:
wxr.wtp.warning(
f"Unexpected top-level node: {node}",
sortid="extractor/zh/page/parse_page/503",
)
continue

for level2_node in tree.find_child(NodeKind.LEVEL2):
categories_and_links = defaultdict(list)
lang_name = clean_node(wxr, categories_and_links, node.largs)
lang_name = clean_node(wxr, categories_and_links, level2_node.largs)
if lang_name not in wxr.config.LANGUAGES_BY_NAME:
wxr.wtp.warning(
f"Unrecognized language name at top-level {lang_name}",
f"Unrecognized language name: {lang_name}",
sortid="extractor/zh/page/parse_page/509",
)
lang_code = wxr.config.LANGUAGES_BY_NAME.get(lang_name)
Expand All @@ -304,6 +286,6 @@ def parse_page(
)
base_data.update(categories_and_links)
page_data.append(copy.deepcopy(base_data))
parse_section(wxr, page_data, base_data, node.children)
parse_section(wxr, page_data, base_data, level2_node.children)

return page_data

0 comments on commit ffab519

Please sign in to comment.