From 63ce8797b45d452b564c97c6daad2510c8a33e82 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Thu, 28 Nov 2024 11:36:10 +0800 Subject: [PATCH 1/4] [fr] don't extract etymology example section twice --- src/wiktextract/extractor/fr/etymology.py | 5 +---- tests/test_fr_etymology.py | 7 ++++--- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/wiktextract/extractor/fr/etymology.py b/src/wiktextract/extractor/fr/etymology.py index c3a3974a..da58da76 100644 --- a/src/wiktextract/extractor/fr/etymology.py +++ b/src/wiktextract/extractor/fr/etymology.py @@ -34,11 +34,8 @@ def extract_etymology( for node_index, node in level_node.find_child( NodeKind.LIST | LEVEL_KIND_FLAGS, True ): - if node.kind in LEVEL_KIND_FLAGS: + if node.kind in LEVEL_KIND_FLAGS and node_index < level_node_index: level_node_index = node_index - title_text = clean_node(wxr, None, node.largs) - if title_text == "Attestations historiques": - extract_etymology_examples(wxr, node, base_data) elif node.kind == NodeKind.LIST: for etymology_item in node.find_child(NodeKind.LIST_ITEM): etymology_data = find_pos_in_etymology_list(wxr, etymology_item) diff --git a/tests/test_fr_etymology.py b/tests/test_fr_etymology.py index f6ed0164..295323f4 100644 --- a/tests/test_fr_etymology.py +++ b/tests/test_fr_etymology.py @@ -9,6 +9,7 @@ insert_etymology_data, ) from wiktextract.extractor.fr.models import WordEntry +from wiktextract.extractor.fr.page import parse_section from wiktextract.wxr_context import WiktextractContext @@ -293,7 +294,7 @@ def test_etymology_examples(self): word_entry = WordEntry( lang="Français", lang_code="fr", word="autrice", pos="noun" ) - extract_etymology(self.wxr, root, word_entry) + parse_section(self.wxr, [], word_entry, root.children[0]) data = word_entry.model_dump(exclude_defaults=True) self.assertEqual( data["etymology_examples"], @@ -398,7 +399,7 @@ def test_etymology_examples_nested_lists(self): word_entry = WordEntry( lang="Français", lang_code="fr", word="drone", 
pos="noun" ) - extract_etymology(self.wxr, root, word_entry) + parse_section(self.wxr, [], word_entry, root.children[0]) data = word_entry.model_dump(exclude_defaults=True) self.assertEqual( data["etymology_examples"], @@ -423,7 +424,7 @@ def test_etymology_examples_text(self): word_entry = WordEntry( lang="Français", lang_code="fr", word="préavertir", pos="verb" ) - extract_etymology(self.wxr, root, word_entry) + parse_section(self.wxr, [], word_entry, root.children[0]) data = word_entry.model_dump(exclude_defaults=True) self.assertEqual( data["etymology_examples"], From 1eef72d3a1bcaa7c91f6304eae5f6e3a23dba92d Mon Sep 17 00:00:00 2001 From: xxyzz Date: Thu, 28 Nov 2024 14:29:11 +0800 Subject: [PATCH 2/4] =?UTF-8?q?[fr]=20don't=20extract=20"=C3=A9quiv-pour"?= =?UTF-8?q?=20template=20in=20gloss=20list=20as=20tag?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/wiktextract/extractor/fr/form_line.py | 9 ++++--- src/wiktextract/extractor/fr/gloss.py | 23 ++++++++++++---- src/wiktextract/extractor/fr/models.py | 1 + tests/test_fr_gloss.py | 32 +++++++++++++++++++++++ 4 files changed, 57 insertions(+), 8 deletions(-) diff --git a/src/wiktextract/extractor/fr/form_line.py b/src/wiktextract/extractor/fr/form_line.py index 50639e3c..6fa8ad73 100644 --- a/src/wiktextract/extractor/fr/form_line.py +++ b/src/wiktextract/extractor/fr/form_line.py @@ -94,7 +94,7 @@ def extract_form_line( def process_equiv_pour_template( wxr: WiktextractContext, node: TemplateNode, page_data: list[WordEntry] -) -> None: +) -> list[Form]: # equivalent form: https://fr.wiktionary.org/wiki/Modèle:équiv-pour expanded_node = wxr.wtp.parse( wxr.wtp.node_to_wikitext(node), expand_all=True @@ -109,7 +109,7 @@ def process_equiv_pour_template( "une fille": "feminine", "une personne non-binaire": "neuter", } - + forms = [] for child in expanded_node.find_child(NodeKind.ITALIC | NodeKind.HTML): if child.kind == NodeKind.ITALIC: raw_gender_tag = 
clean_node(wxr, None, child).strip("() ") @@ -127,7 +127,10 @@ def process_equiv_pour_template( else: form_data.raw_tags.append(raw_gender_tag) if len(form_data.form) > 0: - page_data[-1].forms.append(form_data) + if len(page_data) > 0: + page_data[-1].forms.append(form_data) + forms.append(form_data) + return forms def process_zh_mot_template( diff --git a/src/wiktextract/extractor/fr/gloss.py b/src/wiktextract/extractor/fr/gloss.py index 1d387077..bfabbe0a 100644 --- a/src/wiktextract/extractor/fr/gloss.py +++ b/src/wiktextract/extractor/fr/gloss.py @@ -1,3 +1,4 @@ +import re from collections import defaultdict from typing import Optional, Union @@ -31,7 +32,10 @@ def extract_gloss( # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Précisions_de_sens tag_indexes = set() for index, gloss_node in enumerate(gloss_nodes): - if isinstance(gloss_node, TemplateNode): + if ( + isinstance(gloss_node, TemplateNode) + and gloss_node.template_name != "équiv-pour" + ): categories_data = defaultdict(list) expanded_text = clean_node(wxr, categories_data, gloss_node) if ( @@ -74,7 +78,7 @@ def extract_gloss( ): note_index = index gloss_text = find_alt_of_form( - wxr, gloss_only_nodes[:note_index], page_data[-1].pos, gloss_data + wxr, gloss_only_nodes[:note_index], page_data[-1], gloss_data ) if "form-of" in page_data[-1].tags: find_form_of_word(wxr, gloss_only_nodes[:note_index], gloss_data) @@ -176,13 +180,14 @@ def process_exemple_template( def find_alt_of_form( wxr: WiktextractContext, - gloss_nodes: list[Union[str, WikiNode]], - pos_type: str, + gloss_nodes: list[str | WikiNode], + word_entry: WordEntry, gloss_data: Sense, ) -> str: """ Return gloss text, remove tag template expanded from "variante *" templates. 
""" + from .form_line import process_equiv_pour_template alt_of = "" filtered_gloss_nodes = [] @@ -216,10 +221,17 @@ def find_alt_of_form( gloss_data.raw_tags.append(raw_tag) else: filtered_gloss_nodes.append(node) + elif ( + isinstance(gloss_node, TemplateNode) + and gloss_node.template_name == "équiv-pour" + ): + for form_data in process_equiv_pour_template(wxr, gloss_node, []): + form_data.sense_index = len(word_entry.senses) + 1 + word_entry.forms.append(form_data) else: filtered_gloss_nodes.append(gloss_node) - if alt_of == "" and pos_type == "typographic variant": + if alt_of == "" and word_entry.pos == "typographic variant": for gloss_node in filter( lambda n: isinstance(n, WikiNode), gloss_nodes ): @@ -236,6 +248,7 @@ def find_alt_of_form( gloss_data.alt_of.append(AltForm(word=alt_of)) gloss_text = clean_node(wxr, gloss_data, filtered_gloss_nodes) + gloss_text = re.sub(r"\s+\.$", ".", gloss_text) brackets = 0 for char in gloss_text: if char == "(": diff --git a/src/wiktextract/extractor/fr/models.py b/src/wiktextract/extractor/fr/models.py index f02f0945..5c75b19b 100644 --- a/src/wiktextract/extractor/fr/models.py +++ b/src/wiktextract/extractor/fr/models.py @@ -40,6 +40,7 @@ class Form(FrenchBaseModel): ) hiragana: str = "" roman: str = "" + sense_index: int = Field(default=0, ge=0) class Sound(FrenchBaseModel): diff --git a/tests/test_fr_gloss.py b/tests/test_fr_gloss.py index 33d61f40..cb75b578 100644 --- a/tests/test_fr_gloss.py +++ b/tests/test_fr_gloss.py @@ -691,3 +691,35 @@ def test_zh_exemple_template_in_list(self): } ], ) + + def test_équiv_pour_in_gloss(self): + self.wxr.wtp.start_page("aumônière") + self.wxr.wtp.add_page( + "Modèle:équiv-pour", + 10, + """''(pour un homme, on dit'' : [[aumônier#fr|aumônier]]'')''""", + ) + root = self.wxr.wtp.parse( + "# gloss {{équiv-pour|un homme|aumônier|lang=fr}}." 
+ ) + page_data = [ + WordEntry( + word="aumônière", + lang_code="fr", + lang="Français", + pos="noun", + ) + ] + extract_gloss(self.wxr, page_data, root.children[0]) + self.assertEqual( + [f.model_dump(exclude_defaults=True) for f in page_data[0].forms], + [ + { + "form": "aumônier", + "tags": ["masculine"], + "source": "form line template 'équiv-pour'", + "sense_index": 1, + } + ], + ) + self.assertEqual(page_data[0].senses[0].glosses, ["gloss."]) From 0fb0f76c331cff83aac09552178a408439c06ffb Mon Sep 17 00:00:00 2001 From: xxyzz Date: Thu, 28 Nov 2024 14:55:42 +0800 Subject: [PATCH 3/4] [fr] extract note data in form line --- src/wiktextract/extractor/fr/etymology.py | 3 +-- src/wiktextract/extractor/fr/form_line.py | 11 +++++++---- src/wiktextract/extractor/fr/gloss.py | 7 +++---- src/wiktextract/extractor/fr/page.py | 6 +++--- src/wiktextract/extractor/fr/tags.py | 18 ++++++++---------- src/wiktextract/extractor/fr/translation.py | 10 ++++------ tests/test_fr_form_line.py | 7 +++++++ 7 files changed, 33 insertions(+), 29 deletions(-) diff --git a/src/wiktextract/extractor/fr/etymology.py b/src/wiktextract/extractor/fr/etymology.py index da58da76..76107585 100644 --- a/src/wiktextract/extractor/fr/etymology.py +++ b/src/wiktextract/extractor/fr/etymology.py @@ -1,6 +1,5 @@ from collections import defaultdict from dataclasses import dataclass, field -from typing import Optional from wikitextprocessor.parser import ( LEVEL_KIND_FLAGS, @@ -85,7 +84,7 @@ def extract_etymology( def find_pos_in_etymology_list( wxr: WiktextractContext, list_item_node: WikiNode -) -> Optional[tuple[str, str, str, list[str]]]: +) -> tuple[str, str, str, list[str]] | None: """ Return tuple of POS id, title, etymology text, categories if the passed list item node starts with italic POS node or POS template, otherwise diff --git a/src/wiktextract/extractor/fr/form_line.py b/src/wiktextract/extractor/fr/form_line.py index 6fa8ad73..b2969ae1 100644 --- 
a/src/wiktextract/extractor/fr/form_line.py +++ b/src/wiktextract/extractor/fr/form_line.py @@ -1,5 +1,3 @@ -from typing import Union - from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode from ...page import clean_node @@ -17,7 +15,7 @@ def extract_form_line( wxr: WiktextractContext, page_data: list[WordEntry], - nodes: list[Union[WikiNode, str]], + nodes: list[WikiNode | str], ) -> None: """ Ligne de forme @@ -32,7 +30,7 @@ def extract_form_line( pre_template_name = "" for index, node in enumerate(nodes): - if isinstance(node, WikiNode) and node.kind == NodeKind.TEMPLATE: + if isinstance(node, TemplateNode): if node.template_name in IGNORE_TEMPLATES: continue elif node.template_name in PRON_TEMPLATES: @@ -56,6 +54,11 @@ def extract_form_line( continue elif node.template_name == "lien pronominal": process_lien_pronominal(wxr, node, page_data) + elif node.template_name == "note": + note = clean_node(wxr, page_data[-1], nodes[index + 1 :]) + if note != "": + page_data[-1].notes.append(note) + break else: raw_tag = clean_node(wxr, page_data[-1], node) expanded_template = wxr.wtp.parse( diff --git a/src/wiktextract/extractor/fr/gloss.py b/src/wiktextract/extractor/fr/gloss.py index bfabbe0a..2a348a18 100644 --- a/src/wiktextract/extractor/fr/gloss.py +++ b/src/wiktextract/extractor/fr/gloss.py @@ -1,6 +1,5 @@ import re from collections import defaultdict -from typing import Optional, Union from wikitextprocessor import NodeKind, TemplateNode, WikiNode @@ -14,7 +13,7 @@ def extract_gloss( wxr: WiktextractContext, page_data: list[WordEntry], list_node: WikiNode, - parent_sense: Optional[Sense] = None, + parent_sense: Sense | None = None, ) -> None: for list_item_node in list_node.find_child(NodeKind.LIST_ITEM): gloss_nodes = list( @@ -144,7 +143,7 @@ def extract_examples( def process_exemple_template( wxr: WiktextractContext, node: TemplateNode, - gloss_data: Optional[Sense], + gloss_data: Sense | None, time: str = "", ) -> Example: # 
https://fr.wiktionary.org/wiki/Modèle:exemple @@ -262,7 +261,7 @@ def find_alt_of_form( def find_form_of_word( wxr: WiktextractContext, - gloss_nodes: list[Union[str, WikiNode]], + gloss_nodes: list[str | WikiNode], gloss_data: Sense, ) -> None: # https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_variantes diff --git a/src/wiktextract/extractor/fr/page.py b/src/wiktextract/extractor/fr/page.py index 8175bf9c..6d2794f9 100644 --- a/src/wiktextract/extractor/fr/page.py +++ b/src/wiktextract/extractor/fr/page.py @@ -1,4 +1,4 @@ -from typing import Any, Optional +from typing import Any from wikitextprocessor.parser import ( LEVEL_KIND_FLAGS, @@ -41,7 +41,7 @@ def parse_section( page_data: list[WordEntry], base_data: WordEntry, level_node: WikiNode, -) -> Optional[EtymologyData]: +) -> EtymologyData | None: etymology_data = None for level_node_template in level_node.find_content(NodeKind.TEMPLATE): if level_node_template.template_name == "S": @@ -230,7 +230,7 @@ def parse_page( pos="unknown", categories=categories.get("categories", []), ) - etymology_data: Optional[EtymologyData] = None + etymology_data: EtymologyData | None = None for level3_node in level2_node.find_child(NodeKind.LEVEL3): new_etymology_data = parse_section( wxr, page_data, base_data, level3_node diff --git a/src/wiktextract/extractor/fr/tags.py b/src/wiktextract/extractor/fr/tags.py index d5f572b3..a2a40ea0 100644 --- a/src/wiktextract/extractor/fr/tags.py +++ b/src/wiktextract/extractor/fr/tags.py @@ -2,12 +2,10 @@ # https://fr.wiktionary.org/wiki/Annexe:Glossaire_grammatical # List of templates: # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles -from typing import Union - from .models import WordEntry # https://en.wikipedia.org/wiki/Grammatical_gender -GENDER_TAGS: dict[str, Union[str, list[str]]] = { +GENDER_TAGS: dict[str, str | list[str]] = { "commun": "common", "féminin": "feminine", "masculin": "masculine", @@ -23,7 +21,7 @@ } # 
https://en.wikipedia.org/wiki/Grammatical_number -NUMBER_TAGS: dict[str, Union[str, list[str]]] = { +NUMBER_TAGS: dict[str, str | list[str]] = { "singulier": "singular", "pluriel": "plural", "duel": "dual", @@ -51,7 +49,7 @@ "volitif": "volitive", } -VERB_FORM_TAGS: dict[str, Union[str, list[str]]] = { +VERB_FORM_TAGS: dict[str, str | list[str]] = { "participe": "participle", "imparfait": "imperfect", "infinitif": "infinitive", @@ -62,7 +60,7 @@ } # https://en.wikipedia.org/wiki/Grammatical_case -CASE_TAGS: dict[str, Union[str, list[str]]] = { +CASE_TAGS: dict[str, str | list[str]] = { "ablatif": "ablative", "accusatif": "accusative", "accusatif génitif": ["accusative", "genitive"], @@ -78,7 +76,7 @@ } # https://en.wikipedia.org/wiki/Grammatical_tense -TENSE_TAGS: dict[str, Union[str, list[str]]] = { +TENSE_TAGS: dict[str, str | list[str]] = { "présent": "present", "passé": "past", "passé simple": "past", @@ -96,7 +94,7 @@ } # https://en.wikipedia.org/wiki/Grammatical_person -PERSON_TAGS: dict[str, Union[str, list[str]]] = { +PERSON_TAGS: dict[str, str | list[str]] = { "1ᵉ personne": "first-person", "1ʳᵉ personne": "first-person", "2ᵉ personne": "second-person", @@ -216,7 +214,7 @@ } # https://en.wikipedia.org/wiki/Voice_(grammar) -VOICE_TAGS: dict[str, Union[str, list[str]]] = { +VOICE_TAGS: dict[str, str | list[str]] = { # https://fr.wiktionary.org/wiki/Modèle:eo-conj "participe actif": ["participle", "active"], "participe passif": ["participle", "passive"], @@ -285,7 +283,7 @@ "imperfectif": "imperfective", # Modèle:imperfectif } -GRAMMATICAL_TAGS: dict[str, Union[str, list[str]]] = { +GRAMMATICAL_TAGS: dict[str, str | list[str]] = { **GENDER_TAGS, **NUMBER_TAGS, **MOOD_TAGS, diff --git a/src/wiktextract/extractor/fr/translation.py b/src/wiktextract/extractor/fr/translation.py index d44ebf7c..08036efb 100644 --- a/src/wiktextract/extractor/fr/translation.py +++ b/src/wiktextract/extractor/fr/translation.py @@ -1,5 +1,3 @@ -from typing import Optional - from 
mediawiki_langcodes import code_to_name from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode @@ -59,8 +57,8 @@ def extract_translation( def process_italic_node( wxr: WiktextractContext, italic_node: WikiNode, - previous_node: Optional[WikiNode], - translation_data: Optional[Translation], + previous_node: WikiNode | None, + translation_data: Translation | None, ) -> None: # add italic text after a "trad" template as a tag tag = clean_node(wxr, None, italic_node) @@ -83,8 +81,8 @@ def process_translation_templates( template_node: TemplateNode, page_data: list[WordEntry], base_translation_data: Translation, - translation_data: Optional[Translation], -) -> Optional[Translation]: + translation_data: Translation | None, +) -> Translation | None: if template_node.template_name == "trad-fin": # ignore translation end template return diff --git a/tests/test_fr_form_line.py b/tests/test_fr_form_line.py index 34536c72..a3376af9 100644 --- a/tests/test_fr_form_line.py +++ b/tests/test_fr_form_line.py @@ -202,3 +202,10 @@ def test_lien_pronominal(self): [f.model_dump(exclude_defaults=True) for f in page_data[-1].forms], [{"form": "se définir", "tags": ["pronominal"]}], ) + + def test_note(self): + self.wxr.wtp.start_page("autaire") + page_data = [WordEntry(word="autaire", lang_code="fr", lang="Français")] + root = self.wxr.wtp.parse("'''autaire''' {{note}} note") + extract_form_line(self.wxr, page_data, root.children) + self.assertEqual(page_data[-1].notes, ["note"]) From f3bc491493d957825b55e553dc4d424ef0bbeacf Mon Sep 17 00:00:00 2001 From: xxyzz Date: Thu, 28 Nov 2024 16:08:12 +0800 Subject: [PATCH 4/4] [fr] don't add tag in "fr-verbe-flexion" template as form this last cell always seems to have the same word as the page title --- src/wiktextract/extractor/fr/inflection.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/wiktextract/extractor/fr/inflection.py b/src/wiktextract/extractor/fr/inflection.py index ec679233..604cc017 100644 --- 
a/src/wiktextract/extractor/fr/inflection.py +++ b/src/wiktextract/extractor/fr/inflection.py @@ -228,6 +228,9 @@ def process_inflection_table( form_data.raw_tags.extend(row_headers) if form_data.form != "": for form in form_data.form.splitlines(): + if form.startswith("(") and form.endswith(")"): + form_data.raw_tags.append(form.strip("()")) + continue new_form_data = form_data.model_copy(deep=True) new_form_data.form = form.removeprefix("ou ") translate_raw_tags(