From f703feec36504db4338673e4e40e5c6cfc05d065 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Tue, 26 Sep 2023 12:06:16 +0800 Subject: [PATCH] Combine separated synonyms tag string and template; also ignore empty string tag --- tests/test_fr_linkage.py | 25 ++++++++++++++++++++++-- wiktextract/extractor/fr/form_line.py | 2 +- wiktextract/extractor/fr/linkage.py | 28 +++++++++++++++++++++++---- 3 files changed, 48 insertions(+), 7 deletions(-) diff --git a/tests/test_fr_linkage.py b/tests/test_fr_linkage.py index 8ec6e988..c48ba3d2 100644 --- a/tests/test_fr_linkage.py +++ b/tests/test_fr_linkage.py @@ -23,7 +23,7 @@ def tearDown(self) -> None: def test_tags(self): page_data = [defaultdict(list)] - self.wxr.wtp.start_page("") + self.wxr.wtp.start_page("bonjour") self.wxr.wtp.add_page("Modèle:Canada", 10, body="(Canada)") self.wxr.wtp.add_page("Modèle:Louisiane", 10, body="(Louisiane)") root = self.wxr.wtp.parse( @@ -43,7 +43,7 @@ def test_tags(self): def test_zh_synonyms(self): page_data = [defaultdict(list)] - self.wxr.wtp.start_page("") + self.wxr.wtp.start_page("你好") root = self.wxr.wtp.parse( "==== {{S|synonymes}} ====\n* {{zh-lien|你们好|nǐmen hǎo|你們好}} — Bonjour (au pluriel)." 
) @@ -63,3 +63,24 @@ def test_zh_synonyms(self): } ], ) + + def test_template_as_partial_tag(self): + page_data = [defaultdict(list)] + self.wxr.wtp.start_page("bonjour") + self.wxr.wtp.add_page("Modèle:lien", 10, body="kwei") + self.wxr.wtp.add_page("Modèle:Canada", 10, body="(Canada)") + self.wxr.wtp.add_page("Modèle:L", 10, body="Atikamekw") + root = self.wxr.wtp.parse( + "==== {{S|synonymes}} ====\n* {{lien|kwei|fr}} {{Canada|nocat=1}} (mot {{L|atj}})" + ) + extract_linkage(self.wxr, page_data, root.children[0], "synonyms") + self.assertEqual( + page_data, + [ + { + "synonyms": [ + {"word": "kwei", "tags": ["Canada", "mot Atikamekw"]} + ] + } + ], + ) diff --git a/wiktextract/extractor/fr/form_line.py b/wiktextract/extractor/fr/form_line.py index 238b61cb..80644cb8 100644 --- a/wiktextract/extractor/fr/form_line.py +++ b/wiktextract/extractor/fr/form_line.py @@ -45,7 +45,7 @@ def extract_form_line( ): # it's the location of the previous IPA template page_data[-1]["sounds"][-1]["tags"].append(tag.strip("()")) - else: + elif len(tag.strip("()")) > 0: page_data[-1]["tags"].append(tag.strip("()")) pre_template_name = node.template_name diff --git a/wiktextract/extractor/fr/linkage.py b/wiktextract/extractor/fr/linkage.py index ba445469..d512dd7b 100644 --- a/wiktextract/extractor/fr/linkage.py +++ b/wiktextract/extractor/fr/linkage.py @@ -16,6 +16,7 @@ def extract_linkage( ) -> None: for list_item_node in level_node.find_child_recursively(NodeKind.LIST_ITEM): linkage_data = defaultdict(list) + pending_tag = "" for index, child_node in enumerate( list_item_node.filter_empty_str_child() ): @@ -28,10 +29,29 @@ def extract_linkage( else: linkage_data["word"] = clean_node(wxr, None, child_node) else: - tag = clean_node(wxr, page_data[-1], child_node).strip("()") + tag = ( + child_node + if isinstance(child_node, str) + else clean_node(wxr, page_data[-1], child_node) + ) + if tag.strip().startswith("(") and not tag.strip().endswith( + ")" + ): + pending_tag = tag + 
continue + elif not tag.strip().startswith("(") and tag.strip().endswith( + ")" + ): + tag = pending_tag + tag + pending_tag = "" + elif len(pending_tag) > 0: + pending_tag += tag + continue + + tag = tag.strip("() \n") if tag.startswith("— "): linkage_data["translation"] = tag.removeprefix("— ") - else: + elif len(tag) > 0: linkage_data["tags"].append(tag) page_data[-1][linkage_type].append(linkage_data) @@ -53,8 +73,8 @@ def process_lien_template( node: TemplateNode, linkage_data: Dict[str, Union[str, List[str]]], ) -> None: - # https://fr.wiktionary.org/wiki/Modèle:lien - if "dif" in node.template_parameters: + # link word template: https://fr.wiktionary.org/wiki/Modèle:lien + if "dif" in node.template_parameters: # displayed word word = clean_node(wxr, None, node.template_parameters.get("dif")) else: word = clean_node(wxr, None, node.template_parameters.get(1))