Merge pull request #348 from xxyzz/fr

Some fixes for French Wiktionary
tatuylonen · Sep 26, 2023 · 91ae1d0 · 91ae1d0
2 parents e0b24ef + ef2ed10
commit 91ae1d0
Show file tree

Hide file tree

Showing 8 changed files with 124 additions and 41 deletions.
diff --git a/tests/test_fr_inflection.py b/tests/test_fr_inflection.py
@@ -2,7 +2,8 @@
 from collections import defaultdict
 from unittest.mock import patch
 
-from wikitextprocessor import NodeKind, WikiNode, Wtp
+from wikitextprocessor import Wtp
+from wikitextprocessor.parser import TemplateNode
 
 from wiktextract.config import WiktionaryConfig
 from wiktextract.extractor.fr.inflection import extract_inflection
@@ -37,7 +38,7 @@ def tearDown(self) -> None:
     )
     def test_fr_reg(self, mock_node_to_wikitext):
         page_data = [defaultdict(list, {"word": "productrice"})]
-        node = WikiNode(NodeKind.TEMPLATE, 0)
+        node = TemplateNode(0)
         self.wxr.wtp.start_page("productrice")
         extract_inflection(self.wxr, page_data, node, "fr-rég")
         self.assertEqual(
@@ -47,25 +48,25 @@ def test_fr_reg(self, mock_node_to_wikitext):
 
     @patch(
         "wikitextprocessor.Wtp.node_to_wikitext",
-        return_value="""{|
+        return_value="""{|class="flextable flextable-fr-mfsp"
 |-
-|class='invisible'|
-!scope='col'| Singulier
-!scope='col'| Pluriel
-|- class='flextable-fr-m'
-!scope='row'| Masculin
+!scope="col"| Singulier
+!scope="col"| Pluriel
+|- class="flextable-fr-m"
+!scope="row"| Masculin
 |[[animal]]<br>[[Annexe:Prononciation/français|<span>\\a.ni.mal\\</span>]]
 |[[animaux]]<br>[[Annexe:Prononciation/français|<span>\\a.ni.mo\\</span>]]
-|- class='flextable-fr-f'
-!scope='row'| Féminin
+|- class="flextable-fr-f"
+!scope="row"| Féminin
 |[[animale]]<br>[[Annexe:Prononciation/français|<span>\\a.ni.mal\\</span>]]
 |[[animales]]<br>[[Annexe:Prononciation/français|<span>\\a.ni.mal\\</span>]]
 |}""",
     )
     def test_fr_accord_al(self, mock_node_to_wikitext):
         # https://fr.wiktionary.org/wiki/animal#Adjectif
+        self.maxDiff = None
         page_data = [defaultdict(list, {"word": "animal", "lang_code": "fr"})]
-        node = WikiNode(NodeKind.TEMPLATE, 0)
+        node = TemplateNode(0)
         self.wxr.wtp.start_page("animal")
         extract_inflection(self.wxr, page_data, node, "fr-accord-al")
         self.assertEqual(
@@ -101,7 +102,7 @@ def test_fr_accord_al(self, mock_node_to_wikitext):
     def test_multiple_lines_ipa(self, mock_node_to_wikitext):
         # https://fr.wiktionary.org/wiki/ration#Nom_commun_2
         page_data = [defaultdict(list, {"lang_code": "en", "word": "ration"})]
-        node = WikiNode(NodeKind.TEMPLATE, 0)
+        node = TemplateNode(0)
         self.wxr.wtp.start_page("ration")
         extract_inflection(self.wxr, page_data, node, "en-nom-rég")
         self.assertEqual(
@@ -128,7 +129,7 @@ def test_multiple_lines_ipa(self, mock_node_to_wikitext):
     def test_single_line_multiple_ipa(self, mock_node_to_wikitext):
         # https://fr.wiktionary.org/wiki/ration#Verbe
         page_data = [defaultdict(list, {"lang_code": "en", "word": "ration"})]
-        node = WikiNode(NodeKind.TEMPLATE, 0)
+        node = TemplateNode(0)
         self.wxr.wtp.start_page("ration")
         extract_inflection(self.wxr, page_data, node, "en-conj-rég")
         self.assertEqual(
@@ -155,10 +156,32 @@ def test_single_line_multiple_ipa(self, mock_node_to_wikitext):
     def test_invalid_ipa(self, mock_node_to_wikitext):
         # https://fr.wiktionary.org/wiki/animal#Nom_commun_3
         page_data = [defaultdict(list, {"lang_code": "en", "word": "animal"})]
-        node = WikiNode(NodeKind.TEMPLATE, 0)
+        node = TemplateNode(0)
         self.wxr.wtp.start_page("animal")
         extract_inflection(self.wxr, page_data, node, "ast-accord-mf")
         self.assertEqual(
             page_data[-1].get("forms"),
             [{"tags": ["Pluriel"], "form": "animales"}],
         )
+
+    @patch(
+        "wikitextprocessor.Wtp.node_to_wikitext",
+        return_value="""{| class="flextable"
+|-
+! Simplifié
+| <bdi lang="zh-Hans" xml:lang="zh-Hans" class="lang-zh-Hans">[[一万#zh|一万]]</bdi>
+|-
+! Traditionnel
+| <bdi lang="zh-Hant" xml:lang="zh-Hant" class="lang-zh-Hant">[[一萬#zh|一萬]]</bdi>
+|}""",
+    )
+    def test_no_column_headers(self, mock_node_to_wikitext):
+        # https://fr.wiktionary.org/wiki/一万#Nom_commun
+        page_data = [defaultdict(list, {"lang_code": "zh", "word": "一万"})]
+        node = TemplateNode(0)
+        self.wxr.wtp.start_page("一万")
+        extract_inflection(self.wxr, page_data, node, "zh-formes")
+        self.assertEqual(
+            page_data[-1].get("forms"),
+            [{"tags": ["Traditionnel"], "form": "一萬"}],
+        )
diff --git a/tests/test_fr_linkage.py b/tests/test_fr_linkage.py
@@ -84,3 +84,22 @@ def test_template_as_partial_tag(self):
                 }
             ],
         )
+
+    def test_list_item_has_two_words(self):
+        page_data = [defaultdict(list)]
+        self.wxr.wtp.start_page("masse")
+        root = self.wxr.wtp.parse(
+            "==== {{S|dérivés}} ====\n* [[être à la masse]], [[mettre à la masse]]"
+        )
+        extract_linkage(self.wxr, page_data, root.children[0], "derived")
+        self.assertEqual(
+            page_data,
+            [
+                {
+                    "derived": [
+                        {"word": "être à la masse"},
+                        {"word": "mettre à la masse"},
+                    ]
+                }
+            ],
+        )
diff --git a/tests/test_fr_translation.py b/tests/test_fr_translation.py
@@ -70,7 +70,7 @@ def test_template_tag(self):
                         {
                             "code": "ar",
                             "lang": "Arabe",
-                            "word": "مرحبا",
+                            "word": "مرحبًا",
                             "roman": "mrḥbā",
                             "tags": ["Informel"],
                         },
@@ -128,3 +128,31 @@ def test_trad_template_gender_parameter(self):
                 }
             ],
         )
+
+    def test_template_sense_parameter(self):
+        self.wxr.wtp.start_page("masse")
+        self.wxr.wtp.add_page("Modèle:info lex", 10, body="(Finance)")
+        self.wxr.wtp.add_page("Modèle:T", 10, body="Croate")
+        self.wxr.wtp.add_page("Modèle:trad+", 10, body="masa")
+        root = self.wxr.wtp.parse(
+            """=== Traductions ===
+{{trad-début|{{info lex|finance}}|12}}
+* {{T|hr}} : {{trad+|hr|masa}}"""
+        )
+        page_data = [defaultdict(list)]
+        extract_translation(self.wxr, page_data, root.children[0])
+        self.assertEqual(
+            page_data,
+            [
+                {
+                    "translations": [
+                        {
+                            "code": "hr",
+                            "lang": "Croate",
+                            "word": "masa",
+                            "sense": "(Finance)",
+                        },
+                    ]
+                }
+            ],
+        )
diff --git a/wiktextract/extractor/fr/etymology.py b/wiktextract/extractor/fr/etymology.py
@@ -69,7 +69,7 @@ def find_pos_in_etymology_list(
         if (
             index == 0
             and isinstance(node, TemplateNode)
-            and node.template_name == "lien-ancre-étym"
+            and node.template_name in ("lien-ancre-étym", "laé")
         ):
             return clean_node(wxr, None, node).strip("()"), clean_node(
                 wxr, None, child_nodes[index + 1 :]

diff --git a/wiktextract/extractor/fr/inflection.py b/wiktextract/extractor/fr/inflection.py
@@ -46,23 +46,29 @@ def process_inflection_table(
     for row_num, table_row in enumerate(
         table_node.find_child(NodeKind.TABLE_ROW)
     ):
-        if (
-            row_num != 0
-            and len(list(table_row.filter_empty_str_child()))
-            == len(column_headers) + 1
-        ):
+        table_row_nodes = list(table_row.filter_empty_str_child())
+        first_row_has_data_cell = False
+        if row_num == 0:
+            first_row_has_data_cell = not any(
+                isinstance(cell, WikiNode)
+                and cell.kind == NodeKind.TABLE_CELL
+                for cell in table_row_nodes
+            )
+
+        if row_num != 0 and len(table_row_nodes) == len(column_headers) + 1:
             # data row has one more column than header: "fr-accord-al" template
             column_headers.insert(0, "")
 
         row_header = ""
-        for column_num, table_cell in enumerate(
-            table_row.filter_empty_str_child()
-        ):
+        for column_num, table_cell in enumerate(table_row_nodes):
             form_data = defaultdict(list)
             if isinstance(table_cell, WikiNode):
                 if table_cell.kind == NodeKind.TABLE_HEADER_CELL:
                     table_header_text = clean_node(wxr, None, table_cell)
-                    if row_num == 0:
+                    if row_num == 0 and first_row_has_data_cell:
+                        # despite its name, this flag is True only when the
+                        # first row has no data cell, i.e. its header cells
+                        # are column headers rather than row headers
                         column_headers.append(table_header_text)
                     elif (
                         column_num == 0

diff --git a/wiktextract/extractor/fr/linkage.py b/wiktextract/extractor/fr/linkage.py
@@ -20,11 +20,8 @@ def extract_linkage(
         for index, child_node in enumerate(
             list_item_node.filter_empty_str_child()
         ):
-            if index == 0:
-                if (
-                    isinstance(child_node, WikiNode)
-                    and child_node.kind == NodeKind.TEMPLATE
-                ):
+            if index == 0 or "word" not in linkage_data:
+                if isinstance(child_node, TemplateNode):
                     process_linkage_template(wxr, child_node, linkage_data)
                 else:
                     linkage_data["word"] = clean_node(wxr, None, child_node)
@@ -44,6 +41,11 @@ def extract_linkage(
                 ):
                     tag = pending_tag + tag
                     pending_tag = ""
+                elif tag.strip() == ",":
+                    # list item has more than one word
+                    page_data[-1][linkage_type].append(linkage_data)
+                    linkage_data = defaultdict(list)
+                    continue
                 elif len(pending_tag) > 0:
                     pending_tag += tag
                     continue
@@ -74,11 +76,11 @@ def process_lien_template(
     linkage_data: Dict[str, Union[str, List[str]]],
 ) -> None:
     # link word template: https://fr.wiktionary.org/wiki/Modèle:lien
-    if "dif" in node.template_parameters:  # displayed word
-        word = clean_node(wxr, None, node.template_parameters.get("dif"))
-    else:
-        word = clean_node(wxr, None, node.template_parameters.get(1))
-
+    word = clean_node(
+        wxr,
+        None,
+        node.template_parameters.get("dif", node.template_parameters.get(1)),
+    )
     linkage_data["word"] = word
     if "tr" in node.template_parameters:
         linkage_data["roman"] = clean_node(

diff --git a/wiktextract/extractor/fr/page.py b/wiktextract/extractor/fr/page.py
@@ -120,6 +120,9 @@ def process_pos_block(
                     process_exemple_template(
                         wxr, child, page_data[-1]["senses"][-1]
                     )
+                elif template_name.startswith(("zh-mot", "ja-mot")):
+                    # skip form line templates
+                    continue
                 elif template_name.startswith(f"{lang_code}-"):
                     extract_inflection(wxr, page_data, child, template_name)
             elif child.kind == NodeKind.BOLD:

diff --git a/wiktextract/extractor/fr/translation.py b/wiktextract/extractor/fr/translation.py
@@ -70,12 +70,10 @@ def process_translation_templates(
         return
     elif template_node.template_name == "trad-début":
         # translation box start: https://fr.wiktionary.org/wiki/Modèle:trad-début
-        translation_sense_wikitext = template_node.template_parameters.get(
-            1, ""
-        )
-        if len(translation_sense_wikitext) > 0:
+        sense_parameter = template_node.template_parameters.get(1)
+        if sense_parameter is not None:
             base_translation_data["sense"] = clean_node(
-                wxr, None, translation_sense_wikitext
+                wxr, None, sense_parameter
             )
     elif template_node.template_name == "T":
         # Translation language: https://fr.wiktionary.org/wiki/Modèle:T
@@ -86,7 +84,11 @@ def process_translation_templates(
     elif template_node.template_name.startswith("trad"):
         # Translation term: https://fr.wiktionary.org/wiki/Modèle:trad
         translation_term = clean_node(
-            wxr, None, template_node.template_parameters.get(2)
+            wxr,
+            None,
+            template_node.template_parameters.get(
+                "dif", template_node.template_parameters.get(2)
+            ),
         )
         translation_roman = clean_node(
             wxr,