Merge pull request #945 from xxyzz/it
[it] extract zh and ja example lists, extract translation section
xxyzz authored Dec 12, 2024
2 parents ba9f46d + 76e7557 commit 0ddf970
Showing 8 changed files with 360 additions and 10 deletions.
7 changes: 7 additions & 0 deletions src/wiktextract/data/overrides/it.json
@@ -0,0 +1,7 @@
+{
+    "Template:-trad1-": {
+        "body": "===Traduzione===\n",
+        "namespace_id": 10,
+        "need_pre_expand": true
+    }
+}
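
The override above is what makes the new "Traduzione" handling in page.py reachable: on it.wiktionary the translation block is introduced by the {{-trad1-}} template rather than by a literal heading, and "need_pre_expand": true marks the template for expansion before the section tree is built, so the block ends up under a regular ===Traduzione=== level-3 heading. The sketch below is not part of this commit; the sample wikitext is hypothetical and the plain string replacement stands in for the real pre-expansion machinery, purely to illustrate the intended effect.

# Illustration only: the override body replaces the template call before
# sections are split, so parse_section() later sees a "Traduzione" title.
OVERRIDE_BODY = "===Traduzione===\n"

# Hypothetical snippet of an Italian Wiktionary entry.
raw_wikitext = (
    "== {{-it-}} ==\n"
    "===Sostantivo===\n"
    "# definizione\n"
    "\n"
    "{{-trad1-}}\n"
    "* {{en}}: [[dog]]\n"
)

pre_expanded = raw_wikitext.replace("{{-trad1-}}\n", OVERRIDE_BODY)
print(pre_expanded)  # now contains a literal "===Traduzione===" heading
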
92 changes: 83 additions & 9 deletions src/wiktextract/extractor/it/example.py
@@ -1,24 +1,98 @@
-from wikitextprocessor import NodeKind, WikiNode
+from wikitextprocessor import NodeKind, TemplateNode, WikiNode
 
 from ...page import clean_node
 from ...wxr_context import WiktextractContext
+from ..ruby import extract_ruby
 from .models import Example, Sense
 
 
 def extract_example_list_item(
-    wxr: WiktextractContext, sense: Sense, list_item: WikiNode
+    wxr: WiktextractContext, sense: Sense, list_item: WikiNode, lang_code: str
 ) -> None:
-    example = Example()
-    for node in list_item.children:
-        if isinstance(node, WikiNode):
+    examples = []
+    before_italic = True
+    text_nodes = []
+    roman = ""
+    translation = ""
+    for index, node in enumerate(list_item.children):
+        if (
+            isinstance(node, TemplateNode)
+            and node.template_name == "zh-tradsem"
+        ):
+            examples.extend(extract_zh_tradsem(wxr, node))
+        elif isinstance(node, WikiNode):
             match node.kind:
                 case NodeKind.ITALIC:
-                    example.text = clean_node(wxr, sense, node)
+                    if lang_code in ["zh", "ja"]:
+                        if before_italic:
+                            roman = clean_node(wxr, sense, node)
+                            before_italic = False
+                    else:
+                        examples.append(
+                            Example(text=clean_node(wxr, sense, node))
+                        )
                 case NodeKind.LIST:
                     for tr_list_item in node.find_child(NodeKind.LIST_ITEM):
-                        example.translation = clean_node(
+                        translation = clean_node(
                             wxr, sense, tr_list_item.children
                         )
+                case _ if lang_code in ["zh", "ja"]:
+                    if before_italic:
+                        text_nodes.append(node)
+        elif (
+            isinstance(node, str) and lang_code in ["zh", "ja"] and "-" in node
+        ):
+            translation = clean_node(
+                wxr,
+                sense,
+                wxr.wtp.node_to_wikitext(
+                    [node[node.index("-") + 1 :]]
+                    + list_item.children[index + 1 :]
+                ),
+            )
+            break
+        elif lang_code in ["zh", "ja"] and len(examples) == 0 and before_italic:
+            text_nodes.append(node)
+
+    if lang_code in ["zh", "ja"] and len(examples) == 0 and len(text_nodes) > 0:
+        expanded_nodes = wxr.wtp.parse(
+            wxr.wtp.node_to_wikitext(text_nodes), expand_all=True
+        )
+        example = Example()
+        example.ruby, node_without_ruby = extract_ruby(
+            wxr, expanded_nodes.children
+        )
+        example.text = (
+            clean_node(wxr, sense, node_without_ruby)
+            .replace(" ", "")
+            .strip("(")
+        )
+        examples.append(example)
+
+    for example in examples:
+        if roman != "":
+            example.roman = roman
+        if translation != "":
+            example.translation = translation
+        if example.text != "":
+            sense.examples.append(example)
+
+
+def extract_zh_tradsem(
+    wxr: WiktextractContext, t_node: TemplateNode
+) -> list[Example]:
+    # https://it.wiktionary.org/wiki/Template:zh-tradsem
+    examples = []
+    for arg_index in [1, 2]:
+        arg_value = clean_node(
+            wxr, None, t_node.template_parameters.get(arg_index, "")
+        ).replace(" ", "")
+        if arg_value != "":
+            example = Example(text=arg_value)
+            if arg_index == 1:
+                example.tags.append("Traditional Chinese")
+            elif arg_index == 2:
+                example.tags.append("Simplified Chinese")
+            examples.append(example)
 
-    if example.text != "":
-        sense.examples.append(example)
+    return examples
20 changes: 20 additions & 0 deletions src/wiktextract/extractor/it/models.py
@@ -14,6 +14,12 @@ class Example(ItalianBaseModel):
     text: str = ""
     translation: str = ""
     ref: str = ""
+    ruby: list[tuple[str, ...]] = Field(
+        default=[], description="Japanese Kanji and furigana"
+    )
+    roman: str = ""
+    tags: list[str] = []
+    raw_tags: list[str] = []
 
 
 class Sense(ItalianBaseModel):
@@ -24,6 +30,19 @@ class Sense(ItalianBaseModel):
     examples: list[Example] = []
 
 
+class Translation(ItalianBaseModel):
+    lang_code: str = Field(
+        default="",
+        description="Wiktionary language code of the translation term",
+    )
+    lang: str = Field(default="", description="Translation language name")
+    word: str = Field(default="", description="Translation term")
+    sense: str = Field(default="", description="Translation gloss")
+    tags: list[str] = []
+    raw_tags: list[str] = []
+    roman: str = ""
+
+
 class WordEntry(ItalianBaseModel):
     model_config = ConfigDict(title="Italian Wiktionary")
     word: str = Field(description="Word string", min_length=1)
@@ -35,3 +54,4 @@ class WordEntry(ItalianBaseModel):
     categories: list[str] = []
     tags: list[str] = []
     raw_tags: list[str] = []
+    translations: list[Translation] = []
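
Since these are pydantic models (the Field and ConfigDict usage above comes from pydantic v2), a populated Translation entry serializes like the rest of the package's models. A minimal sketch, not part of the commit, assuming model_dump with exclude_defaults:

from wiktextract.extractor.it.models import Translation

t = Translation(lang="inglese", lang_code="en", word="dog", tags=["masculine"])
# Fields left at their defaults (sense, raw_tags, roman) are dropped,
# leaving roughly:
# {'lang_code': 'en', 'lang': 'inglese', 'word': 'dog', 'tags': ['masculine']}
print(t.model_dump(exclude_defaults=True))
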
3 changes: 3 additions & 0 deletions src/wiktextract/extractor/it/page.py
@@ -7,6 +7,7 @@
 from .models import Sense, WordEntry
 from .pos import extract_pos_section
 from .section_titles import POS_DATA
+from .translation import extract_translation_section
 
 
 def parse_section(
@@ -18,6 +19,8 @@ def parse_section(
     title_text = clean_node(wxr, None, level_node.largs)
     if title_text in POS_DATA:
         extract_pos_section(wxr, page_data, base_data, level_node, title_text)
+    elif title_text == "Traduzione":
+        extract_translation_section(wxr, page_data, level_node)
 
     for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
         parse_section(wxr, page_data, base_data, next_level)
13 changes: 12 additions & 1 deletion src/wiktextract/extractor/it/pos.py
@@ -47,7 +47,18 @@ def extract_gloss_list_item(
         elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
             if node.sarg.endswith("*"):
                 for example_list_item in node.find_child(NodeKind.LIST_ITEM):
-                    extract_example_list_item(wxr, sense, example_list_item)
+                    extract_example_list_item(
+                        wxr, sense, example_list_item, word_entry.lang_code
+                    )
+            elif (
+                node.sarg.endswith(":")
+                and len(sense.examples) > 0
+                and sense.examples[-1].translation == ""
+            ):
+                for tr_list_item in node.find_child(NodeKind.LIST_ITEM):
+                    sense.examples[-1].translation = clean_node(
+                        wxr, sense, tr_list_item.children
+                    )
         else:
             gloss_nodes.append(node)
     gloss_str = clean_node(wxr, sense, gloss_nodes)
85 changes: 85 additions & 0 deletions src/wiktextract/extractor/it/translation.py
@@ -0,0 +1,85 @@
+import re
+
+from mediawiki_langcodes import name_to_code
+from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
+from .models import Translation, WordEntry
+
+
+def extract_translation_section(
+    wxr: WiktextractContext,
+    page_data: list[WordEntry],
+    level_node: LevelNode,
+) -> None:
+    sense = ""
+    translations = []
+    cats = {}
+    for node in level_node.children:
+        if isinstance(node, TemplateNode) and node.template_name == "Trad1":
+            sense = clean_node(wxr, cats, node.template_parameters.get(1, ""))
+        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
+            for list_item in node.find_child(NodeKind.LIST_ITEM):
+                translations.extend(
+                    extract_translation_list_item(wxr, list_item, sense)
+                )
+
+    for data in page_data:
+        if data.lang_code == page_data[-1].lang_code:
+            data.translations.extend(translations)
+            data.categories.extend(cats.get("categories", []))
+
+
+TR_GENDER_TAGS = {
+    "c": "common",
+    "f": "feminine",
+    "m": "masculine",
+    "n": "neuter",
+}
+
+
+def extract_translation_list_item(
+    wxr: WiktextractContext, list_item: WikiNode, sense: str
+) -> list[Translation]:
+    translations = []
+    lang_name = "unknown"
+    lang_code = "unknown"
+    before_colon = True
+    for index, node in enumerate(list_item.children):
+        if before_colon and isinstance(node, str) and ":" in node:
+            before_colon = False
+            lang_name = clean_node(wxr, None, list_item.children[:index])
+            for n in list_item.children[:index]:
+                if isinstance(n, TemplateNode):
+                    lang_code = n.template_name
+                    break
+            if lang_code == "unknown":
+                new_lang_code = name_to_code(lang_name, "it")
+                if new_lang_code != "":
+                    lang_code = new_lang_code
+        elif not before_colon and isinstance(node, WikiNode):
+            match node.kind:
+                case NodeKind.LINK:
+                    word = clean_node(wxr, None, node)
+                    if word != "":
+                        translations.append(
+                            Translation(
+                                word=word,
+                                sense=sense,
+                                lang=lang_name,
+                                lang_code=lang_code,
+                            )
+                        )
+                case NodeKind.ITALIC:
+                    raw_tag = clean_node(wxr, None, node)
+                    if raw_tag in TR_GENDER_TAGS and len(translations) > 0:
+                        translations[-1].tags.append(TR_GENDER_TAGS[raw_tag])
+                    elif raw_tag != "" and len(translations) > 0:
+                        translations[-1].raw_tags.append(raw_tag)
+        elif not before_colon and isinstance(node, str):
+            m = re.search(r"\((.+)\)", node)
+            if m is not None and len(translations) > 0:
+                translations[-1].roman = m.group(1)
+
+    return translations
96 changes: 96 additions & 0 deletions tests/test_it_example.py
@@ -43,3 +43,99 @@ def test_list_example(self):
                 }
             ],
         )
+
+    def test_all_in_one_line(self):
+        self.wxr.wtp.add_page("Template:-zh-", 10, "Cinese")
+        data = parse_page(
+            self.wxr,
+            "幼虫",
+            """== {{-zh-}} ==
+===Sostantivo===
+# larva
+#* [[苍蝇]] [[的]]'''幼虫''' ''cāngyíng de '''yòuchóng''''' - [[larva]] di [[mosca]], [[bigattino]]""",
+        )
+        self.assertEqual(
+            data[0]["senses"],
+            [
+                {
+                    "glosses": ["larva"],
+                    "examples": [
+                        {
+                            "text": "苍蝇的幼虫",
+                            "roman": "cāngyíng de yòuchóng",
+                            "translation": "larva di mosca, bigattino",
+                        }
+                    ],
+                }
+            ],
+        )
+
+    def test_ja_r(self):
+        self.wxr.wtp.add_page("Template:-ja-", 10, "Giapponese")
+        self.wxr.wtp.add_page(
+            "Template:ja-r",
+            10,
+            """{{#switch:{{{1}}}
+| 今 = <span class="Jpan" lang="ja">[[今#Giapponese|<span><ruby>今<rp>&nbsp;(</rp><rt>いま</rt><rp>)</rp></ruby></span>]]</span>
+| 行く = <span class="Jpan" lang="ja">[[行く#Giapponese|<span><ruby>行<rp>&nbsp;(</rp><rt>い</rt><rp>)</rp></ruby>く</span>]]</span>
+| よ = <span class="Jpan" lang="ja">[[よ#Giapponese|<span>よ</span>]]</span>
+}}""",
+        )
+        data = parse_page(
+            self.wxr,
+            "行く",
+            """== {{-ja-}} ==
+===Verbo===
+# andare
+#* {{ja-r|今|いま|rom=-}}'''{{ja-r|行く|いく|rom=-}}'''{{ja-r|よ|rom=-}}! (''ima '''iku''' yo!'')
+#: ''sto '''andando'''!''""",
+        )
+        self.assertEqual(
+            data[0]["senses"],
+            [
+                {
+                    "glosses": ["andare"],
+                    "examples": [
+                        {
+                            "text": "今行くよ!",
+                            "roman": "ima iku yo!",
+                            "translation": "sto andando!",
+                            "ruby": [("今", "いま"), ("行", "い")],
+                        }
+                    ],
+                }
+            ],
+        )
+
+    def test_zh_tradsem(self):
+        self.wxr.wtp.add_page("Template:-zh-", 10, "Cinese")
+        data = parse_page(
+            self.wxr,
+            "可能",
+            """== {{-zh-}} ==
+===Aggettivo===
+# probabile
+#* {{zh-tradsem|[[一]] [[個]]'''可能'''[[的]] [[事件]]|[[一]] [[个]]'''可能'''[[的]] [[事件]]}} ''yī ge '''kěnéng''' de shìjiàn'' - un [[evento]] [[possibile]]""",
+        )
+        self.assertEqual(
+            data[0]["senses"],
+            [
+                {
+                    "glosses": ["probabile"],
+                    "examples": [
+                        {
+                            "text": "一個可能的事件",
+                            "roman": "yī ge kěnéng de shìjiàn",
+                            "translation": "un evento possibile",
+                            "tags": ["Traditional Chinese"],
+                        },
+                        {
+                            "text": "一个可能的事件",
+                            "roman": "yī ge kěnéng de shìjiàn",
+                            "translation": "un evento possibile",
+                            "tags": ["Simplified Chinese"],
+                        },
+                    ],
+                }
+            ],
+        )
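
No test for the new translation code is rendered on this page (only seven of the eight changed files are shown). As a rough, hypothetical sketch (not the commit's actual test), a case in the same style as tests/test_it_example.py could exercise it end to end; the template pages added here ("Template:-it-", "Template:en") and the setUp pattern are assumptions borrowed from the existing Italian extractor tests.

from unittest import TestCase

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.it.page import parse_page
from wiktextract.wxr_context import WiktextractContext


class TestItTranslationSketch(TestCase):
    maxDiff = None

    def setUp(self) -> None:
        # Assumed setUp pattern, mirroring the existing Italian extractor tests.
        self.wxr = WiktextractContext(
            Wtp(lang_code="it"),
            WiktionaryConfig(
                dump_file_lang_code="it", capture_language_codes=None
            ),
        )

    def tearDown(self) -> None:
        self.wxr.wtp.close_db_conn()

    def test_translation_list(self):
        # Hypothetical wikitext: a POS section followed by a literal
        # "Traduzione" heading with one English translation line.
        self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
        self.wxr.wtp.add_page("Template:en", 10, "inglese")
        data = parse_page(
            self.wxr,
            "cane",
            """== {{-it-}} ==
===Sostantivo===
# animale domestico

===Traduzione===
* {{en}}: [[dog]] ''m''""",
        )
        tr = data[0]["translations"][0]
        self.assertEqual(tr["word"], "dog")
        self.assertEqual(tr["lang"], "inglese")
        self.assertEqual(tr["lang_code"], "en")
        self.assertEqual(tr["tags"], ["masculine"])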