Merge pull request #946 from xxyzz/it

[it] extract forms line, etymology, pronunciation sections
tatuylonen · Dec 13, 2024 · 8a39820
2 parents 0ddf970 + e175500
commit 8a39820
Show file tree

Hide file tree

Showing 11 changed files with 394 additions and 3 deletions.
diff --git a/src/wiktextract/data/overrides/it.json b/src/wiktextract/data/overrides/it.json
@@ -3,5 +3,10 @@
     "body": "===Traduzione===\n",
     "namespace_id": 10,
     "need_pre_expand": true
+  },
+  "Template:-ref-": {
+    "body": "===Note / Riferimenti===\n",
+    "namespace_id": 10,
+    "need_pre_expand": true
   }
 }
diff --git a/src/wiktextract/extractor/it/etymology.py b/src/wiktextract/extractor/it/etymology.py
@@ -0,0 +1,47 @@
+from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
+from .models import Example, WordEntry
+
+
+def extract_etymology_section(
+    wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
+) -> None:
+    etymology_texts = []
+    for list_node in level_node.find_child(NodeKind.LIST):
+        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+            e_str = clean_node(wxr, None, list_item.children)
+            if e_str != "":
+                etymology_texts.append(e_str)
+
+    if len(etymology_texts) == 0:
+        e_str = clean_node(
+            wxr, None, list(level_node.invert_find_child(LEVEL_KIND_FLAGS))
+        )
+        if e_str != "":
+            etymology_texts.append(e_str)
+
+    for data in page_data:
+        if data.lang_code == page_data[-1].lang_code:
+            data.etymology_texts.extend(etymology_texts)
+
+
+def extract_citation_section(
+    wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
+) -> None:
+    examples = []
+    for t_node in level_node.find_child(NodeKind.TEMPLATE):
+        if t_node.template_name.lower() == "quote":
+            example = Example()
+            example.text = clean_node(
+                wxr, None, t_node.template_parameters.get(1, "")
+            )
+            example.ref = clean_node(
+                wxr, None, t_node.template_parameters.get(2, "")
+            )
+            if example.text != "":
+                examples.append(example)
+    for data in page_data:
+        if data.lang_code == page_data[-1].lang_code:
+            data.etymology_examples.extend(examples)
diff --git a/src/wiktextract/extractor/it/inflection.py b/src/wiktextract/extractor/it/inflection.py
@@ -0,0 +1,24 @@
+from wikitextprocessor import TemplateNode
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
+from .models import Form, WordEntry
+
+
+def extract_tabs_template(
+    wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
+) -> None:
+    # https://it.wiktionary.org/wiki/Template:Tabs
+    tags = [
+        ["masculine", "singular"],
+        ["masculine", "plural"],
+        ["feminine", "singular"],
+        ["feminine", "plural"],
+    ]
+    for arg_name in range(1, 5):
+        arg_value = clean_node(
+            wxr, None, node.template_parameters.get(arg_name, "")
+        )
+        if arg_value not in ["", wxr.wtp.title]:
+            form = Form(form=arg_value, tags=tags[arg_name - 1])
+            word_entry.forms.append(form)
diff --git a/src/wiktextract/extractor/it/models.py b/src/wiktextract/extractor/it/models.py
@@ -43,6 +43,25 @@ class Translation(ItalianBaseModel):
     roman: str = ""
 
 
+class Form(ItalianBaseModel):
+    form: str = ""
+    tags: list[str] = []
+    raw_tags: list[str] = []
+
+
+class Sound(ItalianBaseModel):
+    ipa: str = Field(default="", description="International Phonetic Alphabet")
+    audio: str = Field(default="", description="Audio file name")
+    wav_url: str = ""
+    oga_url: str = ""
+    ogg_url: str = ""
+    mp3_url: str = ""
+    opus_url: str = ""
+    flac_url: str = ""
+    tags: list[str] = []
+    raw_tags: list[str] = []
+
+
 class WordEntry(ItalianBaseModel):
     model_config = ConfigDict(title="Italian Wiktionary")
     word: str = Field(description="Word string", min_length=1)
@@ -55,3 +74,8 @@ class WordEntry(ItalianBaseModel):
     tags: list[str] = []
     raw_tags: list[str] = []
     translations: list[Translation] = []
+    forms: list[Form] = []
+    etymology_texts: list[str] = []
+    etymology_examples: list[Example] = []
+    hyphenation: str = ""
+    sounds: list[Sound] = []
diff --git a/src/wiktextract/extractor/it/page.py b/src/wiktextract/extractor/it/page.py
@@ -4,9 +4,11 @@
 
 from ...page import clean_node
 from ...wxr_context import WiktextractContext
+from .etymology import extract_citation_section, extract_etymology_section
 from .models import Sense, WordEntry
 from .pos import extract_pos_section
 from .section_titles import POS_DATA
+from .sound import extract_hyphenation_section, extract_pronunciation_section
 from .translation import extract_translation_section
 
 
@@ -21,6 +23,14 @@ def parse_section(
         extract_pos_section(wxr, page_data, base_data, level_node, title_text)
     elif title_text == "Traduzione":
         extract_translation_section(wxr, page_data, level_node)
+    elif title_text == "Etimologia / Derivazione":
+        extract_etymology_section(wxr, page_data, level_node)
+    elif title_text == "Citazione":
+        extract_citation_section(wxr, page_data, level_node)
+    elif title_text == "Sillabazione":
+        extract_hyphenation_section(wxr, page_data, level_node)
+    elif title_text == "Pronuncia":
+        extract_pronunciation_section(wxr, page_data, level_node)
 
     for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
         parse_section(wxr, page_data, base_data, next_level)
@@ -37,6 +47,8 @@ def parse_page(
     for level2_node in tree.find_child(NodeKind.LEVEL2):
         lang_cats = {}
         lang_name = clean_node(wxr, lang_cats, level2_node.largs)
+        if lang_name in ["Altri progetti", "Note / Riferimenti"]:
+            continue
         lang_code = "unknown"
         for lang_template in level2_node.find_content(NodeKind.TEMPLATE):
             lang_code = lang_template.template_name.strip("-")

diff --git a/src/wiktextract/extractor/it/pos.py b/src/wiktextract/extractor/it/pos.py
@@ -5,6 +5,7 @@
 from .example import extract_example_list_item
 from .models import Sense, WordEntry
 from .section_titles import POS_DATA
+from .tag_form_line import extract_tag_form_line_nodes
 
 
 def extract_pos_section(
@@ -22,10 +23,22 @@ def extract_pos_section(
     for link_node in level_node.find_child(NodeKind.LINK):
         clean_node(wxr, page_data[-1], link_node)
 
-    for list_node in level_node.find_child(NodeKind.LIST):
-        if list_node.sarg.startswith("#") and list_node.sarg.endswith("#"):
-            for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+    first_gloss_list_index = len(level_node.children)
+    for index, node in enumerate(level_node.children):
+        if (
+            isinstance(node, WikiNode)
+            and node.kind == NodeKind.LIST
+            and node.sarg.startswith("#")
+            and node.sarg.endswith("#")
+        ):
+            for list_item in node.find_child(NodeKind.LIST_ITEM):
                 extract_gloss_list_item(wxr, page_data[-1], list_item)
+            if index < first_gloss_list_index:
+                first_gloss_list_index = index
+
+    extract_tag_form_line_nodes(
+        wxr, page_data[-1], level_node.children[:first_gloss_list_index]
+    )
 
 
 def extract_gloss_list_item(

diff --git a/src/wiktextract/extractor/it/sound.py b/src/wiktextract/extractor/it/sound.py
@@ -0,0 +1,47 @@
+from wikitextprocessor import LevelNode, NodeKind
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
+from ..share import set_sound_file_url_fields
+from .models import Sound, WordEntry
+
+
+def extract_hyphenation_section(
+    wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
+) -> None:
+    hyphenation = ""
+    for list_node in level_node.find_child(NodeKind.LIST):
+        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+            hyphenation = clean_node(wxr, None, list_item.children)
+    for data in page_data:
+        if data.lang_code == page_data[-1].lang_code:
+            data.hyphenation = hyphenation
+
+
+def extract_pronunciation_section(
+    wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
+) -> None:
+    sounds = []
+    for t_node in level_node.find_child(NodeKind.TEMPLATE):
+        match t_node.template_name.lower():
+            case "ipa":
+                ipa = clean_node(
+                    wxr, None, t_node.template_parameters.get(1, "")
+                )
+                if ipa != "":
+                    sounds.append(Sound(ipa=ipa))
+            case "audio":
+                sound_file = clean_node(
+                    wxr, None, t_node.template_parameters.get(1, "")
+                )
+                if sound_file != "":
+                    if len(sounds) > 0:
+                        set_sound_file_url_fields(wxr, sound_file, sounds[-1])
+                    else:
+                        sound = Sound()
+                        set_sound_file_url_fields(wxr, sound_file, sound)
+                        sounds.append(sound)
+
+    for data in page_data:
+        if data.lang_code == page_data[-1].lang_code:
+            data.sounds.extend(sounds)
diff --git a/src/wiktextract/extractor/it/tag_form_line.py b/src/wiktextract/extractor/it/tag_form_line.py
@@ -0,0 +1,52 @@
+from wikitextprocessor import NodeKind, TemplateNode, WikiNode
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
+from .inflection import extract_tabs_template
+from .models import Form, WordEntry
+
+
+def extract_tag_form_line_nodes(
+    wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode | str]
+) -> None:
+    # https://it.wiktionary.org/wiki/Wikizionario:Manuale_di_stile#Genere_e_numero,_declinazione_o_paradigma
+    for node in nodes:
+        if isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
+            extract_italic_tag_node(wxr, word_entry, node)
+        elif isinstance(node, TemplateNode):
+            match node.template_name.lower():
+                case "tabs":
+                    extract_tabs_template(wxr, word_entry, node)
+                case "linkp":
+                    form = clean_node(
+                        wxr, None, node.template_parameters.get(1, "")
+                    )
+                    if form != "":
+                        word_entry.forms.append(
+                            Form(form=form, tags=["plural"])
+                        )
+
+
+ITALIC_TAGS = {
+    "c": "common",
+    "coll": "collective",
+    "f": "feminine",
+    "m": "masculine",
+    "n": "neuter",
+    "pl": "plural",
+    "sing": "singular",
+    "prom": "common",
+    "inv": "invariable",
+}
+
+
+def extract_italic_tag_node(
+    wxr: WiktextractContext, word_entry: WordEntry, node: WikiNode
+) -> None:
+    # https://it.wiktionary.org/wiki/Wikizionario:Genere
+    italic_str = clean_node(wxr, None, node)
+    for raw_tag in italic_str.split():
+        if raw_tag in ITALIC_TAGS:
+            word_entry.tags.append(ITALIC_TAGS[raw_tag])
+        else:
+            word_entry.raw_tags.append(raw_tag)
diff --git a/tests/test_it_etymology.py b/tests/test_it_etymology.py
@@ -0,0 +1,62 @@
+from unittest import TestCase
+
+from wikitextprocessor import Wtp
+
+from wiktextract.config import WiktionaryConfig
+from wiktextract.extractor.it.page import parse_page
+from wiktextract.wxr_context import WiktextractContext
+
+
+class TestItGloss(TestCase):
+    maxDiff = None
+
+    def setUp(self) -> None:
+        self.wxr = WiktextractContext(
+            Wtp(lang_code="it"),
+            WiktionaryConfig(
+                dump_file_lang_code="it", capture_language_codes=None
+            ),
+        )
+
+    def test_quote_template(self):
+        self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
+        data = parse_page(
+            self.wxr,
+            "cane",
+            """== {{-it-}} ==
+===Sostantivo===
+# {{Term|mammalogia|it}} [[animale]]
+===Etimologia / Derivazione===
+dal latino canis
+====Citazione====
+{{Quote
+|Cane affamato non teme bastone
+|[[q:Giovanni Verga|Giovanni Verga]]}}""",
+        )
+        self.assertEqual(data[0]["etymology_texts"], ["dal latino canis"])
+        self.assertEqual(
+            data[0]["etymology_examples"],
+            [
+                {
+                    "text": "Cane affamato non teme bastone",
+                    "ref": "Giovanni Verga",
+                }
+            ],
+        )
+
+    def test_list(self):
+        self.wxr.wtp.add_page("Template:-la-", 10, "Latino")
+        data = parse_page(
+            self.wxr,
+            "cane",
+            """== {{-it-}} ==
+===Sostantivo, forma flessa===
+# {{Term|mammalogia|it}} [[animale]]
+===Etimologia / Derivazione===
+* (sostantivo) vedi [[canis#Latino|canis]]
+* (voce verbale) vedi [[cano#Latino|canō]]""",
+        )
+        self.assertEqual(
+            data[0]["etymology_texts"],
+            ["(sostantivo) vedi canis", "(voce verbale) vedi canō"],
+        )