[de] extract nested sound lists

tatuylonen · Dec 2, 2024 · 27dc26a · 27dc26a
1 parent 95d2be1
commit 27dc26a
Show file tree

Hide file tree

Showing 7 changed files with 122 additions and 390 deletions.
diff --git a/src/wiktextract/extractor/de/models.py b/src/wiktextract/extractor/de/models.py
@@ -105,28 +105,16 @@ class Sense(BaseModelWrap):
 
 class Sound(BaseModelWrap):
     ipa: str = Field(default="", description="International Phonetic Alphabet")
-    # phonetic_transcription: list[str] = Field(
-    #     default=[], description="Phonetic transcription, less exact than IPA."
-    # )
     audio: str = Field(default="", description="Audio file name")
     wav_url: str = Field(default="")
     ogg_url: str = Field(default="")
     mp3_url: str = Field(default="")
     oga_url: str = Field(default="")
     flac_url: str = Field(default="")
-    lang_code: str = Field(default="", description="Wiktionary language code")
-    lang: str = Field(default="", description="Localized language name")
-    # roman: list[str] = Field(
-    #     default=[], description="Translitaration to Roman characters"
-    # )
-    # syllabic: list[str] = Field(
-    #     default=[], description="Syllabic transcription"
-    # )
-    raw_tags: list[str] = Field(
-        default=[], description="Specifying the variant of the pronunciation"
-    )
+    raw_tags: list[str] = []
     tags: list[str] = []
     rhymes: str = ""
+    categories: list[str] = Field(default=[], exclude=True)
 
 
 class Form(BaseModelWrap):

diff --git a/src/wiktextract/extractor/de/page.py b/src/wiktextract/extractor/de/page.py
@@ -13,7 +13,7 @@
 from .inflection import extract_inf_table_template
 from .linkage import extract_linkages
 from .models import Sense, WordEntry
-from .pronunciation import extract_pronunciation
+from .pronunciation import extract_pronunciation_section
 from .section_titles import FORM_TITLES, LINKAGE_TITLES, POS_SECTIONS
 from .translation import extract_translation
 
@@ -43,7 +43,7 @@ def parse_section(
                 level_node,
             )
         elif wxr.config.capture_pronunciation and section_name == "Aussprache":
-            extract_pronunciation(
+            extract_pronunciation_section(
                 wxr,
                 page_data[-1] if len(page_data) > 0 else base_data,
                 level_node,

diff --git a/src/wiktextract/extractor/de/pronunciation.py b/src/wiktextract/extractor/de/pronunciation.py
@@ -1,214 +1,86 @@
-from typing import Union
-
-from mediawiki_langcodes import code_to_name
 from wikitextprocessor.parser import LevelNode, NodeKind, TemplateNode, WikiNode
 
 from ...page import clean_node
 from ...wxr_context import WiktextractContext
-from ..share import create_audio_url_dict
+from ..share import set_sound_file_url_fields
 from .models import Sound, WordEntry
+from .tags import translate_raw_tags
 
 
-def extract_pronunciation(
+def extract_pronunciation_section(
     wxr: WiktextractContext,
     word_entry: WordEntry,
     level_node: LevelNode,
-):
-    for list_node in level_node.find_child(NodeKind.LIST):
-        sound_data: list[Sound] = [Sound()]
-
-        for not_list_item_node in list_node.invert_find_child(
-            NodeKind.LIST_ITEM
-        ):
-            wxr.wtp.debug(
-                f"Found unexpected non-list-item node in pronunciation "
-                f"section: {not_list_item_node}",
-                sortid="extractor/de/pronunciation/extract_pronunciation/28",
-            )
-
-        for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
-            children = list(list_item_node.filter_empty_str_child())
-            if len(children) == 0:
-                continue
-
-            head_template, rest = children[0], children[1:]
-            if (
-                not isinstance(head_template, WikiNode)
-                or head_template.kind != NodeKind.TEMPLATE
-                or not rest
-            ):
-                wxr.wtp.debug(
-                    f"Found unexpected non-template node in pronunciation "
-                    f"section: {head_template}",
-                    sortid="extractor/de/pronunciation/43",
-                )
-                continue
-            if head_template.template_name == "IPA":
-                process_ipa(wxr, sound_data, rest)
-            elif head_template.template_name == "Hörbeispiele":
-                sound_data.append(Sound())
-                process_hoerbeispiele(wxr, sound_data, rest)
-            elif head_template.template_name == "Reime":
-                process_rhymes(wxr, sound_data, rest, word_entry)
-            else:
-                wxr.wtp.debug(
-                    "Unexpected template in pronunciation section: "
-                    f"{head_template} with content {rest}",
-                    sortid="extractor/de/pronunciation/58)",
-                )
-
-        # Remove empty entries
-        sound_data = [
-            entry
-            for entry in sound_data
-            if entry.model_dump(exclude_defaults=True) != {}
-        ]
-        if len(sound_data) > 0:
-            word_entry.sounds.extend(sound_data)
-
-    for non_list_node in level_node.invert_find_child(NodeKind.LIST):
-        wxr.wtp.debug(
-            "Unexpected non-list node in pronunciation section: "
-            f"{non_list_node}",
-            sortid="extractor/de/pronunciation/extract_pronunciation/64",
-        )
-
-
-def process_ipa(
-    wxr: WiktextractContext,
-    sound_data: list[Sound],
-    nodes: list[Union[WikiNode, str]],
-):
-    for node in nodes:
-        if is_template_node_with_name(node, "Lautschrift"):
-            process_lautschrift_template(wxr, sound_data, node)
-        elif is_tag_node(node):
-            append_tag(wxr, sound_data[-1], node)
-        elif is_new_sound_data_entry_sep(node):
-            sound_data.append(Sound())
-        else:
-            wxr.wtp.debug(
-                f"Found unexpected non-Lautschrift node in IPA section: {node}",
-                sortid="extractor/de/pronunciation/process_ipa/57",
-            )
-
-
-def process_lautschrift_template(
-    wxr: WiktextractContext, sound_data: list[Sound], node: WikiNode
 ) -> None:
-    template_parameters = node.template_parameters
-
-    ipa = template_parameters.get(1, "")
-
-    lang_code = template_parameters.get("spr")
-    if lang_code:
-        lang = code_to_name(lang_code, "de")
-        new_data = {
-            "lang_code": lang_code,
-            "lang": lang,
-        }
-    else:
-        new_data = dict()
-
-    new_data["ipa"] = ipa
-
-    add_sound_data_without_appending_to_existing_properties(
-        wxr,
-        sound_data,
-        new_data,
-    )
-
-
-def process_hoerbeispiele(
-    wxr: WiktextractContext,
-    sound_data: list[Sound],
-    nodes: list[Union[str, WikiNode]],
-):
-    for node in nodes:
-        if is_template_node_with_name(node, "Audio"):
-            process_audio_template(wxr, sound_data, node)
-        elif is_tag_node(node):
-            append_tag(wxr, sound_data[-1], node)
-        elif is_new_sound_data_entry_sep(node):
-            sound_data.append(Sound())
-        else:
-            wxr.wtp.debug(
-                f"Found unexpected node in Hoerbeispiele section: {node}",
-                sortid="extractor/de/pronunciation/process_hoerbeispiele/193",
-            )
-
-
-def process_audio_template(
-    wxr: WiktextractContext, sound_data: list[Sound], node: WikiNode
-):
-    audio_file = node.template_parameters.get(1, "").strip()
-    if len(audio_file) > 0:
-        add_sound_data_without_appending_to_existing_properties(
-            wxr, sound_data, create_audio_url_dict(audio_file)
-        )
-
-
-def process_rhymes(
-    wxr: WiktextractContext,
-    sound_data: list[Sound],
-    nodes: list[WikiNode],
-    word_entry: WordEntry,
-):
-    for node in nodes:
-        if isinstance(node, TemplateNode) and node.template_name == "Reim":
-            # https://de.wiktionary.org/wiki/Vorlage:Reime
-            rhyme = clean_node(wxr, word_entry, node)
-            if rhyme != "":
-                sound_data.append(Sound(rhymes=rhyme))
-
-
-def is_template_node_with_name(node: Union[WikiNode, str], template_name: str):
-    return (
-        isinstance(node, WikiNode)
-        and node.kind == NodeKind.TEMPLATE
-        and node.template_name == template_name
-    )
-
-
-def add_sound_data_without_appending_to_existing_properties(
-    wxr: WiktextractContext,
-    sound_data: list[Sound],
-    new_sound_data: dict,
-):
-    """Creates a new IPA data entry if properties exist in previous entry."""
-    if any(
-        [
-            key in sound_data[-1].model_dump(exclude_defaults=True)
-            for key in new_sound_data.keys()
-        ]
+    for list_node in level_node.find_child(NodeKind.LIST):
+        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+            for sound in extract_pron_list_item(wxr, list_item):
+                word_entry.sounds.append(sound)
+                word_entry.categories.extend(sound.categories)
+
+
+def extract_pron_list_item(
+    wxr: WiktextractContext, list_item: WikiNode
+) -> list[Sound]:
+    raw_tags = []
+    sounds = []
+    for node in list_item.find_child(
+        NodeKind.TEMPLATE | NodeKind.ITALIC | NodeKind.LIST
     ):
-        sound_data.append(Sound())
-
-    for key, value in new_sound_data.items():
-        if key in sound_data[-1].model_fields:
-            if isinstance(value, str):
-                setattr(sound_data[-1], key, value)
-            else:
-                getattr(sound_data[-1], key).extend(value)
-        else:
-            wxr.wtp.debug(
-                f"Unexpected key {key} for Sound",
-                sortid="extractor/de/pronunciation/196",
-            )
-
-
-def is_tag_node(node: Union[WikiNode, str]):
-    return isinstance(node, WikiNode) and node.kind in [
-        NodeKind.TEMPLATE,
-        NodeKind.ITALIC,
-    ]
-
-
-def append_tag(wxr: WiktextractContext, sound_data: Sound, node: WikiNode):
-    tag = clean_node(wxr, None, node)
-    if tag != "":
-        sound_data.raw_tags.append(tag)
-
-
-def is_new_sound_data_entry_sep(node: Union[WikiNode, str]):
-    return isinstance(node, str) and node.strip() in [",", ";"]
+        match node.kind:
+            case NodeKind.ITALIC:
+                node_text = clean_node(wxr, None, node)
+                if node_text.endswith(":"):
+                    raw_tags.append(node_text.removesuffix(":"))
+            case NodeKind.LIST:
+                for next_list_item in node.find_child(NodeKind.LIST_ITEM):
+                    sounds.extend(extract_pron_list_item(wxr, next_list_item))
+            case NodeKind.TEMPLATE:
+                match node.template_name:
+                    case "Lautschrift":
+                        ipa = clean_node(
+                            wxr,
+                            None,
+                            node.template_parameters.get(1, ""),
+                        )
+                        if ipa != "":
+                            sounds.append(Sound(ipa=ipa))
+                            clean_node(wxr, sounds[-1], node)
+                    case "Audio":
+                        new_sound = extract_audio_template(wxr, node)
+                        if new_sound is not None:
+                            sounds.append(new_sound)
+                    case "Reim":
+                        rhyme = clean_node(
+                            wxr,
+                            None,
+                            node.template_parameters.get(1, ""),
+                        )
+                        if rhyme != "":
+                            sounds.append(Sound(rhymes=rhyme))
+                            clean_node(wxr, sounds[-1], node)
+
+    for sound in sounds:
+        sound.raw_tags.extend(raw_tags)
+        translate_raw_tags(sound)
+    return sounds
+
+
+def extract_audio_template(
+    wxr: WiktextractContext, t_node: TemplateNode
+) -> Sound | None:
+    # https://de.wiktionary.org/wiki/Vorlage:Audio
+    filename = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
+    if filename.strip() == "":
+        return None
+    sound = Sound()
+    set_sound_file_url_fields(wxr, filename, sound)
+    expanded_node = wxr.wtp.parse(
+        wxr.wtp.node_to_wikitext(t_node), expand_all=True
+    )
+    for link_node in expanded_node.find_child(NodeKind.LINK):
+        link_str = clean_node(wxr, None, link_node)
+        if "(" in link_str:
+            sound.raw_tags.append(link_str[link_str.index("(") + 1:].strip(")"))
+    clean_node(wxr, sound, expanded_node)
+    return sound
diff --git a/src/wiktextract/extractor/de/tags.py b/src/wiktextract/extractor/de/tags.py
@@ -40,6 +40,7 @@
     # "das": "",
     "Dativ": "dative",
     # "DDR": "",
+    "Deutschland": "Germany",
     # "der": "",
     "dichter.": "poetic",
     # "die": "",

diff --git a/tests/test_de_example.py b/tests/test_de_example.py
@@ -260,7 +260,7 @@ def test_tag_list(self):
                 {
                     "examples": [
                         {
-                            "raw_tags": ["Deutschland"],
+                            "tags": ["Germany"],
                             "text": "„Den ganzen ‚Feber‘ hörte man lapidar",
                         }
                     ],

diff --git a/tests/test_de_gloss.py b/tests/test_de_gloss.py
@@ -230,14 +230,13 @@ def test_italit_node_multiple_raw_tags(self):
             [
                 {
                     "raw_tags": [
-                        "Deutschland",
                         "Fernsehen",
                         "Kurzwort",
                         "Akronym",
                     ],
                     "glosses": ["für das erste Fernsehprogramm der ARD"],
                     "sense_index": "2",
-                    "tags": ["colloquial"],
+                    "tags": ["Germany", "colloquial"],
                 },
             ],
         )