Skip to content

Commit

Permalink
Extract pronunciation data from German Wiktionary
Browse files Browse the repository at this point in the history
Move list_node loop to section extractors

This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France.

Solve issues from code review
  • Loading branch information
empiriker committed Oct 16, 2023
1 parent 3b85751 commit 0a348e5
Show file tree
Hide file tree
Showing 7 changed files with 481 additions and 62 deletions.
94 changes: 45 additions & 49 deletions src/wiktextract/extractor/de/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@


from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LevelNode
from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid

from wiktextract.page import clean_node
Expand All @@ -12,42 +13,43 @@
def extract_examples(
wxr: WiktextractContext,
page_data: List[Dict],
list_node: WikiNode,
level_node: LevelNode,
) -> None:
for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
example_data = defaultdict(str)

ref_nodes = find_and_remove_child(
list_item_node,
NodeKind.HTML,
lambda html_node: html_node.tag == "ref",
for list_node in level_node.find_child(NodeKind.LIST):
for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
example_data = defaultdict(str)

ref_nodes = find_and_remove_child(
list_item_node,
NodeKind.HTML,
lambda html_node: html_node.tag == "ref",
)
for ref_node in ref_nodes:
extract_reference(wxr, example_data, ref_node)

example_text = clean_node(wxr, {}, list_item_node.children)

senseid, example_text = match_senseid(example_text)

if example_text:
example_data["text"] = example_text

if senseid:
for sense in page_data[-1]["senses"]:
if sense["senseid"] == senseid:
sense["examples"].append(example_data)

else:
if example_data:
wxr.wtp.debug(
f"Found example data without senseid and text: {example_data}",
sortid="extractor/de/examples/extract_examples/28",
)
for non_list_node in level_node.invert_find_child(NodeKind.LIST):
wxr.wtp.debug(
f"Found unexpected non-list node in example section: {non_list_node}",
sortid="extractor/de/examples/extract_examples/33",
)
for ref_node in ref_nodes:
extract_reference(wxr, example_data, ref_node)

example_text = clean_node(wxr, {}, list_item_node.children)

senseid, example_text = match_senseid(example_text)

if example_text:
example_data["text"] = example_text

if senseid:
sense_data = [
sense
for sense in page_data[-1]["senses"]
if sense["senseid"] == senseid
]

for sense in sense_data:
sense["examples"].append(example_data)

else:
if example_data:
wxr.wtp.debug(
f"Found example data without senseid and text: {example_data}",
sortid="extractor/de/examples/extract_examples/28",
)


def extract_reference(
Expand All @@ -67,20 +69,14 @@ def extract_reference(
elif len(template_nodes) == 1:
template_node = template_nodes[0]

# Sometimes the title is dynamically generated from the template name,
# so we preset the title. If specified in the template, it will be
# overwritten.
reference_data["titel"] = template_node.largs[0][0].strip()

for arg in template_node.largs[1:]:
arg = clean_node(wxr, {}, arg)
if not arg.strip():
continue
splits = arg.split("=", 1)
if len(splits) != 2:
continue
arg_name, arg_value = arg.split("=", 1)
if arg_name.strip() and arg_value.strip():
reference_data[arg_name.lower()] = arg_value
# Most reference templates follow the Literatur template and use named
# parameters. We extract them here.
# https://de.wiktionary.org/wiki/Vorlage:Literatur
for key, value in template_node.template_parameters.items():
if isinstance(key, str):
reference_data[key.lower()] = clean_node(wxr, {}, value)

# XXX: Treat other templates as well.
# E.g. https://de.wiktionary.org/wiki/Vorlage:Ref-OWID

example_data["ref"] = reference_data
24 changes: 22 additions & 2 deletions src/wiktextract/extractor/de/gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,29 @@
from typing import Dict, List

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LevelNode
from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid

from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext


def extract_glosses(
    wxr: WiktextractContext,
    page_data: List[Dict],
    level_node: LevelNode,
) -> None:
    """Extract sense/gloss data from the lists of a "Bedeutungen" section.

    Each top-level wiki list under the section heading is handed to
    process_gloss_list_item; anything that is not a list is unexpected
    and only logged.
    """
    for list_node in level_node.find_child(NodeKind.LIST):
        process_gloss_list_item(wxr, page_data, list_node)

    for non_list_node in level_node.invert_find_child(NodeKind.LIST):
        # Fixed copy-paste from the pronunciation extractor: this debug
        # message and sortid now correctly reference the gloss section.
        wxr.wtp.debug(
            f"Found unexpected non-list node in gloss section: {non_list_node}",
            sortid="extractor/de/gloss/extract_glosses/19",
        )


def process_gloss_list_item(
wxr: WiktextractContext,
page_data: List[Dict],
list_node: WikiNode,
Expand Down Expand Up @@ -54,7 +70,11 @@ def extract_glosses(
senseid, gloss_text = match_senseid(gloss_text)

if senseid:
senseid if senseid[0].isnumeric() else parent_senseid + senseid
senseid = (
senseid
if senseid[0].isnumeric()
else parent_senseid + senseid
)
gloss_data["senseid"] = senseid
else:
wxr.wtp.debug(
Expand All @@ -71,7 +91,7 @@ def extract_glosses(
page_data[-1]["senses"].append(gloss_data)

for sub_list_node in sub_glosses_list_nodes:
extract_glosses(
process_gloss_list_item(
wxr,
page_data,
sub_list_node,
Expand Down
6 changes: 4 additions & 2 deletions src/wiktextract/extractor/de/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from wikitextprocessor.parser import LevelNode

from wiktextract.datautils import append_base_data
from wiktextract.extractor.de.pronunciation import extract_pronunciation
from wiktextract.wxr_context import WiktextractContext

from .gloss import extract_glosses
Expand Down Expand Up @@ -71,9 +72,10 @@ def parse_section(
wxr.wtp.start_subsection(section_name)
if section_name == "Bedeutungen":
extract_glosses(wxr, page_data, level_node)
if section_name == "Aussprache":
extract_pronunciation(wxr, page_data, level_node)
if section_name == "Beispiele":
for list_node in level_node.find_child(NodeKind.LIST):
extract_examples(wxr, page_data, list_node)
extract_examples(wxr, page_data, level_node)


FORM_POS = {
Expand Down
189 changes: 189 additions & 0 deletions src/wiktextract/extractor/de/pronunciation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
from collections import defaultdict
from typing import Dict, List, Union

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LevelNode
from wiktextract.extractor.share import create_audio_url_dict

from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext


def extract_pronunciation(
    wxr: WiktextractContext,
    page_data: List[Dict],
    level_node: LevelNode,
):
    """Extract IPA, audio and rhyme data from a German Wiktionary
    "Aussprache" (pronunciation) section.

    Collects sound entries from each wiki list under the section heading
    and appends the non-empty ones to page_data[-1]["sounds"].
    Unexpected node shapes are logged via wxr.wtp.debug rather than
    raising, so one malformed page does not stop extraction.
    """
    for list_node in level_node.find_child(NodeKind.LIST):
        sound_data = [defaultdict(list)]

        for not_list_item_node in list_node.invert_find_child(
            NodeKind.LIST_ITEM
        ):
            wxr.wtp.debug(
                f"Found unexpected non-list-item node in pronunciation section: {not_list_item_node}",
                sortid="extractor/de/pronunciation/extract_pronunciation/28",
            )

        for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
            # Removed a leftover debug print(children) statement here.
            children = list(list_item_node.filter_empty_str_child())
            if len(children) == 0:
                continue

            # Each list item is expected to start with a template (IPA,
            # Hörbeispiele or Reime) followed by the actual values.
            head_template, rest = children[0], children[1:]
            if (
                not isinstance(head_template, WikiNode)
                or head_template.kind != NodeKind.TEMPLATE
                or not rest
            ):
                wxr.wtp.debug(
                    f"Found unexpected non-template node in pronunciation section: {head_template}",
                    sortid="extractor/de/pronunciation/extract_pronunciation/37",
                )
                continue
            if head_template.template_name == "IPA":
                process_ipa(wxr, sound_data, rest)
            elif head_template.template_name == "Hörbeispiele":
                # Audio examples always start a new sound entry.
                sound_data.append(defaultdict(list))
                process_hoerbeispiele(wxr, sound_data, rest)
            elif head_template.template_name == "Reime":
                process_rhymes(wxr, sound_data, rest)
            else:
                wxr.wtp.debug(
                    f"Found unexpected template in pronunciation section: {head_template} with content {rest}",
                    # Fixed stray ')' that had leaked into this sortid.
                    sortid="extractor/de/pronunciation/extract_pronunciation/45",
                )

        # Remove empty entries
        sound_data = [entry for entry in sound_data if entry != {}]
        if len(sound_data) > 0:
            page_data[-1]["sounds"].extend(sound_data)

    for non_list_node in level_node.invert_find_child(NodeKind.LIST):
        wxr.wtp.debug(
            f"Found unexpected non-list node in pronunciation section: {non_list_node}",
            sortid="extractor/de/pronunciation/extract_pronunciation/64",
        )


def process_ipa(
    wxr: WiktextractContext,
    sound_data: List[Dict],
    nodes: List[Union[WikiNode, str]],
):
    """Consume the nodes following an IPA template, collecting
    transcriptions, tag nodes and entry separators into sound_data.

    The input list is consumed front-to-back (it is emptied as a side
    effect, matching the original recursive implementation).
    """
    while nodes:
        head_node = nodes.pop(0)

        if is_template_node_with_name(head_node, "Lautschrift"):
            process_lautschrift_template(wxr, sound_data, head_node)
        elif is_tag_node(head_node):
            append_tag(wxr, sound_data, head_node)
        elif is_new_sound_data_entry_sep(head_node):
            sound_data.append(defaultdict(list))
        else:
            wxr.wtp.debug(
                f"Found unexpected non-Lautschrift node in IPA section: {head_node}",
                sortid="extractor/de/pronunciation/process_ipa/57",
            )


def process_lautschrift_template(
    wxr: WiktextractContext, sound_data: List[Dict], node
):
    """Store the IPA string of a Lautschrift template.

    Without a "spr" (language) argument the transcription is appended to
    the current entry; with one, a language-scoped entry is created via
    add_sound_data_without_appending_to_existing_properties.
    """
    params = node.template_parameters
    ipa = params.get(1)
    lang_code = params.get("spr")

    if not lang_code:
        sound_data[-1]["ipa"].append(ipa)
        return

    language = wxr.wtp.LANGUAGES_BY_CODE[lang_code]
    new_entry = {
        "ipa": [ipa],
        "lang_code": lang_code,
        "language": language,
    }
    add_sound_data_without_appending_to_existing_properties(
        sound_data, new_entry
    )


def process_hoerbeispiele(
    wxr: WiktextractContext, sound_data: List[Dict], nodes: List[WikiNode]
):
    """Collect audio files, tags and entry separators following a
    Hörbeispiele template.

    NOTE: check order matters — an Audio template must be matched before
    the generic template/italic tag check, which also accepts templates.
    """
    for child in nodes:
        if is_template_node_with_name(child, "Audio"):
            process_audio_template(wxr, sound_data, child)
            continue
        if is_tag_node(child):
            append_tag(wxr, sound_data, child)
            continue
        if is_new_sound_data_entry_sep(child):
            sound_data.append(defaultdict(list))
            continue
        wxr.wtp.debug(
            f"Found unexpected node in Hoerbeispiele section: {child}",
            sortid="extractor/de/pronunciation/process_hoerbeispiele/193",
        )


def process_audio_template(
    wxr: WiktextractContext, sound_data: List[Dict], node
):
    """Add audio URL data for an Audio template's first positional
    argument (the audio file name); no-op when it is missing/empty."""
    audio_file = node.template_parameters.get(1)
    if not audio_file:
        return
    add_sound_data_without_appending_to_existing_properties(
        sound_data, create_audio_url_dict(audio_file)
    )


def process_rhymes(
    wxr: WiktextractContext, sound_data: List[Dict], nodes: List[WikiNode]
):
    """Placeholder for extracting rhyme data from a Reime template.

    Currently a deliberate no-op: the rhymes live on a separate
    referenced page that is not fetched yet.
    """
    # XXX: Extract rhymes from the referenced rhymes page
    pass


def is_template_node_with_name(node: Union[WikiNode, str], template_name: str):
    """Return True if *node* is a template WikiNode whose template name
    equals *template_name*."""
    if not isinstance(node, WikiNode):
        return False
    if node.kind != NodeKind.TEMPLATE:
        return False
    return node.template_name == template_name


def add_sound_data_without_appending_to_existing_properties(
    sound_data: List[Dict],
    new_sound_data: Dict,
):
    """Merge *new_sound_data* into the last entry of *sound_data*.

    If the last entry already contains any of the incoming keys, a fresh
    ``defaultdict(list)`` entry is started first, so existing values are
    never overwritten or mixed with unrelated data. String values are
    assigned directly; other values (lists) are extended onto the entry.
    """
    # Start a new entry rather than clobbering keys already present.
    # (Idiom fix: generator instead of building a throwaway list for any().)
    if any(key in sound_data[-1] for key in new_sound_data):
        sound_data.append(defaultdict(list))

    for key, value in new_sound_data.items():
        if isinstance(value, str):
            sound_data[-1][key] = value
        else:
            sound_data[-1][key].extend(value)


def is_tag_node(node: Union[WikiNode, str]):
    """Return True for nodes rendered as descriptive tags: templates and
    italic spans."""
    if not isinstance(node, WikiNode):
        return False
    return node.kind in (NodeKind.TEMPLATE, NodeKind.ITALIC)


def append_tag(wxr: WiktextractContext, sound_data: List[Dict], node: WikiNode):
    """Clean *node* to plain text and, if non-empty, append it to the
    "tags" list of the last sound entry.

    Annotation fixed: callers pass the whole sound_data list (List[Dict]),
    not a single dict.
    """
    tag = clean_node(wxr, {}, node).strip()
    if tag:
        sound_data[-1]["tags"].append(tag)


def is_new_sound_data_entry_sep(node: Union[WikiNode, str]):
return isinstance(node, str) and node.strip() in [",", ";"]
Loading

0 comments on commit 0a348e5

Please sign in to comment.