Skip to content

Commit

Permalink
[de] extract nested sound lists
Browse files Browse the repository at this point in the history
  • Loading branch information
xxyzz committed Dec 2, 2024
1 parent 95d2be1 commit 27dc26a
Show file tree
Hide file tree
Showing 7 changed files with 122 additions and 390 deletions.
16 changes: 2 additions & 14 deletions src/wiktextract/extractor/de/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,28 +105,16 @@ class Sense(BaseModelWrap):

class Sound(BaseModelWrap):
ipa: str = Field(default="", description="International Phonetic Alphabet")
# phonetic_transcription: list[str] = Field(
# default=[], description="Phonetic transcription, less exact than IPA."
# )
audio: str = Field(default="", description="Audio file name")
wav_url: str = Field(default="")
ogg_url: str = Field(default="")
mp3_url: str = Field(default="")
oga_url: str = Field(default="")
flac_url: str = Field(default="")
lang_code: str = Field(default="", description="Wiktionary language code")
lang: str = Field(default="", description="Localized language name")
# roman: list[str] = Field(
# default=[], description="Transliteration to Roman characters"
# )
# syllabic: list[str] = Field(
# default=[], description="Syllabic transcription"
# )
raw_tags: list[str] = Field(
default=[], description="Specifying the variant of the pronunciation"
)
raw_tags: list[str] = []
tags: list[str] = []
rhymes: str = ""
categories: list[str] = Field(default=[], exclude=True)


class Form(BaseModelWrap):
Expand Down
4 changes: 2 additions & 2 deletions src/wiktextract/extractor/de/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from .inflection import extract_inf_table_template
from .linkage import extract_linkages
from .models import Sense, WordEntry
from .pronunciation import extract_pronunciation
from .pronunciation import extract_pronunciation_section
from .section_titles import FORM_TITLES, LINKAGE_TITLES, POS_SECTIONS
from .translation import extract_translation

Expand Down Expand Up @@ -43,7 +43,7 @@ def parse_section(
level_node,
)
elif wxr.config.capture_pronunciation and section_name == "Aussprache":
extract_pronunciation(
extract_pronunciation_section(
wxr,
page_data[-1] if len(page_data) > 0 else base_data,
level_node,
Expand Down
276 changes: 74 additions & 202 deletions src/wiktextract/extractor/de/pronunciation.py
Original file line number Diff line number Diff line change
@@ -1,214 +1,86 @@
from typing import Union

from mediawiki_langcodes import code_to_name
from wikitextprocessor.parser import LevelNode, NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from ..share import create_audio_url_dict
from ..share import set_sound_file_url_fields
from .models import Sound, WordEntry
from .tags import translate_raw_tags


def extract_pronunciation(
def extract_pronunciation_section(
wxr: WiktextractContext,
word_entry: WordEntry,
level_node: LevelNode,
):
for list_node in level_node.find_child(NodeKind.LIST):
sound_data: list[Sound] = [Sound()]

for not_list_item_node in list_node.invert_find_child(
NodeKind.LIST_ITEM
):
wxr.wtp.debug(
f"Found unexpected non-list-item node in pronunciation "
f"section: {not_list_item_node}",
sortid="extractor/de/pronunciation/extract_pronunciation/28",
)

for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
children = list(list_item_node.filter_empty_str_child())
if len(children) == 0:
continue

head_template, rest = children[0], children[1:]
if (
not isinstance(head_template, WikiNode)
or head_template.kind != NodeKind.TEMPLATE
or not rest
):
wxr.wtp.debug(
f"Found unexpected non-template node in pronunciation "
f"section: {head_template}",
sortid="extractor/de/pronunciation/43",
)
continue
if head_template.template_name == "IPA":
process_ipa(wxr, sound_data, rest)
elif head_template.template_name == "Hörbeispiele":
sound_data.append(Sound())
process_hoerbeispiele(wxr, sound_data, rest)
elif head_template.template_name == "Reime":
process_rhymes(wxr, sound_data, rest, word_entry)
else:
wxr.wtp.debug(
"Unexpected template in pronunciation section: "
f"{head_template} with content {rest}",
sortid="extractor/de/pronunciation/58)",
)

# Remove empty entries
sound_data = [
entry
for entry in sound_data
if entry.model_dump(exclude_defaults=True) != {}
]
if len(sound_data) > 0:
word_entry.sounds.extend(sound_data)

for non_list_node in level_node.invert_find_child(NodeKind.LIST):
wxr.wtp.debug(
"Unexpected non-list node in pronunciation section: "
f"{non_list_node}",
sortid="extractor/de/pronunciation/extract_pronunciation/64",
)


def process_ipa(
    wxr: WiktextractContext,
    sound_data: list[Sound],
    nodes: list[Union[WikiNode, str]],
):
    """Dispatch each node that follows an ``IPA`` head template.

    ``Lautschrift`` templates are handed to
    ``process_lautschrift_template``; any other template or italic node is
    recorded as a raw tag on the most recent ``Sound``; a bare ``,`` or ``;``
    string opens a fresh ``Sound`` entry. Anything else is reported through
    ``wxr.wtp.debug``.

    ``sound_data`` is mutated in place; it must be non-empty because the
    tag branch indexes ``sound_data[-1]``.
    """
    for item in nodes:
        # "Lautschrift" must be tested before the generic tag check,
        # because a template node also satisfies ``is_tag_node``.
        if is_template_node_with_name(item, "Lautschrift"):
            process_lautschrift_template(wxr, sound_data, item)
            continue
        if is_tag_node(item):
            append_tag(wxr, sound_data[-1], item)
            continue
        if is_new_sound_data_entry_sep(item):
            sound_data.append(Sound())
            continue
        wxr.wtp.debug(
            f"Found unexpected non-Lautschrift node in IPA section: {item}",
            sortid="extractor/de/pronunciation/process_ipa/57",
        )


def process_lautschrift_template(
    wxr: WiktextractContext, sound_data: list[Sound], node: WikiNode
) -> None:
    """Turn a ``Lautschrift`` template into IPA (and language) sound fields.

    The first positional parameter is taken as the IPA string. When the
    ``spr`` parameter is present and truthy it is stored as ``lang_code``
    and its name, looked up via ``code_to_name(..., "de")``, as ``lang``.
    The collected fields are merged into ``sound_data`` by
    ``add_sound_data_without_appending_to_existing_properties``.
    """
    params = node.template_parameters
    ipa_text = params.get(1, "")

    # Key order (lang_code, lang, ipa) is kept: the merge helper iterates
    # over the dict in insertion order.
    fields = {}
    spr = params.get("spr")
    if spr:
        fields["lang_code"] = spr
        fields["lang"] = code_to_name(spr, "de")
    fields["ipa"] = ipa_text

    add_sound_data_without_appending_to_existing_properties(
        wxr,
        sound_data,
        fields,
    )


def process_hoerbeispiele(
    wxr: WiktextractContext,
    sound_data: list[Sound],
    nodes: list[Union[str, WikiNode]],
):
    """Dispatch each node that follows a ``Hörbeispiele`` head template.

    ``Audio`` templates go to ``process_audio_template``; any other template
    or italic node becomes a raw tag on the latest ``Sound``; a bare ``,`` or
    ``;`` string starts a new ``Sound``. Unrecognised nodes are reported
    through ``wxr.wtp.debug``. ``sound_data`` is mutated in place.
    """
    for item in nodes:
        # "Audio" must be tested before the generic tag check, because a
        # template node also satisfies ``is_tag_node``.
        if is_template_node_with_name(item, "Audio"):
            process_audio_template(wxr, sound_data, item)
            continue
        if is_tag_node(item):
            append_tag(wxr, sound_data[-1], item)
            continue
        if is_new_sound_data_entry_sep(item):
            sound_data.append(Sound())
            continue
        wxr.wtp.debug(
            f"Found unexpected node in Hoerbeispiele section: {item}",
            sortid="extractor/de/pronunciation/process_hoerbeispiele/193",
        )


def process_audio_template(
    wxr: WiktextractContext, sound_data: list[Sound], node: WikiNode
):
    """Merge the audio file named by an ``Audio`` template into ``sound_data``.

    The first positional parameter, stripped of surrounding whitespace, is
    the file name. Nothing happens when it is empty; otherwise the dict
    returned by ``create_audio_url_dict`` is merged via
    ``add_sound_data_without_appending_to_existing_properties``.
    """
    file_name = node.template_parameters.get(1, "").strip()
    if not file_name:
        return
    url_fields = create_audio_url_dict(file_name)
    add_sound_data_without_appending_to_existing_properties(
        wxr, sound_data, url_fields
    )


def process_rhymes(
    wxr: WiktextractContext,
    sound_data: list[Sound],
    nodes: list[WikiNode],
    word_entry: WordEntry,
):
    """Append one ``Sound(rhymes=...)`` per non-empty ``Reim`` template.

    Nodes that are not ``Reim`` template nodes are ignored. Each matching
    template is rendered with ``clean_node`` (passing ``word_entry`` as its
    second argument) and skipped if the result is the empty string.
    """
    for candidate in nodes:
        is_reim = (
            isinstance(candidate, TemplateNode)
            and candidate.template_name == "Reim"
        )
        if not is_reim:
            continue
        # https://de.wiktionary.org/wiki/Vorlage:Reime
        rhyme_text = clean_node(wxr, word_entry, candidate)
        if rhyme_text == "":
            continue
        sound_data.append(Sound(rhymes=rhyme_text))


def is_template_node_with_name(node: Union[WikiNode, str], template_name: str):
    """Tell whether ``node`` is a template node named ``template_name``.

    Plain strings (and anything else that is not a ``WikiNode``) and
    non-template nodes yield ``False``.
    """
    if not isinstance(node, WikiNode):
        return False
    if node.kind != NodeKind.TEMPLATE:
        return False
    return node.template_name == template_name


def add_sound_data_without_appending_to_existing_properties(
wxr: WiktextractContext,
sound_data: list[Sound],
new_sound_data: dict,
):
"""Creates a new IPA data entry if properties exist in previous entry."""
if any(
[
key in sound_data[-1].model_dump(exclude_defaults=True)
for key in new_sound_data.keys()
]
for list_node in level_node.find_child(NodeKind.LIST):
for list_item in list_node.find_child(NodeKind.LIST_ITEM):
for sound in extract_pron_list_item(wxr, list_item):
word_entry.sounds.append(sound)
word_entry.categories.extend(sound.categories)


def extract_pron_list_item(
wxr: WiktextractContext, list_item: WikiNode
) -> list[Sound]:
raw_tags = []
sounds = []
for node in list_item.find_child(
NodeKind.TEMPLATE | NodeKind.ITALIC | NodeKind.LIST
):
sound_data.append(Sound())

for key, value in new_sound_data.items():
if key in sound_data[-1].model_fields:
if isinstance(value, str):
setattr(sound_data[-1], key, value)
else:
getattr(sound_data[-1], key).extend(value)
else:
wxr.wtp.debug(
f"Unexpected key {key} for Sound",
sortid="extractor/de/pronunciation/196",
)


def is_tag_node(node: Union[WikiNode, str]):
    """Tell whether ``node`` is a ``WikiNode`` of kind TEMPLATE or ITALIC."""
    if not isinstance(node, WikiNode):
        return False
    tag_kinds = [
        NodeKind.TEMPLATE,
        NodeKind.ITALIC,
    ]
    return node.kind in tag_kinds


def append_tag(wxr: WiktextractContext, sound_data: Sound, node: WikiNode):
    """Add the cleaned text of ``node`` to ``sound_data.raw_tags``.

    The text comes from ``clean_node(wxr, None, node)``; an empty result is
    not recorded.
    """
    tag_text = clean_node(wxr, None, node)
    if tag_text == "":
        return
    sound_data.raw_tags.append(tag_text)


def is_new_sound_data_entry_sep(node: Union[WikiNode, str]):
    """Tell whether ``node`` is a string that is just ``,`` or ``;``.

    Surrounding whitespace is ignored. Non-string nodes always yield
    ``False``.
    """
    if not isinstance(node, str):
        return False
    separator = node.strip()
    return separator in [",", ";"]
match node.kind:
case NodeKind.ITALIC:
node_text = clean_node(wxr, None, node)
if node_text.endswith(":"):
raw_tags.append(node_text.removesuffix(":"))
case NodeKind.LIST:
for next_list_item in node.find_child(NodeKind.LIST_ITEM):
sounds.extend(extract_pron_list_item(wxr, next_list_item))
case NodeKind.TEMPLATE:
match node.template_name:
case "Lautschrift":
ipa = clean_node(
wxr,
None,
node.template_parameters.get(1, ""),
)
if ipa != "":
sounds.append(Sound(ipa=ipa))
clean_node(wxr, sounds[-1], node)
case "Audio":
new_sound = extract_audio_template(wxr, node)
if new_sound is not None:
sounds.append(new_sound)
case "Reim":
rhyme = clean_node(
wxr,
None,
node.template_parameters.get(1, ""),
)
if rhyme != "":
sounds.append(Sound(rhymes=rhyme))
clean_node(wxr, sounds[-1], node)

for sound in sounds:
sound.raw_tags.extend(raw_tags)
translate_raw_tags(sound)
return sounds


def extract_audio_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> Sound | None:
    """Build a ``Sound`` from an ``Audio`` template.

    Returns ``None`` when the cleaned first positional parameter is empty
    or only whitespace. Otherwise the file name is passed to
    ``set_sound_file_url_fields`` and the template is re-parsed with
    ``expand_all=True``. For every LINK child whose cleaned text contains
    ``(``, the text after the first ``(``, with ``)`` characters stripped
    from both ends, is added to ``raw_tags``.
    """
    # https://de.wiktionary.org/wiki/Vorlage:Audio
    file_name = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    if not file_name.strip():
        return None

    result = Sound()
    set_sound_file_url_fields(wxr, file_name, result)

    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(template_wikitext, expand_all=True)
    for link in expanded.find_child(NodeKind.LINK):
        link_text = clean_node(wxr, None, link)
        _, paren, after_paren = link_text.partition("(")
        if paren:
            result.raw_tags.append(after_paren.strip(")"))

    # Return value is discarded; the call is made with ``result`` as the
    # target object — presumably to collect categories onto it (TODO confirm).
    clean_node(wxr, result, expanded)
    return result
1 change: 1 addition & 0 deletions src/wiktextract/extractor/de/tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
# "das": "",
"Dativ": "dative",
# "DDR": "",
"Deutschland": "Germany",
# "der": "",
"dichter.": "poetic",
# "die": "",
Expand Down
2 changes: 1 addition & 1 deletion tests/test_de_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ def test_tag_list(self):
{
"examples": [
{
"raw_tags": ["Deutschland"],
"tags": ["Germany"],
"text": "„Den ganzen ‚Feber‘ hörte man lapidar",
}
],
Expand Down
3 changes: 1 addition & 2 deletions tests/test_de_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,14 +230,13 @@ def test_italit_node_multiple_raw_tags(self):
[
{
"raw_tags": [
"Deutschland",
"Fernsehen",
"Kurzwort",
"Akronym",
],
"glosses": ["für das erste Fernsehprogramm der ARD"],
"sense_index": "2",
"tags": ["colloquial"],
"tags": ["Germany", "colloquial"],
},
],
)
Expand Down
Loading

0 comments on commit 27dc26a

Please sign in to comment.