Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[de] extract nested sound lists #931

Merged
merged 1 commit into from
Dec 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 2 additions & 14 deletions src/wiktextract/extractor/de/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,28 +105,16 @@ class Sense(BaseModelWrap):

class Sound(BaseModelWrap):
    # Pronunciation data attached to a word entry: IPA, audio file URLs,
    # rhymes and the tags qualifying them.
    # NOTE: documented with `#` comments rather than a class docstring so
    # that the generated model schema text is left untouched.
    ipa: str = Field(default="", description="International Phonetic Alphabet")
    audio: str = Field(default="", description="Audio file name")
    # One URL field per audio container format.
    wav_url: str = Field(default="")
    ogg_url: str = Field(default="")
    mp3_url: str = Field(default="")
    oga_url: str = Field(default="")
    flac_url: str = Field(default="")
    lang_code: str = Field(default="", description="Wiktionary language code")
    lang: str = Field(default="", description="Localized language name")
    # Specifying the variant of the pronunciation.
    # Declared exactly once: a second `raw_tags` declaration in the same
    # class body would silently override the first one.
    raw_tags: list[str] = []
    tags: list[str] = []
    rhymes: str = ""
    # Collected while cleaning nodes; excluded from the serialized output.
    categories: list[str] = Field(default=[], exclude=True)


class Form(BaseModelWrap):
Expand Down
4 changes: 2 additions & 2 deletions src/wiktextract/extractor/de/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from .inflection import extract_inf_table_template
from .linkage import extract_linkages
from .models import Sense, WordEntry
from .pronunciation import extract_pronunciation
from .pronunciation import extract_pronunciation_section
from .section_titles import FORM_TITLES, LINKAGE_TITLES, POS_SECTIONS
from .translation import extract_translation

Expand Down Expand Up @@ -43,7 +43,7 @@ def parse_section(
level_node,
)
elif wxr.config.capture_pronunciation and section_name == "Aussprache":
extract_pronunciation(
extract_pronunciation_section(
wxr,
page_data[-1] if len(page_data) > 0 else base_data,
level_node,
Expand Down
276 changes: 74 additions & 202 deletions src/wiktextract/extractor/de/pronunciation.py
Original file line number Diff line number Diff line change
@@ -1,214 +1,86 @@
from typing import Union

from mediawiki_langcodes import code_to_name
from wikitextprocessor.parser import LevelNode, NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from ..share import create_audio_url_dict
from ..share import set_sound_file_url_fields
from .models import Sound, WordEntry
from .tags import translate_raw_tags


def extract_pronunciation(
def extract_pronunciation_section(
wxr: WiktextractContext,
word_entry: WordEntry,
level_node: LevelNode,
):
for list_node in level_node.find_child(NodeKind.LIST):
sound_data: list[Sound] = [Sound()]

for not_list_item_node in list_node.invert_find_child(
NodeKind.LIST_ITEM
):
wxr.wtp.debug(
f"Found unexpected non-list-item node in pronunciation "
f"section: {not_list_item_node}",
sortid="extractor/de/pronunciation/extract_pronunciation/28",
)

for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
children = list(list_item_node.filter_empty_str_child())
if len(children) == 0:
continue

head_template, rest = children[0], children[1:]
if (
not isinstance(head_template, WikiNode)
or head_template.kind != NodeKind.TEMPLATE
or not rest
):
wxr.wtp.debug(
f"Found unexpected non-template node in pronunciation "
f"section: {head_template}",
sortid="extractor/de/pronunciation/43",
)
continue
if head_template.template_name == "IPA":
process_ipa(wxr, sound_data, rest)
elif head_template.template_name == "Hörbeispiele":
sound_data.append(Sound())
process_hoerbeispiele(wxr, sound_data, rest)
elif head_template.template_name == "Reime":
process_rhymes(wxr, sound_data, rest, word_entry)
else:
wxr.wtp.debug(
"Unexpected template in pronunciation section: "
f"{head_template} with content {rest}",
sortid="extractor/de/pronunciation/58)",
)

# Remove empty entries
sound_data = [
entry
for entry in sound_data
if entry.model_dump(exclude_defaults=True) != {}
]
if len(sound_data) > 0:
word_entry.sounds.extend(sound_data)

for non_list_node in level_node.invert_find_child(NodeKind.LIST):
wxr.wtp.debug(
"Unexpected non-list node in pronunciation section: "
f"{non_list_node}",
sortid="extractor/de/pronunciation/extract_pronunciation/64",
)


def process_ipa(
    wxr: WiktextractContext,
    sound_data: list[Sound],
    nodes: list[Union[WikiNode, str]],
):
    """Dispatch each node that follows an "IPA" head template.

    "Lautschrift" templates are handed to `process_lautschrift_template`,
    tag-like nodes are appended as raw tags to the last `Sound`, a "," or
    ";" separator opens a new `Sound`, and anything else is reported
    through the debug log.
    """
    for child in nodes:
        if is_template_node_with_name(child, "Lautschrift"):
            process_lautschrift_template(wxr, sound_data, child)
            continue
        if is_tag_node(child):
            append_tag(wxr, sound_data[-1], child)
            continue
        if is_new_sound_data_entry_sep(child):
            sound_data.append(Sound())
            continue
        # Nothing matched: leave a trace instead of dropping it silently.
        wxr.wtp.debug(
            f"Found unexpected non-Lautschrift node in IPA section: {child}",
            sortid="extractor/de/pronunciation/process_ipa/57",
        )


def process_lautschrift_template(
    wxr: WiktextractContext, sound_data: list[Sound], node: WikiNode
) -> None:
    """Record the data of one "Lautschrift" template in `sound_data`.

    Positional parameter 1 is stored as `ipa`. When the `spr` parameter is
    set it is stored as `lang_code`, together with the name returned by
    `code_to_name(lang_code, "de")` as `lang`.
    """
    params = node.template_parameters

    # Key order (lang_code, lang, ipa) is kept as-is because the helper
    # below iterates over the keys.
    fields = {}
    spr = params.get("spr")
    if spr:
        fields["lang_code"] = spr
        fields["lang"] = code_to_name(spr, "de")
    fields["ipa"] = params.get(1, "")

    add_sound_data_without_appending_to_existing_properties(
        wxr, sound_data, fields
    )


def process_hoerbeispiele(
    wxr: WiktextractContext,
    sound_data: list[Sound],
    nodes: list[Union[str, WikiNode]],
):
    """Dispatch each node that follows a "Hörbeispiele" head template.

    "Audio" templates are handed to `process_audio_template`, tag-like
    nodes are appended as raw tags to the last `Sound`, a "," or ";"
    separator opens a new `Sound`, and anything else is reported through
    the debug log.
    """
    for child in nodes:
        if is_template_node_with_name(child, "Audio"):
            process_audio_template(wxr, sound_data, child)
            continue
        if is_tag_node(child):
            append_tag(wxr, sound_data[-1], child)
            continue
        if is_new_sound_data_entry_sep(child):
            sound_data.append(Sound())
            continue
        # Nothing matched: leave a trace instead of dropping it silently.
        wxr.wtp.debug(
            f"Found unexpected node in Hoerbeispiele section: {child}",
            sortid="extractor/de/pronunciation/process_hoerbeispiele/193",
        )


def process_audio_template(
    wxr: WiktextractContext, sound_data: list[Sound], node: WikiNode
):
    """Add the audio URLs for an "Audio" template's first parameter.

    Does nothing when the first positional parameter is missing or blank.
    """
    file_name = node.template_parameters.get(1, "").strip()
    if not file_name:
        return
    url_fields = create_audio_url_dict(file_name)
    add_sound_data_without_appending_to_existing_properties(
        wxr, sound_data, url_fields
    )


def process_rhymes(
    wxr: WiktextractContext,
    sound_data: list[Sound],
    nodes: list[WikiNode],
    word_entry: WordEntry,
):
    """Append one `Sound(rhymes=...)` per non-empty "Reim" template.

    The rhyme text is the cleaned template node; `word_entry` is passed to
    `clean_node` as its second argument.
    """
    # https://de.wiktionary.org/wiki/Vorlage:Reime
    for child in nodes:
        if not isinstance(child, TemplateNode):
            continue
        if child.template_name != "Reim":
            continue
        rhyme_text = clean_node(wxr, word_entry, child)
        if rhyme_text == "":
            continue
        sound_data.append(Sound(rhymes=rhyme_text))


def is_template_node_with_name(node: Union[WikiNode, str], template_name: str):
    """Tell whether `node` is a template node named `template_name`."""
    if not isinstance(node, WikiNode):
        return False
    return (
        node.kind == NodeKind.TEMPLATE
        and node.template_name == template_name
    )


def add_sound_data_without_appending_to_existing_properties(
wxr: WiktextractContext,
sound_data: list[Sound],
new_sound_data: dict,
):
"""Creates a new IPA data entry if properties exist in previous entry."""
if any(
[
key in sound_data[-1].model_dump(exclude_defaults=True)
for key in new_sound_data.keys()
]
for list_node in level_node.find_child(NodeKind.LIST):
for list_item in list_node.find_child(NodeKind.LIST_ITEM):
for sound in extract_pron_list_item(wxr, list_item):
word_entry.sounds.append(sound)
word_entry.categories.extend(sound.categories)


def extract_pron_list_item(
wxr: WiktextractContext, list_item: WikiNode
) -> list[Sound]:
raw_tags = []
sounds = []
for node in list_item.find_child(
NodeKind.TEMPLATE | NodeKind.ITALIC | NodeKind.LIST
):
sound_data.append(Sound())

for key, value in new_sound_data.items():
if key in sound_data[-1].model_fields:
if isinstance(value, str):
setattr(sound_data[-1], key, value)
else:
getattr(sound_data[-1], key).extend(value)
else:
wxr.wtp.debug(
f"Unexpected key {key} for Sound",
sortid="extractor/de/pronunciation/196",
)


def is_tag_node(node: Union[WikiNode, str]):
    """Tell whether `node` is a template or italic node (treated as a tag)."""
    if not isinstance(node, WikiNode):
        return False
    return node.kind in (NodeKind.TEMPLATE, NodeKind.ITALIC)


def append_tag(wxr: WiktextractContext, sound_data: Sound, node: WikiNode):
    """Append the cleaned text of `node` to `sound_data.raw_tags`.

    Nodes whose cleaned text is empty are ignored.
    """
    tag_text = clean_node(wxr, None, node)
    if tag_text == "":
        return
    sound_data.raw_tags.append(tag_text)


def is_new_sound_data_entry_sep(node: Union[WikiNode, str]):
    """Tell whether `node` is a plain string that is just "," or ";".

    Surrounding whitespace is ignored; non-string nodes never qualify.
    """
    if not isinstance(node, str):
        return False
    return node.strip() in (",", ";")
match node.kind:
case NodeKind.ITALIC:
node_text = clean_node(wxr, None, node)
if node_text.endswith(":"):
raw_tags.append(node_text.removesuffix(":"))
case NodeKind.LIST:
for next_list_item in node.find_child(NodeKind.LIST_ITEM):
sounds.extend(extract_pron_list_item(wxr, next_list_item))
case NodeKind.TEMPLATE:
match node.template_name:
case "Lautschrift":
ipa = clean_node(
wxr,
None,
node.template_parameters.get(1, ""),
)
if ipa != "":
sounds.append(Sound(ipa=ipa))
clean_node(wxr, sounds[-1], node)
case "Audio":
new_sound = extract_audio_template(wxr, node)
if new_sound is not None:
sounds.append(new_sound)
case "Reim":
rhyme = clean_node(
wxr,
None,
node.template_parameters.get(1, ""),
)
if rhyme != "":
sounds.append(Sound(rhymes=rhyme))
clean_node(wxr, sounds[-1], node)

for sound in sounds:
sound.raw_tags.extend(raw_tags)
translate_raw_tags(sound)
return sounds


def extract_audio_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> Sound | None:
    """Build a `Sound` from an "Audio" template, or return None.

    None is returned when the first positional parameter cleans to a blank
    string. Otherwise the file URL fields are filled in, the template is
    expanded, and for every link in the expansion whose text contains "("
    the part after the first "(" (with ")" stripped from both ends) is
    added as a raw tag.
    """
    # https://de.wiktionary.org/wiki/Vorlage:Audio
    first_arg = t_node.template_parameters.get(1, "")
    filename = clean_node(wxr, None, first_arg)
    if not filename.strip():
        return None

    sound = Sound()
    set_sound_file_url_fields(wxr, filename, sound)

    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_node = wxr.wtp.parse(template_wikitext, expand_all=True)
    for link_node in expanded_node.find_child(NodeKind.LINK):
        link_text = clean_node(wxr, None, link_node)
        _, open_paren, after_paren = link_text.partition("(")
        if open_paren:
            sound.raw_tags.append(after_paren.strip(")"))

    # Called for its effect on `sound`; the returned text is not used.
    clean_node(wxr, sound, expanded_node)
    return sound
1 change: 1 addition & 0 deletions src/wiktextract/extractor/de/tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
# "das": "",
"Dativ": "dative",
# "DDR": "",
"Deutschland": "Germany",
# "der": "",
"dichter.": "poetic",
# "die": "",
Expand Down
2 changes: 1 addition & 1 deletion tests/test_de_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ def test_tag_list(self):
{
"examples": [
{
"raw_tags": ["Deutschland"],
"tags": ["Germany"],
"text": "„Den ganzen ‚Feber‘ hörte man lapidar",
}
],
Expand Down
3 changes: 1 addition & 2 deletions tests/test_de_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,14 +230,13 @@ def test_italit_node_multiple_raw_tags(self):
[
{
"raw_tags": [
"Deutschland",
"Fernsehen",
"Kurzwort",
"Akronym",
],
"glosses": ["für das erste Fernsehprogramm der ARD"],
"sense_index": "2",
"tags": ["colloquial"],
"tags": ["Germany", "colloquial"],
},
],
)
Expand Down
Loading