diff --git a/src/wiktextract/extractor/de/models.py b/src/wiktextract/extractor/de/models.py
index a2ec3d27..ded7a29f 100644
--- a/src/wiktextract/extractor/de/models.py
+++ b/src/wiktextract/extractor/de/models.py
@@ -105,28 +105,16 @@ class Sense(BaseModelWrap):
class Sound(BaseModelWrap):
ipa: str = Field(default="", description="International Phonetic Alphabet")
- # phonetic_transcription: list[str] = Field(
- # default=[], description="Phonetic transcription, less exact than IPA."
- # )
audio: str = Field(default="", description="Audio file name")
wav_url: str = Field(default="")
ogg_url: str = Field(default="")
mp3_url: str = Field(default="")
oga_url: str = Field(default="")
flac_url: str = Field(default="")
- lang_code: str = Field(default="", description="Wiktionary language code")
- lang: str = Field(default="", description="Localized language name")
- # roman: list[str] = Field(
- # default=[], description="Translitaration to Roman characters"
- # )
- # syllabic: list[str] = Field(
- # default=[], description="Syllabic transcription"
- # )
- raw_tags: list[str] = Field(
- default=[], description="Specifying the variant of the pronunciation"
- )
+ raw_tags: list[str] = []
tags: list[str] = []
rhymes: str = ""
+ categories: list[str] = Field(default=[], exclude=True)
class Form(BaseModelWrap):
diff --git a/src/wiktextract/extractor/de/page.py b/src/wiktextract/extractor/de/page.py
index 28772313..784f01dc 100644
--- a/src/wiktextract/extractor/de/page.py
+++ b/src/wiktextract/extractor/de/page.py
@@ -13,7 +13,7 @@
from .inflection import extract_inf_table_template
from .linkage import extract_linkages
from .models import Sense, WordEntry
-from .pronunciation import extract_pronunciation
+from .pronunciation import extract_pronunciation_section
from .section_titles import FORM_TITLES, LINKAGE_TITLES, POS_SECTIONS
from .translation import extract_translation
@@ -43,7 +43,7 @@ def parse_section(
level_node,
)
elif wxr.config.capture_pronunciation and section_name == "Aussprache":
- extract_pronunciation(
+ extract_pronunciation_section(
wxr,
page_data[-1] if len(page_data) > 0 else base_data,
level_node,
diff --git a/src/wiktextract/extractor/de/pronunciation.py b/src/wiktextract/extractor/de/pronunciation.py
index 1058af3f..db461eae 100644
--- a/src/wiktextract/extractor/de/pronunciation.py
+++ b/src/wiktextract/extractor/de/pronunciation.py
@@ -1,214 +1,86 @@
-from typing import Union
-
-from mediawiki_langcodes import code_to_name
from wikitextprocessor.parser import LevelNode, NodeKind, TemplateNode, WikiNode
from ...page import clean_node
from ...wxr_context import WiktextractContext
-from ..share import create_audio_url_dict
+from ..share import set_sound_file_url_fields
from .models import Sound, WordEntry
+from .tags import translate_raw_tags
-def extract_pronunciation(
+def extract_pronunciation_section(
wxr: WiktextractContext,
word_entry: WordEntry,
level_node: LevelNode,
-):
- for list_node in level_node.find_child(NodeKind.LIST):
- sound_data: list[Sound] = [Sound()]
-
- for not_list_item_node in list_node.invert_find_child(
- NodeKind.LIST_ITEM
- ):
- wxr.wtp.debug(
- f"Found unexpected non-list-item node in pronunciation "
- f"section: {not_list_item_node}",
- sortid="extractor/de/pronunciation/extract_pronunciation/28",
- )
-
- for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
- children = list(list_item_node.filter_empty_str_child())
- if len(children) == 0:
- continue
-
- head_template, rest = children[0], children[1:]
- if (
- not isinstance(head_template, WikiNode)
- or head_template.kind != NodeKind.TEMPLATE
- or not rest
- ):
- wxr.wtp.debug(
- f"Found unexpected non-template node in pronunciation "
- f"section: {head_template}",
- sortid="extractor/de/pronunciation/43",
- )
- continue
- if head_template.template_name == "IPA":
- process_ipa(wxr, sound_data, rest)
- elif head_template.template_name == "Hörbeispiele":
- sound_data.append(Sound())
- process_hoerbeispiele(wxr, sound_data, rest)
- elif head_template.template_name == "Reime":
- process_rhymes(wxr, sound_data, rest, word_entry)
- else:
- wxr.wtp.debug(
- "Unexpected template in pronunciation section: "
- f"{head_template} with content {rest}",
- sortid="extractor/de/pronunciation/58)",
- )
-
- # Remove empty entries
- sound_data = [
- entry
- for entry in sound_data
- if entry.model_dump(exclude_defaults=True) != {}
- ]
- if len(sound_data) > 0:
- word_entry.sounds.extend(sound_data)
-
- for non_list_node in level_node.invert_find_child(NodeKind.LIST):
- wxr.wtp.debug(
- "Unexpected non-list node in pronunciation section: "
- f"{non_list_node}",
- sortid="extractor/de/pronunciation/extract_pronunciation/64",
- )
-
-
-def process_ipa(
- wxr: WiktextractContext,
- sound_data: list[Sound],
- nodes: list[Union[WikiNode, str]],
-):
- for node in nodes:
- if is_template_node_with_name(node, "Lautschrift"):
- process_lautschrift_template(wxr, sound_data, node)
- elif is_tag_node(node):
- append_tag(wxr, sound_data[-1], node)
- elif is_new_sound_data_entry_sep(node):
- sound_data.append(Sound())
- else:
- wxr.wtp.debug(
- f"Found unexpected non-Lautschrift node in IPA section: {node}",
- sortid="extractor/de/pronunciation/process_ipa/57",
- )
-
-
-def process_lautschrift_template(
- wxr: WiktextractContext, sound_data: list[Sound], node: WikiNode
) -> None:
- template_parameters = node.template_parameters
-
- ipa = template_parameters.get(1, "")
-
- lang_code = template_parameters.get("spr")
- if lang_code:
- lang = code_to_name(lang_code, "de")
- new_data = {
- "lang_code": lang_code,
- "lang": lang,
- }
- else:
- new_data = dict()
-
- new_data["ipa"] = ipa
-
- add_sound_data_without_appending_to_existing_properties(
- wxr,
- sound_data,
- new_data,
- )
-
-
-def process_hoerbeispiele(
- wxr: WiktextractContext,
- sound_data: list[Sound],
- nodes: list[Union[str, WikiNode]],
-):
- for node in nodes:
- if is_template_node_with_name(node, "Audio"):
- process_audio_template(wxr, sound_data, node)
- elif is_tag_node(node):
- append_tag(wxr, sound_data[-1], node)
- elif is_new_sound_data_entry_sep(node):
- sound_data.append(Sound())
- else:
- wxr.wtp.debug(
- f"Found unexpected node in Hoerbeispiele section: {node}",
- sortid="extractor/de/pronunciation/process_hoerbeispiele/193",
- )
-
-
-def process_audio_template(
- wxr: WiktextractContext, sound_data: list[Sound], node: WikiNode
-):
- audio_file = node.template_parameters.get(1, "").strip()
- if len(audio_file) > 0:
- add_sound_data_without_appending_to_existing_properties(
- wxr, sound_data, create_audio_url_dict(audio_file)
- )
-
-
-def process_rhymes(
- wxr: WiktextractContext,
- sound_data: list[Sound],
- nodes: list[WikiNode],
- word_entry: WordEntry,
-):
- for node in nodes:
- if isinstance(node, TemplateNode) and node.template_name == "Reim":
- # https://de.wiktionary.org/wiki/Vorlage:Reime
- rhyme = clean_node(wxr, word_entry, node)
- if rhyme != "":
- sound_data.append(Sound(rhymes=rhyme))
-
-
-def is_template_node_with_name(node: Union[WikiNode, str], template_name: str):
- return (
- isinstance(node, WikiNode)
- and node.kind == NodeKind.TEMPLATE
- and node.template_name == template_name
- )
-
-
-def add_sound_data_without_appending_to_existing_properties(
- wxr: WiktextractContext,
- sound_data: list[Sound],
- new_sound_data: dict,
-):
- """Creates a new IPA data entry if properties exist in previous entry."""
- if any(
- [
- key in sound_data[-1].model_dump(exclude_defaults=True)
- for key in new_sound_data.keys()
- ]
+ for list_node in level_node.find_child(NodeKind.LIST):
+ for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+ for sound in extract_pron_list_item(wxr, list_item):
+ word_entry.sounds.append(sound)
+ word_entry.categories.extend(sound.categories)
+
+
+def extract_pron_list_item(
+ wxr: WiktextractContext, list_item: WikiNode
+) -> list[Sound]:
+ raw_tags = []
+ sounds = []
+ for node in list_item.find_child(
+ NodeKind.TEMPLATE | NodeKind.ITALIC | NodeKind.LIST
):
- sound_data.append(Sound())
-
- for key, value in new_sound_data.items():
- if key in sound_data[-1].model_fields:
- if isinstance(value, str):
- setattr(sound_data[-1], key, value)
- else:
- getattr(sound_data[-1], key).extend(value)
- else:
- wxr.wtp.debug(
- f"Unexpected key {key} for Sound",
- sortid="extractor/de/pronunciation/196",
- )
-
-
-def is_tag_node(node: Union[WikiNode, str]):
- return isinstance(node, WikiNode) and node.kind in [
- NodeKind.TEMPLATE,
- NodeKind.ITALIC,
- ]
-
-
-def append_tag(wxr: WiktextractContext, sound_data: Sound, node: WikiNode):
- tag = clean_node(wxr, None, node)
- if tag != "":
- sound_data.raw_tags.append(tag)
-
-
-def is_new_sound_data_entry_sep(node: Union[WikiNode, str]):
- return isinstance(node, str) and node.strip() in [",", ";"]
+ match node.kind:
+ case NodeKind.ITALIC:
+ node_text = clean_node(wxr, None, node)
+ if node_text.endswith(":"):
+ raw_tags.append(node_text.removesuffix(":"))
+ case NodeKind.LIST:
+ for next_list_item in node.find_child(NodeKind.LIST_ITEM):
+ sounds.extend(extract_pron_list_item(wxr, next_list_item))
+ case NodeKind.TEMPLATE:
+ match node.template_name:
+ case "Lautschrift":
+ ipa = clean_node(
+ wxr,
+ None,
+ node.template_parameters.get(1, ""),
+ )
+ if ipa != "":
+ sounds.append(Sound(ipa=ipa))
+ clean_node(wxr, sounds[-1], node)
+ case "Audio":
+ new_sound = extract_audio_template(wxr, node)
+ if new_sound is not None:
+ sounds.append(new_sound)
+ case "Reim":
+ rhyme = clean_node(
+ wxr,
+ None,
+ node.template_parameters.get(1, ""),
+ )
+ if rhyme != "":
+ sounds.append(Sound(rhymes=rhyme))
+ clean_node(wxr, sounds[-1], node)
+
+ for sound in sounds:
+ sound.raw_tags.extend(raw_tags)
+ translate_raw_tags(sound)
+ return sounds
+
+
+def extract_audio_template(
+ wxr: WiktextractContext, t_node: TemplateNode
+) -> Sound | None:
+ # https://de.wiktionary.org/wiki/Vorlage:Audio
+ filename = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
+ if filename.strip() == "":
+ return None
+ sound = Sound()
+ set_sound_file_url_fields(wxr, filename, sound)
+ expanded_node = wxr.wtp.parse(
+ wxr.wtp.node_to_wikitext(t_node), expand_all=True
+ )
+ for link_node in expanded_node.find_child(NodeKind.LINK):
+ link_str = clean_node(wxr, None, link_node)
+ if "(" in link_str:
+ sound.raw_tags.append(link_str[link_str.index("(") + 1:].strip(")"))
+ clean_node(wxr, sound, expanded_node)
+ return sound
diff --git a/src/wiktextract/extractor/de/tags.py b/src/wiktextract/extractor/de/tags.py
index e55fc783..c1488dd1 100644
--- a/src/wiktextract/extractor/de/tags.py
+++ b/src/wiktextract/extractor/de/tags.py
@@ -40,6 +40,7 @@
# "das": "",
"Dativ": "dative",
# "DDR": "",
+ "Deutschland": "Germany",
# "der": "",
"dichter.": "poetic",
# "die": "",
diff --git a/tests/test_de_example.py b/tests/test_de_example.py
index bcab8f26..5e3d124b 100644
--- a/tests/test_de_example.py
+++ b/tests/test_de_example.py
@@ -260,7 +260,7 @@ def test_tag_list(self):
{
"examples": [
{
- "raw_tags": ["Deutschland"],
+ "tags": ["Germany"],
"text": "„Den ganzen ‚Feber‘ hörte man lapidar",
}
],
diff --git a/tests/test_de_gloss.py b/tests/test_de_gloss.py
index f868f446..d9bebf69 100644
--- a/tests/test_de_gloss.py
+++ b/tests/test_de_gloss.py
@@ -230,14 +230,13 @@ def test_italit_node_multiple_raw_tags(self):
[
{
"raw_tags": [
- "Deutschland",
"Fernsehen",
"Kurzwort",
"Akronym",
],
"glosses": ["für das erste Fernsehprogramm der ARD"],
"sense_index": "2",
- "tags": ["colloquial"],
+ "tags": ["Germany", "colloquial"],
},
],
)
diff --git a/tests/test_de_pronunciation.py b/tests/test_de_pronunciation.py
index e20b6b90..956d2749 100644
--- a/tests/test_de_pronunciation.py
+++ b/tests/test_de_pronunciation.py
@@ -3,12 +3,7 @@
from wikitextprocessor import Wtp
from wiktextract.config import WiktionaryConfig
-from wiktextract.extractor.de.models import Sound
-from wiktextract.extractor.de.pronunciation import (
- process_hoerbeispiele,
- process_ipa,
- process_lautschrift_template,
-)
+from wiktextract.extractor.de.page import parse_page
from wiktextract.wxr_context import WiktextractContext
@@ -17,173 +12,50 @@ class TestDEPronunciation(unittest.TestCase):
def setUp(self) -> None:
self.wxr = WiktextractContext(
- Wtp(lang_code="de"), WiktionaryConfig(dump_file_lang_code="de")
+ Wtp(lang_code="de"),
+ WiktionaryConfig(
+ dump_file_lang_code="de", capture_language_codes=None
+ ),
)
def tearDown(self) -> None:
self.wxr.wtp.close_db_conn()
- def test_de_process_ipa(self):
- test_cases = [
- {
- "input": "{{Lautschrift|ipa1}}",
- "expected": [
- {
- "ipa": "ipa1",
- }
- ],
- },
- {
- "input": "{{Lautschrift|ipa1|spr=de}}",
- "expected": [
- {
- "ipa": "ipa1",
- "lang": "Deutsch",
- "lang_code": "de",
- }
- ],
- },
- {
- "input": "{{Lautschrift|ipa1}} {{Lautschrift|ipa2}}{{Lautschrift|ipa3|spr=de}}",
- "expected": [
- {"ipa": "ipa1"},
- {"ipa": "ipa2"},
- {
- "ipa": "ipa3",
- "lang": "Deutsch",
- "lang_code": "de",
- },
- ],
- },
- {
- "input": "{{Lautschrift|ipa1}}, ''tag1'' {{Lautschrift|ipa2}}",
- "expected": [
- {"ipa": "ipa1"},
- {"ipa": "ipa2", "raw_tags": ["tag1"]},
- ],
- },
- ]
-
- for case in test_cases:
- with self.subTest(case=case):
- self.wxr.wtp.start_page("")
- self.wxr.wtp.add_page("Vorlage:IPA", 10, "")
- self.wxr.wtp.add_page("Vorlage:Lautschrift", 10, "(Deutsch)")
-
- root = self.wxr.wtp.parse(case["input"])
-
- sound_data = [Sound()]
-
- process_ipa(
- self.wxr, sound_data, list(root.filter_empty_str_child())
- )
-
- sounds = [
- s.model_dump(exclude_defaults=True) for s in sound_data
- ]
- self.assertEqual(sounds, case["expected"])
-
- def test_de_process_hoerbeispiele(self):
- # https://de.wiktionary.org/wiki/Beispiel
- filename1 = "De-Beispiel.ogg"
- # https://de.wiktionary.org/wiki/butineur
- filename2 = "LL-Q150 (fra)-WikiLucas00-butineur.wav"
- test_cases = [
- {
- "input": "{{Audio|" + filename1 + "}}",
- "expected": [
- {
- "audio": filename1,
- "mp3_url": None, # None = we don't care about exact val
- "ogg_url": None,
- }
- ],
- },
- {
- "input": "{{Audio|"
- + filename1
- + "}} {{Audio|"
- + filename2
- + "}}",
- "expected": [
- {
- "audio": filename1,
- "mp3_url": None,
- "ogg_url": None,
- },
- {
- "audio": filename2,
- "ogg_url": None,
- "mp3_url": None,
- "wav_url": None,
- },
- ],
- },
- {
- "input": "{{Audio|"
- + filename1
- + "}} ''tag1'', ''tag2'' {{Audio|"
- + filename2
- + "}}",
- "expected": [
- {
- "audio": filename1,
- "mp3_url": None,
- "ogg_url": None,
- "raw_tags": ["tag1"],
- },
- {
- "audio": filename2,
- "mp3_url": None,
- "ogg_url": None,
- "wav_url": None,
- "raw_tags": ["tag2"],
- },
- ],
- },
- ]
-
- for case in test_cases:
- with self.subTest(case=case):
- self.wxr.wtp.start_page("")
- self.wxr.wtp.add_page("Vorlage:IPA", 10, "")
- self.wxr.wtp.add_page("Vorlage:Audio", 10, "")
-
- root = self.wxr.wtp.parse(case["input"])
-
- sound_data = [Sound()]
-
- process_hoerbeispiele(
- self.wxr, sound_data, list(root.filter_empty_str_child())
- )
-
- sounds = [
- s.model_dump(exclude_defaults=True) for s in sound_data
- ]
- self.assertSoundDataMatchesExpected(sounds, case["expected"])
-
- def assertSoundDataMatchesExpected(self, sound_data, expected):
+ def test_normal_page(self):
+ self.wxr.wtp.add_page(
+ "Vorlage:Audio",
+ 10,
+ """[[Datei:Loudspeaker.svg|15px|Lautsprecherbild|link=]] [[Media:De-at-Hund.ogg|Hund (Österreich)]] ([[:Datei:De-at-Hund.ogg|Info]])[[Kategorie:Wiktionary:Audio-Datei]]""",
+ )
+ data = parse_page(
+ self.wxr,
+ "Hund",
+ """== Hund ({{Sprache|Deutsch}}) ==
+=== {{Wortart|Substantiv|Deutsch}}, {{m}} ===
+==== Aussprache ====
+:{{IPA}} {{Lautschrift|hʊnt}}
+:{{Hörbeispiele}} {{Audio|De-at-Hund.ogg|spr=at}}
+:{{Reime}} {{Reim|ʊnt|Deutsch}}
+==== Bedeutungen ====
+:[1] [[Haustier]]""",
+ )
+ self.assertEqual(data[0]["sounds"][0], {"ipa": "hʊnt"})
+ self.assertEqual(data[0]["sounds"][1]["audio"], "De-at-Hund.ogg")
+ self.assertEqual(data[0]["sounds"][1]["tags"], ["Austrian German"])
+ self.assertEqual(data[0]["sounds"][2], {"rhymes": "ʊnt"})
+
+ def test_nested_lists(self):
+ data = parse_page(
+ self.wxr,
+ "Garage",
+ """== Garage ({{Sprache|Deutsch}}) ==
+=== {{Wortart|Substantiv|Deutsch}}, {{f}} ===
+==== Aussprache ====
+:{{IPA}}
+::''[[Deutschland]]:'' {{Lautschrift|ɡaˈʁaːʒə}}
+==== Bedeutungen ====
+:[1] [[Raum]]""",
+ )
self.assertEqual(
- len(sound_data),
- len(expected),
- f"Mismatch in number of sound data entries{sound_data}",
+ data[0]["sounds"][0], {"ipa": "ɡaˈʁaːʒə", "tags": ["Germany"]}
)
-
- for data, exp in zip(sound_data, expected):
- for key, value in exp.items():
- if value is None:
- self.assertIn(key, data)
- else:
- self.assertEqual(data[key], value)
-
- for key in data:
- self.assertIn(key, exp)
- if exp[key] is not None:
- self.assertEqual(data[key], exp[key])
-
- def test_empty_ipa_in_lautschrift(self):
- self.wxr.wtp.start_page("BU")
- root = self.wxr.wtp.parse("{{Lautschrift}}")
- sound_data = [Sound()]
- process_lautschrift_template(self.wxr, sound_data, root.children[0])
- self.assertEqual(sound_data[0].model_dump(exclude_defaults=True), {})