class Sound(BaseModelWrap):
    # One pronunciation record of a German Wiktionary entry.  Depending on
    # the template it was built from, a record carries an IPA string, an
    # audio file, or a rhyme; all other fields stay at their defaults.

    # IPA transcription (first argument of the "Lautschrift" template).
    ipa: str = Field("", description="International Phonetic Alphabet")

    # Audio file name, plus one URL field per audio format.
    audio: str = Field("", description="Audio file name")
    wav_url: str = ""
    ogg_url: str = ""
    mp3_url: str = ""
    oga_url: str = ""
    flac_url: str = ""

    # Labels as written on the page, and their translated counterparts.
    raw_tags: list[str] = Field(default=[])
    tags: list[str] = Field(default=[])

    # Rhyme (first argument of the "Reim" template).
    rhymes: str = Field(default="")

    # Scratch field: category links gathered while cleaning the templates.
    # ``exclude=True`` keeps it out of the serialized sound; the
    # pronunciation extractor copies its content to the word entry.
    categories: list[str] = Field([], exclude=True)
section_name == "Aussprache": - extract_pronunciation( + extract_pronunciation_section( wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node, diff --git a/src/wiktextract/extractor/de/pronunciation.py b/src/wiktextract/extractor/de/pronunciation.py index 1058af3f..db461eae 100644 --- a/src/wiktextract/extractor/de/pronunciation.py +++ b/src/wiktextract/extractor/de/pronunciation.py @@ -1,214 +1,86 @@ -from typing import Union - -from mediawiki_langcodes import code_to_name from wikitextprocessor.parser import LevelNode, NodeKind, TemplateNode, WikiNode from ...page import clean_node from ...wxr_context import WiktextractContext -from ..share import create_audio_url_dict +from ..share import set_sound_file_url_fields from .models import Sound, WordEntry +from .tags import translate_raw_tags -def extract_pronunciation( +def extract_pronunciation_section( wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode, -): - for list_node in level_node.find_child(NodeKind.LIST): - sound_data: list[Sound] = [Sound()] - - for not_list_item_node in list_node.invert_find_child( - NodeKind.LIST_ITEM - ): - wxr.wtp.debug( - f"Found unexpected non-list-item node in pronunciation " - f"section: {not_list_item_node}", - sortid="extractor/de/pronunciation/extract_pronunciation/28", - ) - - for list_item_node in list_node.find_child(NodeKind.LIST_ITEM): - children = list(list_item_node.filter_empty_str_child()) - if len(children) == 0: - continue - - head_template, rest = children[0], children[1:] - if ( - not isinstance(head_template, WikiNode) - or head_template.kind != NodeKind.TEMPLATE - or not rest - ): - wxr.wtp.debug( - f"Found unexpected non-template node in pronunciation " - f"section: {head_template}", - sortid="extractor/de/pronunciation/43", - ) - continue - if head_template.template_name == "IPA": - process_ipa(wxr, sound_data, rest) - elif head_template.template_name == "Hörbeispiele": - sound_data.append(Sound()) - process_hoerbeispiele(wxr, 
sound_data, rest) - elif head_template.template_name == "Reime": - process_rhymes(wxr, sound_data, rest, word_entry) - else: - wxr.wtp.debug( - "Unexpected template in pronunciation section: " - f"{head_template} with content {rest}", - sortid="extractor/de/pronunciation/58)", - ) - - # Remove empty entries - sound_data = [ - entry - for entry in sound_data - if entry.model_dump(exclude_defaults=True) != {} - ] - if len(sound_data) > 0: - word_entry.sounds.extend(sound_data) - - for non_list_node in level_node.invert_find_child(NodeKind.LIST): - wxr.wtp.debug( - "Unexpected non-list node in pronunciation section: " - f"{non_list_node}", - sortid="extractor/de/pronunciation/extract_pronunciation/64", - ) - - -def process_ipa( - wxr: WiktextractContext, - sound_data: list[Sound], - nodes: list[Union[WikiNode, str]], -): - for node in nodes: - if is_template_node_with_name(node, "Lautschrift"): - process_lautschrift_template(wxr, sound_data, node) - elif is_tag_node(node): - append_tag(wxr, sound_data[-1], node) - elif is_new_sound_data_entry_sep(node): - sound_data.append(Sound()) - else: - wxr.wtp.debug( - f"Found unexpected non-Lautschrift node in IPA section: {node}", - sortid="extractor/de/pronunciation/process_ipa/57", - ) - - -def process_lautschrift_template( - wxr: WiktextractContext, sound_data: list[Sound], node: WikiNode ) -> None: - template_parameters = node.template_parameters - - ipa = template_parameters.get(1, "") - - lang_code = template_parameters.get("spr") - if lang_code: - lang = code_to_name(lang_code, "de") - new_data = { - "lang_code": lang_code, - "lang": lang, - } - else: - new_data = dict() - - new_data["ipa"] = ipa - - add_sound_data_without_appending_to_existing_properties( - wxr, - sound_data, - new_data, - ) - - -def process_hoerbeispiele( - wxr: WiktextractContext, - sound_data: list[Sound], - nodes: list[Union[str, WikiNode]], -): - for node in nodes: - if is_template_node_with_name(node, "Audio"): - process_audio_template(wxr, 
sound_data, node) - elif is_tag_node(node): - append_tag(wxr, sound_data[-1], node) - elif is_new_sound_data_entry_sep(node): - sound_data.append(Sound()) - else: - wxr.wtp.debug( - f"Found unexpected node in Hoerbeispiele section: {node}", - sortid="extractor/de/pronunciation/process_hoerbeispiele/193", - ) - - -def process_audio_template( - wxr: WiktextractContext, sound_data: list[Sound], node: WikiNode -): - audio_file = node.template_parameters.get(1, "").strip() - if len(audio_file) > 0: - add_sound_data_without_appending_to_existing_properties( - wxr, sound_data, create_audio_url_dict(audio_file) - ) - - -def process_rhymes( - wxr: WiktextractContext, - sound_data: list[Sound], - nodes: list[WikiNode], - word_entry: WordEntry, -): - for node in nodes: - if isinstance(node, TemplateNode) and node.template_name == "Reim": - # https://de.wiktionary.org/wiki/Vorlage:Reime - rhyme = clean_node(wxr, word_entry, node) - if rhyme != "": - sound_data.append(Sound(rhymes=rhyme)) - - -def is_template_node_with_name(node: Union[WikiNode, str], template_name: str): - return ( - isinstance(node, WikiNode) - and node.kind == NodeKind.TEMPLATE - and node.template_name == template_name - ) - - -def add_sound_data_without_appending_to_existing_properties( - wxr: WiktextractContext, - sound_data: list[Sound], - new_sound_data: dict, -): - """Creates a new IPA data entry if properties exist in previous entry.""" - if any( - [ - key in sound_data[-1].model_dump(exclude_defaults=True) - for key in new_sound_data.keys() - ] + for list_node in level_node.find_child(NodeKind.LIST): + for list_item in list_node.find_child(NodeKind.LIST_ITEM): + for sound in extract_pron_list_item(wxr, list_item): + word_entry.sounds.append(sound) + word_entry.categories.extend(sound.categories) + + +def extract_pron_list_item( + wxr: WiktextractContext, list_item: WikiNode +) -> list[Sound]: + raw_tags = [] + sounds = [] + for node in list_item.find_child( + NodeKind.TEMPLATE | NodeKind.ITALIC | 
NodeKind.LIST ): - sound_data.append(Sound()) - - for key, value in new_sound_data.items(): - if key in sound_data[-1].model_fields: - if isinstance(value, str): - setattr(sound_data[-1], key, value) - else: - getattr(sound_data[-1], key).extend(value) - else: - wxr.wtp.debug( - f"Unexpected key {key} for Sound", - sortid="extractor/de/pronunciation/196", - ) - - -def is_tag_node(node: Union[WikiNode, str]): - return isinstance(node, WikiNode) and node.kind in [ - NodeKind.TEMPLATE, - NodeKind.ITALIC, - ] - - -def append_tag(wxr: WiktextractContext, sound_data: Sound, node: WikiNode): - tag = clean_node(wxr, None, node) - if tag != "": - sound_data.raw_tags.append(tag) - - -def is_new_sound_data_entry_sep(node: Union[WikiNode, str]): - return isinstance(node, str) and node.strip() in [",", ";"] + match node.kind: + case NodeKind.ITALIC: + node_text = clean_node(wxr, None, node) + if node_text.endswith(":"): + raw_tags.append(node_text.removesuffix(":")) + case NodeKind.LIST: + for next_list_item in node.find_child(NodeKind.LIST_ITEM): + sounds.extend(extract_pron_list_item(wxr, next_list_item)) + case NodeKind.TEMPLATE: + match node.template_name: + case "Lautschrift": + ipa = clean_node( + wxr, + None, + node.template_parameters.get(1, ""), + ) + if ipa != "": + sounds.append(Sound(ipa=ipa)) + clean_node(wxr, sounds[-1], node) + case "Audio": + new_sound = extract_audio_template(wxr, node) + if new_sound is not None: + sounds.append(new_sound) + case "Reim": + rhyme = clean_node( + wxr, + None, + node.template_parameters.get(1, ""), + ) + if rhyme != "": + sounds.append(Sound(rhymes=rhyme)) + clean_node(wxr, sounds[-1], node) + + for sound in sounds: + sound.raw_tags.extend(raw_tags) + translate_raw_tags(sound) + return sounds + + +def extract_audio_template( + wxr: WiktextractContext, t_node: TemplateNode +) -> Sound | None: + # https://de.wiktionary.org/wiki/Vorlage:Audio + filename = clean_node(wxr, None, t_node.template_parameters.get(1, "")) + if 
filename.strip() == "": + return None + sound = Sound() + set_sound_file_url_fields(wxr, filename, sound) + expanded_node = wxr.wtp.parse( + wxr.wtp.node_to_wikitext(t_node), expand_all=True + ) + for link_node in expanded_node.find_child(NodeKind.LINK): + link_str = clean_node(wxr, None, link_node) + if "(" in link_str: + sound.raw_tags.append(link_str[link_str.index("(") + 1:].strip(")")) + clean_node(wxr, sound, expanded_node) + return sound diff --git a/src/wiktextract/extractor/de/tags.py b/src/wiktextract/extractor/de/tags.py index e55fc783..c1488dd1 100644 --- a/src/wiktextract/extractor/de/tags.py +++ b/src/wiktextract/extractor/de/tags.py @@ -40,6 +40,7 @@ # "das": "", "Dativ": "dative", # "DDR": "", + "Deutschland": "Germany", # "der": "", "dichter.": "poetic", # "die": "", diff --git a/tests/test_de_example.py b/tests/test_de_example.py index bcab8f26..5e3d124b 100644 --- a/tests/test_de_example.py +++ b/tests/test_de_example.py @@ -260,7 +260,7 @@ def test_tag_list(self): { "examples": [ { - "raw_tags": ["Deutschland"], + "tags": ["Germany"], "text": "„Den ganzen ‚Feber‘ hörte man lapidar", } ], diff --git a/tests/test_de_gloss.py b/tests/test_de_gloss.py index f868f446..d9bebf69 100644 --- a/tests/test_de_gloss.py +++ b/tests/test_de_gloss.py @@ -230,14 +230,13 @@ def test_italit_node_multiple_raw_tags(self): [ { "raw_tags": [ - "Deutschland", "Fernsehen", "Kurzwort", "Akronym", ], "glosses": ["für das erste Fernsehprogramm der ARD"], "sense_index": "2", - "tags": ["colloquial"], + "tags": ["Germany", "colloquial"], }, ], ) diff --git a/tests/test_de_pronunciation.py b/tests/test_de_pronunciation.py index e20b6b90..956d2749 100644 --- a/tests/test_de_pronunciation.py +++ b/tests/test_de_pronunciation.py @@ -3,12 +3,7 @@ from wikitextprocessor import Wtp from wiktextract.config import WiktionaryConfig -from wiktextract.extractor.de.models import Sound -from wiktextract.extractor.de.pronunciation import ( - process_hoerbeispiele, - process_ipa, - 
process_lautschrift_template, -) +from wiktextract.extractor.de.page import parse_page from wiktextract.wxr_context import WiktextractContext @@ -17,173 +12,50 @@ class TestDEPronunciation(unittest.TestCase): def setUp(self) -> None: self.wxr = WiktextractContext( - Wtp(lang_code="de"), WiktionaryConfig(dump_file_lang_code="de") + Wtp(lang_code="de"), + WiktionaryConfig( + dump_file_lang_code="de", capture_language_codes=None + ), ) def tearDown(self) -> None: self.wxr.wtp.close_db_conn() - def test_de_process_ipa(self): - test_cases = [ - { - "input": "{{Lautschrift|ipa1}}", - "expected": [ - { - "ipa": "ipa1", - } - ], - }, - { - "input": "{{Lautschrift|ipa1|spr=de}}", - "expected": [ - { - "ipa": "ipa1", - "lang": "Deutsch", - "lang_code": "de", - } - ], - }, - { - "input": "{{Lautschrift|ipa1}} {{Lautschrift|ipa2}}{{Lautschrift|ipa3|spr=de}}", - "expected": [ - {"ipa": "ipa1"}, - {"ipa": "ipa2"}, - { - "ipa": "ipa3", - "lang": "Deutsch", - "lang_code": "de", - }, - ], - }, - { - "input": "{{Lautschrift|ipa1}}, ''tag1'' {{Lautschrift|ipa2}}", - "expected": [ - {"ipa": "ipa1"}, - {"ipa": "ipa2", "raw_tags": ["tag1"]}, - ], - }, - ] - - for case in test_cases: - with self.subTest(case=case): - self.wxr.wtp.start_page("") - self.wxr.wtp.add_page("Vorlage:IPA", 10, "") - self.wxr.wtp.add_page("Vorlage:Lautschrift", 10, "(Deutsch)") - - root = self.wxr.wtp.parse(case["input"]) - - sound_data = [Sound()] - - process_ipa( - self.wxr, sound_data, list(root.filter_empty_str_child()) - ) - - sounds = [ - s.model_dump(exclude_defaults=True) for s in sound_data - ] - self.assertEqual(sounds, case["expected"]) - - def test_de_process_hoerbeispiele(self): - # https://de.wiktionary.org/wiki/Beispiel - filename1 = "De-Beispiel.ogg" - # https://de.wiktionary.org/wiki/butineur - filename2 = "LL-Q150 (fra)-WikiLucas00-butineur.wav" - test_cases = [ - { - "input": "{{Audio|" + filename1 + "}}", - "expected": [ - { - "audio": filename1, - "mp3_url": None, # None = we don't care 
about exact val - "ogg_url": None, - } - ], - }, - { - "input": "{{Audio|" - + filename1 - + "}} {{Audio|" - + filename2 - + "}}", - "expected": [ - { - "audio": filename1, - "mp3_url": None, - "ogg_url": None, - }, - { - "audio": filename2, - "ogg_url": None, - "mp3_url": None, - "wav_url": None, - }, - ], - }, - { - "input": "{{Audio|" - + filename1 - + "}} ''tag1'', ''tag2'' {{Audio|" - + filename2 - + "}}", - "expected": [ - { - "audio": filename1, - "mp3_url": None, - "ogg_url": None, - "raw_tags": ["tag1"], - }, - { - "audio": filename2, - "mp3_url": None, - "ogg_url": None, - "wav_url": None, - "raw_tags": ["tag2"], - }, - ], - }, - ] - - for case in test_cases: - with self.subTest(case=case): - self.wxr.wtp.start_page("") - self.wxr.wtp.add_page("Vorlage:IPA", 10, "") - self.wxr.wtp.add_page("Vorlage:Audio", 10, "") - - root = self.wxr.wtp.parse(case["input"]) - - sound_data = [Sound()] - - process_hoerbeispiele( - self.wxr, sound_data, list(root.filter_empty_str_child()) - ) - - sounds = [ - s.model_dump(exclude_defaults=True) for s in sound_data - ] - self.assertSoundDataMatchesExpected(sounds, case["expected"]) - - def assertSoundDataMatchesExpected(self, sound_data, expected): + def test_normal_page(self): + self.wxr.wtp.add_page( + "Vorlage:Audio", + 10, + """[[Datei:Loudspeaker.svg|15px|Lautsprecherbild|link=]] [[Media:De-at-Hund.ogg|Hund (Österreich)]] ([[:Datei:De-at-Hund.ogg|Info]])[[Kategorie:Wiktionary:Audio-Datei]]""", + ) + data = parse_page( + self.wxr, + "Hund", + """== Hund ({{Sprache|Deutsch}}) == +=== {{Wortart|Substantiv|Deutsch}}, {{m}} === +==== Aussprache ==== +:{{IPA}} {{Lautschrift|hʊnt}} +:{{Hörbeispiele}} {{Audio|De-at-Hund.ogg|spr=at}} +:{{Reime}} {{Reim|ʊnt|Deutsch}} +==== Bedeutungen ==== +:[1] [[Haustier]]""", + ) + self.assertEqual(data[0]["sounds"][0], {"ipa": "hʊnt"}) + self.assertEqual(data[0]["sounds"][1]["audio"], "De-at-Hund.ogg") + self.assertEqual(data[0]["sounds"][1]["tags"], ["Austrian German"]) + 
self.assertEqual(data[0]["sounds"][2], {"rhymes": "ʊnt"}) + + def test_nested_lists(self): + data = parse_page( + self.wxr, + "Garage", + """== Garage ({{Sprache|Deutsch}}) == +=== {{Wortart|Substantiv|Deutsch}}, {{f}} === +==== Aussprache ==== +:{{IPA}} +::''[[Deutschland]]:'' {{Lautschrift|ɡaˈʁaːʒə}} +==== Bedeutungen ==== +:[1] [[Raum]]""", + ) self.assertEqual( - len(sound_data), - len(expected), - f"Mismatch in number of sound data entries{sound_data}", + data[0]["sounds"][0], {"ipa": "ɡaˈʁaːʒə", "tags": ["Germany"]} ) - - for data, exp in zip(sound_data, expected): - for key, value in exp.items(): - if value is None: - self.assertIn(key, data) - else: - self.assertEqual(data[key], value) - - for key in data: - self.assertIn(key, exp) - if exp[key] is not None: - self.assertEqual(data[key], exp[key]) - - def test_empty_ipa_in_lautschrift(self): - self.wxr.wtp.start_page("BU") - root = self.wxr.wtp.parse("{{Lautschrift}}") - sound_data = [Sound()] - process_lautschrift_template(self.wxr, sound_data, root.children[0]) - self.assertEqual(sound_data[0].model_dump(exclude_defaults=True), {})