diff --git a/src/wiktextract/extractor/de/example.py b/src/wiktextract/extractor/de/example.py
index 155387448..8a3a97f62 100644
--- a/src/wiktextract/extractor/de/example.py
+++ b/src/wiktextract/extractor/de/example.py
@@ -3,6 +3,7 @@
 
 
 from wikitextprocessor import NodeKind, WikiNode
+from wikitextprocessor.parser import LevelNode
 from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid
 
 from wiktextract.page import clean_node
@@ -12,42 +13,43 @@
 def extract_examples(
     wxr: WiktextractContext,
     page_data: List[Dict],
-    list_node: WikiNode,
+    level_node: LevelNode,
 ) -> None:
-    for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
-        example_data = defaultdict(str)
-
-        ref_nodes = find_and_remove_child(
-            list_item_node,
-            NodeKind.HTML,
-            lambda html_node: html_node.tag == "ref",
+    for list_node in level_node.find_child(NodeKind.LIST):
+        for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
+            example_data = defaultdict(str)
+
+            ref_nodes = find_and_remove_child(
+                list_item_node,
+                NodeKind.HTML,
+                lambda html_node: html_node.tag == "ref",
+            )
+            for ref_node in ref_nodes:
+                extract_reference(wxr, example_data, ref_node)
+
+            example_text = clean_node(wxr, {}, list_item_node.children)
+
+            senseid, example_text = match_senseid(example_text)
+
+            if example_text:
+                example_data["text"] = example_text
+
+            if senseid:
+                for sense in page_data[-1]["senses"]:
+                    if sense["senseid"] == senseid:
+                        sense["examples"].append(example_data)
+
+            else:
+                if example_data:
+                    wxr.wtp.debug(
+                        f"Found example data without senseid and text: {example_data}",
+                        sortid="extractor/de/examples/extract_examples/28",
+                    )
+    for non_list_node in level_node.invert_find_child(NodeKind.LIST):
+        wxr.wtp.debug(
+            f"Found unexpected non-list node in example section: {non_list_node}",
+            sortid="extractor/de/examples/extract_examples/33",
         )
-        for ref_node in ref_nodes:
-            extract_reference(wxr, example_data, ref_node)
-
-        example_text = clean_node(wxr, {}, list_item_node.children)
-
-        senseid, example_text = match_senseid(example_text)
-
-        if example_text:
-            example_data["text"] = example_text
-
-        if senseid:
-            sense_data = [
-                sense
-                for sense in page_data[-1]["senses"]
-                if sense["senseid"] == senseid
-            ]
-
-            for sense in sense_data:
-                sense["examples"].append(example_data)
-
-        else:
-            if example_data:
-                wxr.wtp.debug(
-                    f"Found example data without senseid and text: {example_data}",
-                    sortid="extractor/de/examples/extract_examples/28",
-                )
 
 
 def extract_reference(
@@ -67,20 +69,14 @@ def extract_reference(
     elif len(template_nodes) == 1:
         template_node = template_nodes[0]
 
-        # Sometimes the title is dynamically generated from the template name,
-        # so we preset the title. If specified in the template, it will be
-        # overwritten.
-        reference_data["titel"] = template_node.largs[0][0].strip()
-
-        for arg in template_node.largs[1:]:
-            arg = clean_node(wxr, {}, arg)
-            if not arg.strip():
-                continue
-            splits = arg.split("=", 1)
-            if len(splits) != 2:
-                continue
-            arg_name, arg_value = arg.split("=", 1)
-            if arg_name.strip() and arg_value.strip():
-                reference_data[arg_name.lower()] = arg_value
+        # Most reference templates follow the Literatur template and use named
+        # parameters. We extract them here.
+        # https://de.wiktionary.org/wiki/Vorlage:Literatur
+        for key, value in template_node.template_parameters.items():
+            if isinstance(key, str):
+                reference_data[key.lower()] = clean_node(wxr, {}, value)
+
+        # XXX: Treat other templates as well.
+        # E.g. https://de.wiktionary.org/wiki/Vorlage:Ref-OWID
 
     example_data["ref"] = reference_data
diff --git a/src/wiktextract/extractor/de/gloss.py b/src/wiktextract/extractor/de/gloss.py
index 70b920703..6315259d2 100644
--- a/src/wiktextract/extractor/de/gloss.py
+++ b/src/wiktextract/extractor/de/gloss.py
@@ -3,6 +3,7 @@
 from typing import Dict, List
 
 from wikitextprocessor import NodeKind, WikiNode
+from wikitextprocessor.parser import LevelNode
 from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid
 
 from wiktextract.page import clean_node
@@ -10,6 +11,21 @@
 
 
 def extract_glosses(
+    wxr: WiktextractContext,
+    page_data: List[Dict],
+    level_node: LevelNode,
+) -> None:
+    for list_node in level_node.find_child(NodeKind.LIST):
+        process_gloss_list_item(wxr, page_data, list_node)
+
+    for non_list_node in level_node.invert_find_child(NodeKind.LIST):
+        wxr.wtp.debug(
+            f"Found unexpected non-list node in gloss section: {non_list_node}",
+            sortid="extractor/de/gloss/extract_glosses/22",
+        )
+
+
+def process_gloss_list_item(
     wxr: WiktextractContext,
     page_data: List[Dict],
     list_node: WikiNode,
@@ -54,7 +70,11 @@ def extract_glosses(
 
             senseid, gloss_text = match_senseid(gloss_text)
             if senseid:
-                senseid if senseid[0].isnumeric() else parent_senseid + senseid
+                senseid = (
+                    senseid
+                    if senseid[0].isnumeric()
+                    else parent_senseid + senseid
+                )
                 gloss_data["senseid"] = senseid
             else:
                 wxr.wtp.debug(
@@ -71,7 +91,7 @@ def extract_glosses(
                 page_data[-1]["senses"].append(gloss_data)
 
             for sub_list_node in sub_glosses_list_nodes:
-                extract_glosses(
+                process_gloss_list_item(
                     wxr,
                     page_data,
                     sub_list_node,
diff --git a/src/wiktextract/extractor/de/page.py b/src/wiktextract/extractor/de/page.py
index 0d2f1ca1a..9c0aab1a5 100644
--- a/src/wiktextract/extractor/de/page.py
+++ b/src/wiktextract/extractor/de/page.py
@@ -7,6 +7,7 @@
 from wikitextprocessor.parser import LevelNode
 
 from wiktextract.datautils import append_base_data
+from wiktextract.extractor.de.pronunciation import extract_pronunciation
 from wiktextract.wxr_context import WiktextractContext
 
 from .gloss import extract_glosses
@@ -71,9 +72,10 @@ def parse_section(
         wxr.wtp.start_subsection(section_name)
         if section_name == "Bedeutungen":
             extract_glosses(wxr, page_data, level_node)
+        if section_name == "Aussprache":
+            extract_pronunciation(wxr, page_data, level_node)
         if section_name == "Beispiele":
-            for list_node in level_node.find_child(NodeKind.LIST):
-                extract_examples(wxr, page_data, list_node)
+            extract_examples(wxr, page_data, level_node)
 
 
 FORM_POS = {
diff --git a/src/wiktextract/extractor/de/pronunciation.py b/src/wiktextract/extractor/de/pronunciation.py
new file mode 100644
index 000000000..2c7a2759f
--- /dev/null
+++ b/src/wiktextract/extractor/de/pronunciation.py
@@ -0,0 +1,205 @@
+from collections import defaultdict
+from typing import Dict, List, Union
+
+from wikitextprocessor import NodeKind, WikiNode
+from wikitextprocessor.parser import LevelNode
+from wiktextract.extractor.share import create_audio_url_dict
+
+from wiktextract.page import clean_node
+from wiktextract.wxr_context import WiktextractContext
+
+
+def extract_pronunciation(
+    wxr: WiktextractContext,
+    page_data: List[Dict],
+    level_node: LevelNode,
+):
+    """Collect sound data from the lists of an "Aussprache" section.
+
+    Each list item is expected to start with an IPA, Hörbeispiele or Reime
+    template; the non-empty entries gathered per list are appended to
+    page_data[-1]["sounds"]. Unexpected nodes are reported via debug messages.
+    """
+    for list_node in level_node.find_child(NodeKind.LIST):
+        sound_data = [defaultdict(list)]
+
+        for not_list_item_node in list_node.invert_find_child(
+            NodeKind.LIST_ITEM
+        ):
+            wxr.wtp.debug(
+                f"Found unexpected non-list-item node in pronunciation section: {not_list_item_node}",
+                sortid="extractor/de/pronunciation/extract_pronunciation/28",
+            )
+
+        for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
+            children = list(list_item_node.filter_empty_str_child())
+            if not children:
+                continue
+
+            head_template, rest = children[0], children[1:]
+            if (
+                not isinstance(head_template, WikiNode)
+                or head_template.kind != NodeKind.TEMPLATE
+                or not rest
+            ):
+                wxr.wtp.debug(
+                    f"Found unexpected non-template node in pronunciation section: {head_template}",
+                    sortid="extractor/de/pronunciation/extract_pronunciation/37",
+                )
+                continue
+            if head_template.template_name == "IPA":
+                process_ipa(wxr, sound_data, rest)
+            elif head_template.template_name == "Hörbeispiele":
+                sound_data.append(defaultdict(list))
+                process_hoerbeispiele(wxr, sound_data, rest)
+            elif head_template.template_name == "Reime":
+                process_rhymes(wxr, sound_data, rest)
+            else:
+                wxr.wtp.debug(
+                    f"Found unexpected template in pronunciation section: {head_template} with content {rest}",
+                    sortid="extractor/de/pronunciation/extract_pronunciation/45",
+                )
+
+        # Remove empty entries
+        sound_data = [entry for entry in sound_data if entry != {}]
+        if sound_data:
+            page_data[-1]["sounds"].extend(sound_data)
+
+    for non_list_node in level_node.invert_find_child(NodeKind.LIST):
+        wxr.wtp.debug(
+            f"Found unexpected non-list node in pronunciation section: {non_list_node}",
+            sortid="extractor/de/pronunciation/extract_pronunciation/64",
+        )
+
+
+def process_ipa(
+    wxr: WiktextractContext,
+    sound_data: List[Dict],
+    nodes: List[Union[WikiNode, str]],
+):
+    """Consume the nodes following an IPA template, one node per call.
+
+    Note that `nodes` is emptied in place while it is processed.
+    """
+    if not nodes:
+        return
+
+    head_node = nodes.pop(0)
+
+    if is_template_node_with_name(head_node, "Lautschrift"):
+        process_lautschrift_template(wxr, sound_data, head_node)
+    elif is_tag_node(head_node):
+        append_tag(wxr, sound_data, head_node)
+    elif is_new_sound_data_entry_sep(head_node):
+        sound_data.append(defaultdict(list))
+    else:
+        wxr.wtp.debug(
+            f"Found unexpected non-Lautschrift node in IPA section: {head_node}",
+            sortid="extractor/de/pronunciation/process_ipa/57",
+        )
+
+    if nodes:
+        process_ipa(wxr, sound_data, nodes)
+
+
+def process_lautschrift_template(
+    wxr: WiktextractContext, sound_data: List[Dict], node
+):
+    """Record the IPA of a Lautschrift template and its optional language."""
+    template_parameters = node.template_parameters
+
+    ipa = template_parameters.get(1)
+
+    lang_code = template_parameters.get("spr")
+    if lang_code:
+        new_sound_data = {"ipa": [ipa], "lang_code": lang_code}
+        # `spr` is free-form wikitext input: an unknown code must not abort
+        # the extraction with a KeyError, so the language name is optional.
+        if lang_code in wxr.wtp.LANGUAGES_BY_CODE:
+            new_sound_data["language"] = wxr.wtp.LANGUAGES_BY_CODE[lang_code]
+        else:
+            wxr.wtp.debug(
+                f"Found unknown language code in Lautschrift template: {lang_code}",
+                sortid="extractor/de/pronunciation/process_lautschrift_template/121",
+            )
+        add_sound_data_without_appending_to_existing_properties(
+            sound_data, new_sound_data
+        )
+    else:
+        sound_data[-1]["ipa"].append(ipa)
+
+
+def process_hoerbeispiele(
+    wxr: WiktextractContext, sound_data: List[Dict], nodes: List[WikiNode]
+):
+    """Handle the Audio templates, tags and separators after Hörbeispiele."""
+    for node in nodes:
+        if is_template_node_with_name(node, "Audio"):
+            process_audio_template(wxr, sound_data, node)
+        elif is_tag_node(node):
+            append_tag(wxr, sound_data, node)
+        elif is_new_sound_data_entry_sep(node):
+            sound_data.append(defaultdict(list))
+        else:
+            wxr.wtp.debug(
+                f"Found unexpected node in Hoerbeispiele section: {node}",
+                sortid="extractor/de/pronunciation/process_hoerbeispiele/193",
+            )
+
+
+def process_audio_template(
+    wxr: WiktextractContext, sound_data: List[Dict], node
+):
+    """Add the create_audio_url_dict() entry for an Audio template's file."""
+    audio_file = node.template_parameters.get(1)
+    if audio_file:
+        add_sound_data_without_appending_to_existing_properties(
+            sound_data, create_audio_url_dict(audio_file)
+        )
+
+
+def process_rhymes(
+    wxr: WiktextractContext, sound_data: List[Dict], nodes: List[WikiNode]
+):
+    # XXX: Extract rhymes from the referenced rhymes page
+    pass
+
+
+def is_template_node_with_name(node: Union[WikiNode, str], template_name: str):
+    return (
+        isinstance(node, WikiNode)
+        and node.kind == NodeKind.TEMPLATE
+        and node.template_name == template_name
+    )
+
+
+def add_sound_data_without_appending_to_existing_properties(
+    sound_data: List[Dict],
+    new_sound_data: Dict,
+):
+    """Creates a new IPA data entry if properties exist in previous entry."""
+    if any(key in sound_data[-1] for key in new_sound_data):
+        sound_data.append(defaultdict(list))
+
+    for key, value in new_sound_data.items():
+        if isinstance(value, str):
+            sound_data[-1][key] = value
+        else:
+            sound_data[-1][key].extend(value)
+
+
+def is_tag_node(node: Union[WikiNode, str]):
+    return isinstance(node, WikiNode) and node.kind in [
+        NodeKind.TEMPLATE,
+        NodeKind.ITALIC,
+    ]
+
+
+def append_tag(wxr: WiktextractContext, sound_data: List[Dict], node: WikiNode):
+    tag = clean_node(wxr, {}, node).strip()
+    if tag:
+        sound_data[-1]["tags"].append(tag)
+
+
+def is_new_sound_data_entry_sep(node: Union[WikiNode, str]):
+    return isinstance(node, str) and node.strip() in [",", ";"]
diff --git a/tests/test_de_example.py b/tests/test_de_example.py
index dd9a3086f..980a0be6c 100644
--- a/tests/test_de_example.py
+++ b/tests/test_de_example.py
@@ -36,7 +36,7 @@ def test_de_extract_examples(self):
             defaultdict(list, {"senseid": "2"}),
         ]
 
-        extract_examples(self.wxr, page_data, root.children[0])
+        extract_examples(self.wxr, page_data, root)
 
         self.assertEqual(
             page_data,
@@ -68,7 +68,7 @@ def test_de_extract_example_with_reference(self):
             defaultdict(list, {"senseid": "1"}),
         ]
 
-        extract_examples(self.wxr, page_data, root.children[0])
+        extract_examples(self.wxr, page_data, root)
 
         self.assertEqual(
             page_data,
@@ -89,10 +89,42 @@ def test_de_extract_example_with_reference(self):
             ],
         )
 
-    def test_de_extract_reference(self):
-        self.wxr.wtp.start_page("")
+    def test_de_extract_reference_from_literatur_template(self):
+        # https://de.wiktionary.org/wiki/Beispiel
+        self.wxr.wtp.start_page("Beispiel")
         self.wxr.wtp.add_page("Vorlage:Literatur", 10, "Expanded template")
-        root = self.wxr.wtp.parse("{{Literatur|Titel=title}}")
+        root = self.wxr.wtp.parse(
+            "{{Literatur|Autor=Steffen Möller|Titel=Viva Warszawa|TitelErg=Polen für Fortgeschrittene|Verlag=Piper|Ort=München/Berlin|Jahr=2015}}, Seite 273. ISBN 978-3-89029-459-9."
+        )
+
+        example_data = defaultdict(str)
+
+        extract_reference(self.wxr, example_data, root.children[0])
+
+        self.assertEqual(
+            example_data,
+            {
+                "ref": {
+                    "raw_ref": "Expanded template, Seite 273. ISBN 978-3-89029-459-9.",
+                    "titel": "Viva Warszawa",
+                    "autor": "Steffen Möller",
+                    "titelerg": "Polen für Fortgeschrittene",
+                    "verlag": "Piper",
+                    "ort": "München/Berlin",
+                    "jahr": "2015",
+                }
+            },
+        )
+
+    def test_de_extract_reference_from_templates_without_named_args(self):
+        # https://de.wiktionary.org/wiki/Beispiel
+        # Reference templates not following the Literatur template pattern are
+        # currently not extracted field by field (e.g. Vorlage:Ref-OWID)
+        self.wxr.wtp.start_page("Beispiel")
+        self.wxr.wtp.add_page("Vorlage:Ref-OWID", 10, "Expanded template")
+        root = self.wxr.wtp.parse(
+            "{{Ref-OWID|Sprichwörter|401781|Schlechte Beispiele verderben gute Sitten.}}"
+        )
 
         example_data = defaultdict(str)
 
@@ -100,5 +132,9 @@ def test_de_extract_reference(self):
 
         self.assertEqual(
             example_data,
-            {"ref": {"raw_ref": "Expanded template", "titel": "title"}},
+            {
+                "ref": {
+                    "raw_ref": "Expanded template",
+                }
+            },
         )
diff --git a/tests/test_de_gloss.py b/tests/test_de_gloss.py
index b44eb6001..e43cd4d67 100644
--- a/tests/test_de_gloss.py
+++ b/tests/test_de_gloss.py
@@ -33,7 +33,7 @@ def test_de_extract_glosses(self):
 
         page_data = [defaultdict(list)]
 
-        extract_glosses(self.wxr, page_data, root.children[0])
+        extract_glosses(self.wxr, page_data, root)
 
         self.assertEqual(
             page_data,
@@ -63,7 +63,7 @@ def test_de_extract_glosses_with_subglosses(self):
 
         page_data = [defaultdict(list)]
 
-        extract_glosses(self.wxr, page_data, root.children[0])
+        extract_glosses(self.wxr, page_data, root)
 
         self.assertEqual(
             page_data,
@@ -99,7 +99,7 @@ def test_de_extract_glosses_with_only_subglosses(self):
 
         page_data = [defaultdict(list)]
 
-        extract_glosses(self.wxr, page_data, root.children[0])
+        extract_glosses(self.wxr, page_data, root)
 
         self.assertEqual(
             page_data,
diff --git a/tests/test_de_pronunciation.py b/tests/test_de_pronunciation.py
new file mode 100644
index 000000000..6fae64eb7
--- /dev/null
+++ b/tests/test_de_pronunciation.py
@@ -0,0 +1,176 @@
+import unittest
+from collections import defaultdict
+
+from wikitextprocessor import Wtp
+
+from wiktextract.config import WiktionaryConfig
+from wiktextract.extractor.de.pronunciation import (
+    process_ipa,
+    process_hoerbeispiele,
+)
+from wiktextract.thesaurus import close_thesaurus_db
+from wiktextract.wxr_context import WiktextractContext
+
+
+class TestDEPronunciation(unittest.TestCase):
+    maxDiff = None
+
+    def setUp(self) -> None:
+        self.wxr = WiktextractContext(
+            Wtp(lang_code="de"), WiktionaryConfig(dump_file_lang_code="de")
+        )
+
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()
+        close_thesaurus_db(
+            self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
+        )
+
+    def test_de_process_ipa(self):
+        test_cases = [
+            {
+                "input": "{{Lautschrift|ipa1}}",
+                "expected": [
+                    {
+                        "ipa": ["ipa1"],
+                    }
+                ],
+            },
+            {
+                "input": "{{Lautschrift|ipa1|spr=de}}",
+                "expected": [
+                    {"ipa": ["ipa1"], "language": "Deutsch", "lang_code": "de"}
+                ],
+            },
+            {
+                "input": "{{Lautschrift|ipa1}} {{Lautschrift|ipa2}}{{Lautschrift|ipa3|spr=de}}",
+                "expected": [
+                    {"ipa": ["ipa1", "ipa2"]},
+                    {"ipa": ["ipa3"], "language": "Deutsch", "lang_code": "de"},
+                ],
+            },
+            {
+                "input": "{{Lautschrift|ipa1}}, ''tag1'' {{Lautschrift|ipa2}}",
+                "expected": [
+                    {"ipa": ["ipa1"]},
+                    {"ipa": ["ipa2"], "tags": ["tag1"]},
+                ],
+            },
+        ]
+
+        for case in test_cases:
+            with self.subTest(case=case):
+                self.wxr.wtp.start_page("")
+                self.wxr.wtp.add_page("Vorlage:IPA", 10, "")
+                self.wxr.wtp.add_page("Vorlage:Lautschrift", 10, "(Deutsch)")
+
+                self.wxr.wtp.LANGUAGES_BY_CODE["de"] = "Deutsch"
+
+                root = self.wxr.wtp.parse(case["input"])
+
+                sound_data = [defaultdict(list)]
+
+                process_ipa(
+                    self.wxr, sound_data, list(root.filter_empty_str_child())
+                )
+
+                self.assertEqual(sound_data, case["expected"])
+
+    def test_de_process_hoerbeispiele(self):
+        # https://de.wiktionary.org/wiki/Beispiel
+        filename1 = "De-Beispiel.ogg"
+        # https://de.wiktionary.org/wiki/butineur
+        filename2 = "LL-Q150 (fra)-WikiLucas00-butineur.wav"
+        test_cases = [
+            {
+                "input": "{{Audio|" + filename1 + "}}",
+                "expected": [
+                    {
+                        "audio": filename1,
+                        "mp3_url": None,  # None indicates we don't care about the exact value
+                        "ogg_url": None,
+                    }
+                ],
+            },
+            {
+                "input": "{{Audio|"
+                + filename1
+                + "}} {{Audio|"
+                + filename2
+                + "}}",
+                "expected": [
+                    {
+                        "audio": filename1,
+                        "mp3_url": None,
+                        "ogg_url": None,
+                    },
+                    {
+                        "audio": filename2,
+                        "ogg_url": None,
+                        "mp3_url": None,
+                        "wav_url": None,
+                    },
+                ],
+            },
+            {
+                "input": "{{Audio|"
+                + filename1
+                + "}} ''tag1'', ''tag2'' {{Audio|"
+                + filename2
+                + "}}",
+                "expected": [
+                    {
+                        "audio": filename1,
+                        "mp3_url": None,
+                        "ogg_url": None,
+                        "tags": ["tag1"],
+                    },
+                    {
+                        "audio": filename2,
+                        "mp3_url": None,
+                        "ogg_url": None,
+                        "wav_url": None,
+                        "tags": ["tag2"],
+                    },
+                ],
+            },
+        ]
+
+        for case in test_cases:
+            with self.subTest(case=case):
+                self.wxr.wtp.start_page("")
+                self.wxr.wtp.add_page("Vorlage:IPA", 10, "")
+                self.wxr.wtp.add_page("Vorlage:Audio", 10, "")
+
+                self.wxr.wtp.LANGUAGES_BY_CODE["de"] = "Deutsch"
+
+                root = self.wxr.wtp.parse(case["input"])
+
+                sound_data = [defaultdict(list)]
+
+                process_hoerbeispiele(
+                    self.wxr, sound_data, list(root.filter_empty_str_child())
+                )
+
+                self.assertSoundDataMatchesExpected(
+                    sound_data, case["expected"]
+                )
+
+    def assertSoundDataMatchesExpected(self, sound_data, expected):
+        self.assertEqual(
+            len(sound_data),
+            len(expected),
+            f"Mismatch in number of sound data entries: {sound_data}",
+        )
+
+        for data, exp in zip(sound_data, expected):
+            for key, value in exp.items():
+                if value is None:
+                    self.assertIn(key, data)
+                else:
+                    self.assertEqual(data[key], value)
+
+            for key in data:
+                self.assertIn(key, exp)
+                if exp[key] is not None:
+                    self.assertEqual(data[key], exp[key])