diff --git a/src/wiktextract/extractor/de/example.py b/src/wiktextract/extractor/de/example.py
index 155387448..8a3a97f62 100644
--- a/src/wiktextract/extractor/de/example.py
+++ b/src/wiktextract/extractor/de/example.py
@@ -3,6 +3,7 @@
from wikitextprocessor import NodeKind, WikiNode
+from wikitextprocessor.parser import LevelNode
from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid
from wiktextract.page import clean_node
@@ -12,42 +13,43 @@
def extract_examples(
wxr: WiktextractContext,
page_data: List[Dict],
- list_node: WikiNode,
+ level_node: LevelNode,
) -> None:
- for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
- example_data = defaultdict(str)
-
- ref_nodes = find_and_remove_child(
- list_item_node,
- NodeKind.HTML,
- lambda html_node: html_node.tag == "ref",
+ for list_node in level_node.find_child(NodeKind.LIST):
+ for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
+ example_data = defaultdict(str)
+
+ ref_nodes = find_and_remove_child(
+ list_item_node,
+ NodeKind.HTML,
+ lambda html_node: html_node.tag == "ref",
+ )
+ for ref_node in ref_nodes:
+ extract_reference(wxr, example_data, ref_node)
+
+ example_text = clean_node(wxr, {}, list_item_node.children)
+
+ senseid, example_text = match_senseid(example_text)
+
+ if example_text:
+ example_data["text"] = example_text
+
+ if senseid:
+ for sense in page_data[-1]["senses"]:
+ if sense["senseid"] == senseid:
+ sense["examples"].append(example_data)
+
+ else:
+ if example_data:
+ wxr.wtp.debug(
+ f"Found example data without senseid and text: {example_data}",
+ sortid="extractor/de/examples/extract_examples/28",
+ )
+ for non_list_node in level_node.invert_find_child(NodeKind.LIST):
+ wxr.wtp.debug(
+ f"Found unexpected non-list node in example section: {non_list_node}",
+ sortid="extractor/de/examples/extract_examples/33",
)
- for ref_node in ref_nodes:
- extract_reference(wxr, example_data, ref_node)
-
- example_text = clean_node(wxr, {}, list_item_node.children)
-
- senseid, example_text = match_senseid(example_text)
-
- if example_text:
- example_data["text"] = example_text
-
- if senseid:
- sense_data = [
- sense
- for sense in page_data[-1]["senses"]
- if sense["senseid"] == senseid
- ]
-
- for sense in sense_data:
- sense["examples"].append(example_data)
-
- else:
- if example_data:
- wxr.wtp.debug(
- f"Found example data without senseid and text: {example_data}",
- sortid="extractor/de/examples/extract_examples/28",
- )
def extract_reference(
@@ -67,20 +69,14 @@ def extract_reference(
elif len(template_nodes) == 1:
template_node = template_nodes[0]
- # Sometimes the title is dynamically generated from the template name,
- # so we preset the title. If specified in the template, it will be
- # overwritten.
- reference_data["titel"] = template_node.largs[0][0].strip()
-
- for arg in template_node.largs[1:]:
- arg = clean_node(wxr, {}, arg)
- if not arg.strip():
- continue
- splits = arg.split("=", 1)
- if len(splits) != 2:
- continue
- arg_name, arg_value = arg.split("=", 1)
- if arg_name.strip() and arg_value.strip():
- reference_data[arg_name.lower()] = arg_value
+ # Most reference templates follow the Literatur template and use named
+ # parameters. We extract them here.
+ # https://de.wiktionary.org/wiki/Vorlage:Literatur
+ for key, value in template_node.template_parameters.items():
+ if isinstance(key, str):
+ reference_data[key.lower()] = clean_node(wxr, {}, value)
+
+ # XXX: Treat other templates as well.
+ # E.g. https://de.wiktionary.org/wiki/Vorlage:Ref-OWID
example_data["ref"] = reference_data
diff --git a/src/wiktextract/extractor/de/gloss.py b/src/wiktextract/extractor/de/gloss.py
index 70b920703..6315259d2 100644
--- a/src/wiktextract/extractor/de/gloss.py
+++ b/src/wiktextract/extractor/de/gloss.py
@@ -3,6 +3,7 @@
from typing import Dict, List
from wikitextprocessor import NodeKind, WikiNode
+from wikitextprocessor.parser import LevelNode
from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid
from wiktextract.page import clean_node
@@ -10,6 +11,21 @@
def extract_glosses(
+ wxr: WiktextractContext,
+ page_data: List[Dict],
+ level_node: LevelNode,
+) -> None:
+ for list_node in level_node.find_child(NodeKind.LIST):
+ process_gloss_list_item(wxr, page_data, list_node)
+
+ for non_list_node in level_node.invert_find_child(NodeKind.LIST):
+ wxr.wtp.debug(
+ f"Found unexpected non-list node in pronunciation section: {non_list_node}",
+ sortid="extractor/de/pronunciation/extract_pronunciation/64",
+ )
+
+
+def process_gloss_list_item(
wxr: WiktextractContext,
page_data: List[Dict],
list_node: WikiNode,
@@ -54,7 +70,11 @@ def extract_glosses(
senseid, gloss_text = match_senseid(gloss_text)
if senseid:
- senseid if senseid[0].isnumeric() else parent_senseid + senseid
+ senseid = (
+ senseid
+ if senseid[0].isnumeric()
+ else parent_senseid + senseid
+ )
gloss_data["senseid"] = senseid
else:
wxr.wtp.debug(
@@ -71,7 +91,7 @@ def extract_glosses(
page_data[-1]["senses"].append(gloss_data)
for sub_list_node in sub_glosses_list_nodes:
- extract_glosses(
+ process_gloss_list_item(
wxr,
page_data,
sub_list_node,
diff --git a/src/wiktextract/extractor/de/page.py b/src/wiktextract/extractor/de/page.py
index 0d2f1ca1a..9c0aab1a5 100644
--- a/src/wiktextract/extractor/de/page.py
+++ b/src/wiktextract/extractor/de/page.py
@@ -7,6 +7,7 @@
from wikitextprocessor.parser import LevelNode
from wiktextract.datautils import append_base_data
+from wiktextract.extractor.de.pronunciation import extract_pronunciation
from wiktextract.wxr_context import WiktextractContext
from .gloss import extract_glosses
@@ -71,9 +72,10 @@ def parse_section(
wxr.wtp.start_subsection(section_name)
if section_name == "Bedeutungen":
extract_glosses(wxr, page_data, level_node)
+ if section_name == "Aussprache":
+ extract_pronunciation(wxr, page_data, level_node)
if section_name == "Beispiele":
- for list_node in level_node.find_child(NodeKind.LIST):
- extract_examples(wxr, page_data, list_node)
+ extract_examples(wxr, page_data, level_node)
FORM_POS = {
diff --git a/src/wiktextract/extractor/de/pronunciation.py b/src/wiktextract/extractor/de/pronunciation.py
new file mode 100644
index 000000000..2c7a2759f
--- /dev/null
+++ b/src/wiktextract/extractor/de/pronunciation.py
@@ -0,0 +1,189 @@
+from collections import defaultdict
+from typing import Dict, List, Union
+
+from wikitextprocessor import NodeKind, WikiNode
+from wikitextprocessor.parser import LevelNode
+from wiktextract.extractor.share import create_audio_url_dict
+
+from wiktextract.page import clean_node
+from wiktextract.wxr_context import WiktextractContext
+
+
+def extract_pronunciation(
+ wxr: WiktextractContext,
+ page_data: List[Dict],
+ level_node: LevelNode,
+):
+ for list_node in level_node.find_child(NodeKind.LIST):
+ sound_data = [defaultdict(list)]
+
+ for not_list_item_node in list_node.invert_find_child(
+ NodeKind.LIST_ITEM
+ ):
+ wxr.wtp.debug(
+ f"Found unexpected non-list-item node in pronunciation section: {not_list_item_node}",
+ sortid="extractor/de/pronunciation/extract_pronunciation/28",
+ )
+
+ for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
+ children = list(list_item_node.filter_empty_str_child())
+            # NOTE(review): removed leftover debug print(children)
+ if len(children) == 0:
+ continue
+
+ head_template, rest = children[0], children[1:]
+ if (
+ not isinstance(head_template, WikiNode)
+ or head_template.kind != NodeKind.TEMPLATE
+ or not rest
+ ):
+ wxr.wtp.debug(
+ f"Found unexpected non-template node in pronunciation section: {head_template}",
+ sortid="extractor/de/pronunciation/extract_pronunciation/37",
+ )
+ continue
+ if head_template.template_name == "IPA":
+ process_ipa(wxr, sound_data, rest)
+ elif head_template.template_name == "Hörbeispiele":
+ sound_data.append(defaultdict(list))
+ process_hoerbeispiele(wxr, sound_data, rest)
+ elif head_template.template_name == "Reime":
+ process_rhymes(wxr, sound_data, rest)
+ else:
+ wxr.wtp.debug(
+ f"Found unexpected template in pronunciation section: {head_template} with content {rest}",
+ sortid="extractor/de/pronunciation/extract_pronunciation/45)",
+ )
+
+ # Remove empty entries
+ sound_data = [entry for entry in sound_data if entry != {}]
+ if len(sound_data) > 0:
+ page_data[-1]["sounds"].extend(sound_data)
+
+ for non_list_node in level_node.invert_find_child(NodeKind.LIST):
+ wxr.wtp.debug(
+ f"Found unexpected non-list node in pronunciation section: {non_list_node}",
+ sortid="extractor/de/pronunciation/extract_pronunciation/64",
+ )
+
+
+def process_ipa(
+ wxr: WiktextractContext,
+ sound_data: List[Dict],
+ nodes: List[Union[WikiNode, str]],
+):
+ if not nodes:
+ return
+
+ head_node = nodes.pop(0)
+
+ if is_template_node_with_name(head_node, "Lautschrift"):
+ process_lautschrift_template(wxr, sound_data, head_node)
+ elif is_tag_node(head_node):
+ append_tag(wxr, sound_data, head_node)
+ elif is_new_sound_data_entry_sep(head_node):
+ sound_data.append(defaultdict(list))
+ else:
+ wxr.wtp.debug(
+ f"Found unexpected non-Lautschrift node in IPA section: {head_node}",
+ sortid="extractor/de/pronunciation/process_ipa/57",
+ )
+
+ if nodes:
+ process_ipa(wxr, sound_data, nodes)
+
+
+def process_lautschrift_template(
+ wxr: WiktextractContext, sound_data: List[Dict], node
+):
+ template_parameters = node.template_parameters
+
+ ipa = template_parameters.get(1)
+
+ lang_code = template_parameters.get("spr")
+ if lang_code:
+ language = wxr.wtp.LANGUAGES_BY_CODE[lang_code]
+ add_sound_data_without_appending_to_existing_properties(
+ sound_data,
+ {
+ "ipa": [ipa],
+ "lang_code": lang_code,
+ "language": language,
+ },
+ )
+ else:
+ sound_data[-1]["ipa"].append(ipa)
+
+
+def process_hoerbeispiele(
+ wxr: WiktextractContext, sound_data: List[Dict], nodes: List[WikiNode]
+):
+ for node in nodes:
+ if is_template_node_with_name(node, "Audio"):
+ process_audio_template(wxr, sound_data, node)
+ elif is_tag_node(node):
+ append_tag(wxr, sound_data, node)
+ elif is_new_sound_data_entry_sep(node):
+ sound_data.append(defaultdict(list))
+ else:
+ wxr.wtp.debug(
+ f"Found unexpected node in Hoerbeispiele section: {node}",
+ sortid="extractor/de/pronunciation/process_hoerbeispiele/193",
+ )
+
+
+def process_audio_template(
+ wxr: WiktextractContext, sound_data: List[Dict], node
+):
+ audio_file = node.template_parameters.get(1)
+ if audio_file:
+ add_sound_data_without_appending_to_existing_properties(
+ sound_data, create_audio_url_dict(audio_file)
+ )
+
+
+def process_rhymes(
+ wxr: WiktextractContext, sound_data: List[Dict], nodes: List[WikiNode]
+):
+ # XXX: Extract rhymes from the referenced rhymes page
+ pass
+
+
+def is_template_node_with_name(node: Union[WikiNode, str], template_name: str):
+ return (
+ isinstance(node, WikiNode)
+ and node.kind == NodeKind.TEMPLATE
+ and node.template_name == template_name
+ )
+
+
+def add_sound_data_without_appending_to_existing_properties(
+ sound_data: List[Dict],
+ new_sound_data: Dict,
+):
+ """Creates a new IPA data entry if properties exist in previous entry."""
+ if any([key in sound_data[-1] for key in new_sound_data.keys()]):
+ sound_data.append(defaultdict(list))
+
+ for key, value in new_sound_data.items():
+ if isinstance(value, str):
+ sound_data[-1][key] = value
+ else:
+ sound_data[-1][key].extend(value)
+
+
+def is_tag_node(node: Union[WikiNode, str]):
+ return isinstance(node, WikiNode) and node.kind in [
+ NodeKind.TEMPLATE,
+ NodeKind.ITALIC,
+ ]
+
+
+def append_tag(wxr: WiktextractContext, sound_data: Dict, node: WikiNode):
+ tag = clean_node(wxr, {}, node).strip()
+ if tag:
+ sound_data[-1]["tags"].append(tag)
+
+
+def is_new_sound_data_entry_sep(node: Union[WikiNode, str]):
+ return isinstance(node, str) and node.strip() in [",", ";"]
diff --git a/tests/test_de_example.py b/tests/test_de_example.py
index dd9a3086f..980a0be6c 100644
--- a/tests/test_de_example.py
+++ b/tests/test_de_example.py
@@ -36,7 +36,7 @@ def test_de_extract_examples(self):
defaultdict(list, {"senseid": "2"}),
]
- extract_examples(self.wxr, page_data, root.children[0])
+ extract_examples(self.wxr, page_data, root)
self.assertEqual(
page_data,
@@ -68,7 +68,7 @@ def test_de_extract_example_with_reference(self):
defaultdict(list, {"senseid": "1"}),
]
- extract_examples(self.wxr, page_data, root.children[0])
+ extract_examples(self.wxr, page_data, root)
self.assertEqual(
page_data,
@@ -89,10 +89,42 @@ def test_de_extract_example_with_reference(self):
],
)
- def test_de_extract_reference(self):
- self.wxr.wtp.start_page("")
+ def test_de_extract_reference_from_literatur_template(self):
+ # https://de.wiktionary.org/wiki/Beispiel
+ self.wxr.wtp.start_page("Beispiel")
self.wxr.wtp.add_page("Vorlage:Literatur", 10, "Expanded template")
- root = self.wxr.wtp.parse("[{{Literatur|Titel=title}}]")
+ root = self.wxr.wtp.parse(
+ "[{{Literatur|Autor=Steffen Möller|Titel=Viva Warszawa|TitelErg=Polen für Fortgeschrittene|Verlag=Piper|Ort=München/Berlin|Jahr=2015}}, Seite 273. ISBN 978-3-89029-459-9.]"
+ )
+
+ example_data = defaultdict(str)
+
+ extract_reference(self.wxr, example_data, root.children[0])
+
+ self.assertEqual(
+ example_data,
+ {
+ "ref": {
+ "raw_ref": "Expanded template, Seite 273. ISBN 978-3-89029-459-9.",
+ "titel": "Viva Warszawa",
+ "autor": "Steffen Möller",
+ "titelerg": "Polen für Fortgeschrittene",
+ "verlag": "Piper",
+ "ort": "München/Berlin",
+ "jahr": "2015",
+ }
+ },
+ )
+
+ def test_de_extract_reference_from_templates_without_named_args(self):
+ # https://de.wiktionary.org/wiki/Beispiel
+ # Reference templates not following the Literatur template pattern are
+ # currently not extracted field by field (e.g. Vorlage:Ref-OWID)
+ self.wxr.wtp.start_page("Beispiel")
+ self.wxr.wtp.add_page("Vorlage:Ref-OWID", 10, "Expanded template")
+ root = self.wxr.wtp.parse(
+ "[{{Ref-OWID|Sprichwörter|401781|Schlechte Beispiele verderben gute Sitten.}}]"
+ )
example_data = defaultdict(str)
@@ -100,5 +132,9 @@ def test_de_extract_reference(self):
self.assertEqual(
example_data,
- {"ref": {"raw_ref": "Expanded template", "titel": "title"}},
+ {
+ "ref": {
+ "raw_ref": "Expanded template",
+ }
+ },
)
diff --git a/tests/test_de_gloss.py b/tests/test_de_gloss.py
index b44eb6001..e43cd4d67 100644
--- a/tests/test_de_gloss.py
+++ b/tests/test_de_gloss.py
@@ -33,7 +33,7 @@ def test_de_extract_glosses(self):
page_data = [defaultdict(list)]
- extract_glosses(self.wxr, page_data, root.children[0])
+ extract_glosses(self.wxr, page_data, root)
self.assertEqual(
page_data,
@@ -63,7 +63,7 @@ def test_de_extract_glosses_with_subglosses(self):
page_data = [defaultdict(list)]
- extract_glosses(self.wxr, page_data, root.children[0])
+ extract_glosses(self.wxr, page_data, root)
self.assertEqual(
page_data,
@@ -99,7 +99,7 @@ def test_de_extract_glosses_with_only_subglosses(self):
page_data = [defaultdict(list)]
- extract_glosses(self.wxr, page_data, root.children[0])
+ extract_glosses(self.wxr, page_data, root)
self.assertEqual(
page_data,
diff --git a/tests/test_de_pronunciation.py b/tests/test_de_pronunciation.py
new file mode 100644
index 000000000..6fae64eb7
--- /dev/null
+++ b/tests/test_de_pronunciation.py
@@ -0,0 +1,176 @@
+import unittest
+from collections import defaultdict
+
+from wikitextprocessor import Wtp
+
+from wiktextract.config import WiktionaryConfig
+from wiktextract.extractor.de.pronunciation import (
+ process_ipa,
+ process_hoerbeispiele,
+)
+from wiktextract.thesaurus import close_thesaurus_db
+from wiktextract.wxr_context import WiktextractContext
+
+
+class TestDEPronunciation(unittest.TestCase):
+ maxDiff = None
+
+ def setUp(self) -> None:
+ self.wxr = WiktextractContext(
+ Wtp(lang_code="de"), WiktionaryConfig(dump_file_lang_code="de")
+ )
+
+ def tearDown(self) -> None:
+ self.wxr.wtp.close_db_conn()
+ close_thesaurus_db(
+ self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
+ )
+
+ def test_de_process_ipa(self):
+ test_cases = [
+ {
+ "input": "{{Lautschrift|ipa1}}",
+ "expected": [
+ {
+ "ipa": ["ipa1"],
+ }
+ ],
+ },
+ {
+ "input": "{{Lautschrift|ipa1|spr=de}}",
+ "expected": [
+ {"ipa": ["ipa1"], "language": "Deutsch", "lang_code": "de"}
+ ],
+ },
+ {
+ "input": "{{Lautschrift|ipa1}} {{Lautschrift|ipa2}}{{Lautschrift|ipa3|spr=de}}",
+ "expected": [
+ {"ipa": ["ipa1", "ipa2"]},
+ {"ipa": ["ipa3"], "language": "Deutsch", "lang_code": "de"},
+ ],
+ },
+ {
+ "input": "{{Lautschrift|ipa1}}, ''tag1'' {{Lautschrift|ipa2}}",
+ "expected": [
+ {"ipa": ["ipa1"]},
+ {"ipa": ["ipa2"], "tags": ["tag1"]},
+ ],
+ },
+ ]
+
+ for case in test_cases:
+ with self.subTest(case=case):
+ self.wxr.wtp.start_page("")
+ self.wxr.wtp.add_page("Vorlage:IPA", 10, "")
+ self.wxr.wtp.add_page("Vorlage:Lautschrift", 10, "(Deutsch)")
+
+ self.wxr.wtp.LANGUAGES_BY_CODE["de"] = "Deutsch"
+
+ root = self.wxr.wtp.parse(case["input"])
+
+ sound_data = [defaultdict(list)]
+
+ process_ipa(
+ self.wxr, sound_data, list(root.filter_empty_str_child())
+ )
+
+ self.assertEqual(sound_data, case["expected"])
+
+ def test_de_process_hoerbeispiele(self):
+ # https://de.wiktionary.org/wiki/Beispiel
+ filename1 = "De-Beispiel.ogg"
+ # https://de.wiktionary.org/wiki/butineur
+ filename2 = "LL-Q150 (fra)-WikiLucas00-butineur.wav"
+ test_cases = [
+ {
+ "input": "{{Audio|" + filename1 + "}}",
+ "expected": [
+ {
+ "audio": filename1,
+ "mp3_url": None, # None indicates we don't care about the exact value
+ "ogg_url": None,
+ }
+ ],
+ },
+ {
+ "input": "{{Audio|"
+ + filename1
+ + "}} {{Audio|"
+ + filename2
+ + "}}",
+ "expected": [
+ {
+ "audio": filename1,
+ "mp3_url": None,
+ "ogg_url": None,
+ },
+ {
+ "audio": filename2,
+ "ogg_url": None,
+ "mp3_url": None,
+ "wav_url": None,
+ },
+ ],
+ },
+ {
+ "input": "{{Audio|"
+ + filename1
+ + "}} ''tag1'', ''tag2'' {{Audio|"
+ + filename2
+ + "}}",
+ "expected": [
+ {
+ "audio": filename1,
+ "mp3_url": None,
+ "ogg_url": None,
+ "tags": ["tag1"],
+ },
+ {
+ "audio": filename2,
+ "mp3_url": None,
+ "ogg_url": None,
+ "wav_url": None,
+ "tags": ["tag2"],
+ },
+ ],
+ },
+ ]
+
+ for case in test_cases:
+ with self.subTest(case=case):
+ self.wxr.wtp.start_page("")
+ self.wxr.wtp.add_page("Vorlage:IPA", 10, "")
+ self.wxr.wtp.add_page("Vorlage:Audio", 10, "")
+
+ self.wxr.wtp.LANGUAGES_BY_CODE["de"] = "Deutsch"
+
+ root = self.wxr.wtp.parse(case["input"])
+
+ sound_data = [defaultdict(list)]
+
+ process_hoerbeispiele(
+ self.wxr, sound_data, list(root.filter_empty_str_child())
+ )
+
+ self.assertSoundDataMatchesExpected(
+ sound_data, case["expected"]
+ )
+
+ def assertSoundDataMatchesExpected(self, sound_data, expected):
+ self.assertEqual(
+ len(sound_data),
+ len(expected),
+ f"Mismatch in number of sound data entries{sound_data}",
+ )
+
+ for data, exp in zip(sound_data, expected):
+ for key, value in exp.items():
+ if value is None:
+ self.assertIn(key, data)
+ else:
+ self.assertEqual(data[key], value)
+
+ for key in data:
+ self.assertIn(key, exp)
+ if exp[key] is not None:
+ self.assertEqual(data[key], exp[key])