Extract pronunciation data from German Wiktionary
Move list_node loop to section extractors

This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France.
empiriker committed Oct 13, 2023
1 parent 3b85751 commit e0e119e
Showing 7 changed files with 443 additions and 43 deletions.
75 changes: 41 additions & 34 deletions src/wiktextract/extractor/de/example.py
@@ -3,6 +3,7 @@
 
 
 from wikitextprocessor import NodeKind, WikiNode
+from wikitextprocessor.parser import LevelNode
 from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid
 
 from wiktextract.page import clean_node
@@ -12,42 +13,48 @@
 def extract_examples(
     wxr: WiktextractContext,
     page_data: List[Dict],
-    list_node: WikiNode,
+    level_node: LevelNode,
 ) -> None:
-    for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
-        example_data = defaultdict(str)
-
-        ref_nodes = find_and_remove_child(
-            list_item_node,
-            NodeKind.HTML,
-            lambda html_node: html_node.tag == "ref",
+    for list_node in level_node.find_child(NodeKind.LIST):
+        for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
+            example_data = defaultdict(str)
+
+            ref_nodes = find_and_remove_child(
+                list_item_node,
+                NodeKind.HTML,
+                lambda html_node: html_node.tag == "ref",
+            )
+            for ref_node in ref_nodes:
+                extract_reference(wxr, example_data, ref_node)
+
+            example_text = clean_node(wxr, {}, list_item_node.children)
+
+            senseid, example_text = match_senseid(example_text)
+
+            if example_text:
+                example_data["text"] = example_text
+
+            if senseid:
+                sense_data = [
+                    sense
+                    for sense in page_data[-1]["senses"]
+                    if sense["senseid"] == senseid
+                ]
+
+                for sense in sense_data:
+                    sense["examples"].append(example_data)
+
+            else:
+                if example_data:
+                    wxr.wtp.debug(
+                        f"Found example data without senseid and text: {example_data}",
+                        sortid="extractor/de/examples/extract_examples/28",
+                    )
+    for non_list_node in level_node.invert_find_child(NodeKind.LIST):
+        wxr.wtp.debug(
+            f"Found unexpected non-list node in example section: {non_list_node}",
+            sortid="extractor/de/examples/extract_examples/33",
+        )
-        )
-        for ref_node in ref_nodes:
-            extract_reference(wxr, example_data, ref_node)
-
-        example_text = clean_node(wxr, {}, list_item_node.children)
-
-        senseid, example_text = match_senseid(example_text)
-
-        if example_text:
-            example_data["text"] = example_text
-
-        if senseid:
-            sense_data = [
-                sense
-                for sense in page_data[-1]["senses"]
-                if sense["senseid"] == senseid
-            ]
-
-            for sense in sense_data:
-                sense["examples"].append(example_data)
-
-        else:
-            if example_data:
-                wxr.wtp.debug(
-                    f"Found example data without senseid and text: {example_data}",
-                    sortid="extractor/de/examples/extract_examples/28",
-                )
 
 
 def extract_reference(
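Because the LIST loop now lives inside the extractor, callers hand extract_examples the whole section node instead of a single list. A minimal sketch of the new calling convention, modelled on the updated tests further below; the page title, wikitext, and sense data are hypothetical, and the WiktionaryConfig(dump_file_lang_code="de") setup is assumed from this repo's de test suite:

from collections import defaultdict

from wikitextprocessor import Wtp
from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.de.example import extract_examples
from wiktextract.wxr_context import WiktextractContext

wxr = WiktextractContext(
    Wtp(lang_code="de"), WiktionaryConfig(dump_file_lang_code="de")
)
wxr.wtp.start_page("Beispiel")
root = wxr.wtp.parse(":[1] Ein Beispielsatz.")

page_data = [defaultdict(list, {"senses": [defaultdict(list, {"senseid": "1"})]})]
extract_examples(wxr, page_data, root)  # the whole node, not root.children[0]
# The example text is appended to the sense whose senseid matches the [1] marker.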
24 changes: 22 additions & 2 deletions src/wiktextract/extractor/de/gloss.py
@@ -3,13 +3,29 @@
 from typing import Dict, List
 
 from wikitextprocessor import NodeKind, WikiNode
+from wikitextprocessor.parser import LevelNode
 from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid
 
 from wiktextract.page import clean_node
 from wiktextract.wxr_context import WiktextractContext
 
 
 def extract_glosses(
     wxr: WiktextractContext,
     page_data: List[Dict],
+    level_node: LevelNode,
+) -> None:
+    for list_node in level_node.find_child(NodeKind.LIST):
+        process_gloss_list_item(wxr, page_data, list_node)
+
+    for non_list_node in level_node.invert_find_child(NodeKind.LIST):
+        wxr.wtp.debug(
+            f"Found unexpected non-list node in gloss section: {non_list_node}",
+            sortid="extractor/de/gloss/extract_glosses/64",
+        )
+
+
+def process_gloss_list_item(
+    wxr: WiktextractContext,
+    page_data: List[Dict],
     list_node: WikiNode,
@@ -54,7 +70,11 @@ def extract_glosses(
         senseid, gloss_text = match_senseid(gloss_text)
 
         if senseid:
-            senseid if senseid[0].isnumeric() else parent_senseid + senseid
+            senseid = (
+                senseid
+                if senseid[0].isnumeric()
+                else parent_senseid + senseid
+            )
             gloss_data["senseid"] = senseid
         else:
             wxr.wtp.debug(
@@ -71,7 +91,7 @@
         page_data[-1]["senses"].append(gloss_data)
 
         for sub_list_node in sub_glosses_list_nodes:
-            extract_glosses(
+            process_gloss_list_item(
                 wxr,
                 page_data,
                 sub_list_node,
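The second hunk is not just a reindent: the old line computed the combined sub-sense id but never assigned it, and the parenthesized form adds the missing senseid assignment. A small illustration with hypothetical values, assuming match_senseid splits a leading bracketed marker off the gloss text:

from wiktextract.extractor.de.utils import match_senseid

parent_senseid = "1"
senseid, gloss_text = match_senseid("[a] ein Unterbegriff")  # hypothetical gloss line

if senseid:
    # Sub-glosses carry bare letters ("a"); numeric ids are kept unchanged.
    senseid = senseid if senseid[0].isnumeric() else parent_senseid + senseid

print(senseid)  # expected: 1a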
6 changes: 4 additions & 2 deletions src/wiktextract/extractor/de/page.py
@@ -7,6 +7,7 @@
 from wikitextprocessor.parser import LevelNode
 
 from wiktextract.datautils import append_base_data
+from wiktextract.extractor.de.pronunciation import extract_pronunciation
 from wiktextract.wxr_context import WiktextractContext
 
 from .gloss import extract_glosses
@@ -71,9 +72,10 @@ def parse_section(
         wxr.wtp.start_subsection(section_name)
         if section_name == "Bedeutungen":
             extract_glosses(wxr, page_data, level_node)
+        if section_name == "Aussprache":
+            extract_pronunciation(wxr, page_data, level_node)
         if section_name == "Beispiele":
-            for list_node in level_node.find_child(NodeKind.LIST):
-                extract_examples(wxr, page_data, list_node)
+            extract_examples(wxr, page_data, level_node)
 
 
 FORM_POS = {
201 changes: 201 additions & 0 deletions src/wiktextract/extractor/de/pronunciation.py
@@ -0,0 +1,201 @@
from collections import defaultdict
from typing import Dict, List, Union

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LevelNode
from wiktextract.extractor.share import create_audio_url_dict

from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext


def extract_pronunciation(
wxr: WiktextractContext,
page_data: List[Dict],
level_node: LevelNode,
):
for list_node in level_node.find_child(NodeKind.LIST):
sound_data = [defaultdict(list)]

for not_list_item_node in list_node.invert_find_child(
NodeKind.LIST_ITEM
):
wxr.wtp.debug(
f"Found unexpected non-list-item node in pronunciation section: {not_list_item_node}",
sortid="extractor/de/pronunciation/extract_pronunciation/28",
)

for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
children = list_item_node.children
if len(children) == 0:
continue

head_template, rest = children[0], children[1:]
if (
not isinstance(head_template, WikiNode)
or head_template.kind != NodeKind.TEMPLATE
or not rest
):
wxr.wtp.debug(
f"Found unexpected non-template node in pronunciation section: {head_template}",
sortid="extractor/de/pronunciation/extract_pronunciation/37",
)
continue
if head_template.template_name == "IPA":
process_ipa(wxr, sound_data, rest)
elif head_template.template_name == "Hörbeispiele":
sound_data.append(defaultdict(list))
process_hoerbeispiele(wxr, sound_data, rest)
elif head_template.template_name == "Reime":
process_rhymes(wxr, sound_data, rest)
else:
wxr.wtp.debug(
f"Found unexpected template in pronunciation section: {head_template} with content {rest}",
sortid="extractor/de/pronunciation/extract_pronunciation/45)",
)

# Remove empty entries
sound_data = [entry for entry in sound_data if entry != {}]
if len(sound_data) > 0:
page_data[-1]["sounds"].extend(sound_data)

for non_list_node in level_node.invert_find_child(NodeKind.LIST):
wxr.wtp.debug(
f"Found unexpected non-list node in pronunciation section: {non_list_node}",
sortid="extractor/de/pronunciation/extract_pronunciation/64",
)


def process_ipa(
wxr: WiktextractContext,
sound_data: List[Dict],
nodes: List[Union[WikiNode, str]],
):
if not nodes:
return

def process_lautschrift_template(
wxr: WiktextractContext, sound_data: List[Dict], node
):
largs = node.largs
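        # For a template like {{Lautschrift|ˈbaɪ̯ʃpiːl|spr=de}} (hypothetical
        # values), largs is roughly [["Lautschrift"], ["ˈbaɪ̯ʃpiːl"], ["spr=de"]]:
        # largs[1] holds the IPA string, largs[2] an optional language argument.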
if len(largs) > 1 and len(largs[1]):
ipa = largs[1][0]
if ipa.strip() and ipa != "...":
if len(largs) > 2:
lang_arg = largs[2][0].split("=", 1)
if len(lang_arg) > 1:
lang_code = lang_arg[1].strip()
language = wxr.wtp.LANGUAGES_BY_CODE[lang_code]
add_sound_data_without_appending_to_existing_properties(
sound_data,
{
"ipa": [ipa],
"lang_code": lang_code,
"language": language,
},
)
else:
sound_data[-1]["ipa"].append(ipa)

head_node = nodes.pop(0)

if is_template_node_with_name(head_node, "Lautschrift"):
process_lautschrift_template(wxr, sound_data, head_node)
elif is_tag_node(head_node):
append_tag(wxr, sound_data, head_node)
elif is_new_sound_data_entry_sep(head_node):
sound_data.append(defaultdict(list))
elif not is_empty_string(head_node):
wxr.wtp.debug(
f"Found unexpected non-Lautschrift node in IPA section: {head_node}",
sortid="extractor/de/pronunciation/process_ipa/57",
)

if nodes:
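        # Recurse to handle the remaining sibling nodes of this list item.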
process_ipa(wxr, sound_data, nodes)


def process_hoerbeispiele(
wxr: WiktextractContext, sound_data: List[Dict], nodes: List[WikiNode]
):
if not nodes:
return

def process_audio_template(
wxr: WiktextractContext, sound_data: List[Dict], node
):
largs = node.largs
if len(largs) > 1 and len(largs[1]):
audio_file = largs[1][0]
if audio_file.strip():
add_sound_data_without_appending_to_existing_properties(
sound_data, create_audio_url_dict(audio_file)
)

head_node = nodes.pop(0)

if is_template_node_with_name(head_node, "Audio"):
process_audio_template(wxr, sound_data, head_node)
elif is_tag_node(head_node):
append_tag(wxr, sound_data, head_node)
elif is_new_sound_data_entry_sep(head_node):
sound_data.append(defaultdict(list))
elif not is_empty_string(head_node):
wxr.wtp.debug(
f"Found unexpected node in Hoerbeispiele section: {head_node}",
sortid="extractor/de/pronunciation/process_hoerbeispiele/193",
)

if nodes:
process_hoerbeispiele(wxr, sound_data, nodes)


def process_rhymes(
wxr: WiktextractContext, sound_data: List[Dict], nodes: List[WikiNode]
):
# XXX: Extract rhymes from the referenced rhymes page
pass


def is_template_node_with_name(node: WikiNode | str, template_name: str):
return (
isinstance(node, WikiNode)
and node.kind == NodeKind.TEMPLATE
and node.template_name == template_name
)


def add_sound_data_without_appending_to_existing_properties(
sound_data: List[Dict],
new_sound_data: Dict,
):
"""Creates a new IPA data entry if properties exist in previous entry."""
if any([key in sound_data[-1] for key in new_sound_data.keys()]):
sound_data.append(defaultdict(list))

for key, value in new_sound_data.items():
if isinstance(value, str):
sound_data[-1][key] = value
else:
sound_data[-1][key].extend(value)


def is_tag_node(node: WikiNode | str):
return isinstance(node, WikiNode) and node.kind in [
NodeKind.TEMPLATE,
NodeKind.ITALIC,
]


def append_tag(wxr: WiktextractContext, sound_data: List[Dict], node: WikiNode):
tag = clean_node(wxr, {}, node).strip()
if tag:
sound_data[-1]["tags"].append(tag)


def is_new_sound_data_entry_sep(node: WikiNode | str):
return isinstance(node, str) and node.strip() in [",", ";"]


def is_empty_string(node: WikiNode | str):
return isinstance(node, str) and not node.strip()
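For illustration, a worked sketch of the entry-splitting helper above; the IPA strings and tag are hypothetical:

from collections import defaultdict

from wiktextract.extractor.de.pronunciation import (
    add_sound_data_without_appending_to_existing_properties,
)

sound_data = [defaultdict(list)]
sound_data[-1]["ipa"].append("ˈbaɪ̯ˌʃpiːl")

# "ipa" already exists in the last entry, so the helper starts a fresh entry
# instead of mixing the variant pronunciation into the first one.
add_sound_data_without_appending_to_existing_properties(
    sound_data, {"ipa": ["ˈbaɪ̯ʃpiːl"], "tags": ["Österreich"]}
)

assert len(sound_data) == 2
assert sound_data[1]["ipa"] == ["ˈbaɪ̯ʃpiːl"]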
4 changes: 2 additions & 2 deletions tests/test_de_example.py
@@ -36,7 +36,7 @@ def test_de_extract_examples(self):
             defaultdict(list, {"senseid": "2"}),
         ]
 
-        extract_examples(self.wxr, page_data, root.children[0])
+        extract_examples(self.wxr, page_data, root)
 
         self.assertEqual(
             page_data,
@@ -68,7 +68,7 @@ def test_de_extract_example_with_reference(self):
             defaultdict(list, {"senseid": "1"}),
         ]
 
-        extract_examples(self.wxr, page_data, root.children[0])
+        extract_examples(self.wxr, page_data, root)
 
         self.assertEqual(
             page_data,