Skip to content

Commit

Permalink
Extract examples with references from German Wiktionary
Browse files Browse the repository at this point in the history
This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France.
  • Loading branch information
empiriker committed Oct 13, 2023
1 parent e3254f6 commit 3b85751
Show file tree
Hide file tree
Showing 3 changed files with 194 additions and 0 deletions.
86 changes: 86 additions & 0 deletions src/wiktextract/extractor/de/example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
from collections import defaultdict
from typing import Dict, List


from wikitextprocessor import NodeKind, WikiNode
from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid

from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext


def extract_examples(
    wxr: WiktextractContext,
    page_data: List[Dict],
    list_node: WikiNode,
) -> None:
    """Attach the examples listed in ``list_node`` to their senses.

    Every list item is handled on its own: its ``<ref>`` HTML children are
    taken out and handed to ``extract_reference``, the remaining children are
    cleaned to plain text, and ``match_senseid`` splits that text into a sense
    id and the example text. The collected example is appended to the
    ``"examples"`` list of every sense in ``page_data[-1]["senses"]`` whose
    ``"senseid"`` equals that id. A list item without a sense id is not
    attached anywhere; if any data was collected for it, that is reported via
    ``wxr.wtp.debug``.
    """
    for item_node in list_node.find_child(NodeKind.LIST_ITEM):
        example_data = defaultdict(str)

        # The references are pulled out before the item is cleaned
        # (presumably so their content stays out of the example text).
        for ref_node in find_and_remove_child(
            item_node,
            NodeKind.HTML,
            lambda node: node.tag == "ref",
        ):
            extract_reference(wxr, example_data, ref_node)

        cleaned_text = clean_node(wxr, {}, item_node.children)
        senseid, example_text = match_senseid(cleaned_text)

        if example_text:
            example_data["text"] = example_text

        if not senseid:
            # Nothing to attach the example to; only report it.
            if example_data:
                wxr.wtp.debug(
                    f"Found example data without senseid and text: {example_data}",
                    sortid="extractor/de/examples/extract_examples/28",
                )
            continue

        # Collect all matches first, then attach the example to each of them.
        matching_senses = [
            candidate
            for candidate in page_data[-1]["senses"]
            if candidate["senseid"] == senseid
        ]
        for matched in matching_senses:
            matched["examples"].append(example_data)


def extract_reference(
    wxr: WiktextractContext, example_data: Dict[str, str], ref_node: WikiNode
):
    """Parse a ``<ref>`` node and store the result in ``example_data["ref"]``.

    The stored value is a dict that always contains ``"raw_ref"``, the cleaned
    text of the node's children. If the node holds exactly one template, the
    template name is stored as ``"titel"`` and every ``name=value`` argument
    is added under its lower-cased name. Positional arguments and arguments
    with an empty name or value are skipped. If there is more than one
    template, none is parsed and a debug message is emitted instead.

    Note that ``example_data["ref"]`` is assigned, not extended: calling this
    function again for the same ``example_data`` replaces the earlier
    reference.

    Args:
        wxr: Extraction context, used for cleaning nodes and debug output.
        example_data: Example record to update in place.
        ref_node: The ``<ref>`` node to parse.
    """
    reference_data = {"raw_ref": clean_node(wxr, {}, ref_node.children)}

    template_nodes = list(ref_node.find_child(NodeKind.TEMPLATE))

    if len(template_nodes) > 1:
        wxr.wtp.debug(
            f"Found unexpected number of templates in example: {template_nodes}",
            sortid="extractor/de/examples/extract_examples/64",
        )
    elif len(template_nodes) == 1:
        template_node = template_nodes[0]

        # Sometimes the title is dynamically generated from the template name,
        # so we preset the title. If specified in the template, it will be
        # overwritten.
        name_parts = template_node.largs[0]
        # Guard against an empty name or a non-string first part (assumed to
        # be possible when the name itself contains markup) instead of
        # raising on ``.strip()``.
        if name_parts and isinstance(name_parts[0], str):
            reference_data["titel"] = name_parts[0].strip()

        for raw_arg in template_node.largs[1:]:
            # Split once on the first "="; an empty separator means the
            # argument is positional and carries no name.
            arg_name, separator, arg_value = clean_node(
                wxr, {}, raw_arg
            ).partition("=")
            arg_name = arg_name.strip()
            arg_value = arg_value.strip()
            if separator and arg_name and arg_value:
                # Store the stripped forms so that "Titel = x" and "Titel=x"
                # produce the same key and value.
                reference_data[arg_name.lower()] = arg_value

    example_data["ref"] = reference_data
4 changes: 4 additions & 0 deletions src/wiktextract/extractor/de/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from wiktextract.wxr_context import WiktextractContext

from .gloss import extract_glosses
from .example import extract_examples

# Templates that are used to form panels on pages and that should be ignored in
# various positions
Expand Down Expand Up @@ -70,6 +71,9 @@ def parse_section(
wxr.wtp.start_subsection(section_name)
if section_name == "Bedeutungen":
extract_glosses(wxr, page_data, level_node)
if section_name == "Beispiele":
for list_node in level_node.find_child(NodeKind.LIST):
extract_examples(wxr, page_data, list_node)


FORM_POS = {
Expand Down
104 changes: 104 additions & 0 deletions tests/test_de_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
import unittest
from collections import defaultdict

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.de.example import extract_examples, extract_reference

from wiktextract.thesaurus import close_thesaurus_db
from wiktextract.wxr_context import WiktextractContext


class TestDEExample(unittest.TestCase):
    """Tests for example and reference extraction from German Wiktionary."""

    maxDiff = None

    def setUp(self) -> None:
        """Create a fresh extraction context for German."""
        wtp = Wtp(lang_code="de")
        config = WiktionaryConfig(dump_file_lang_code="de")
        self.wxr = WiktextractContext(wtp, config)

    def tearDown(self) -> None:
        """Close the database connections opened by the context."""
        self.wxr.wtp.close_db_conn()
        close_thesaurus_db(
            self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
        )

    def test_de_extract_examples(self):
        """Examples are grouped under the sense with the same sense id."""
        self.wxr.wtp.start_page("")
        wikitext = (
            ":[1] example1A \n:[1] example1B\n:[2] example2\n:[3] example3"
        )
        root = self.wxr.wtp.parse(wikitext)

        # Only senses 1 and 2 exist; the example for sense 3 has no target.
        page_data = [defaultdict(list)]
        page_data[-1]["senses"] = [
            defaultdict(list, {"senseid": "1"}),
            defaultdict(list, {"senseid": "2"}),
        ]

        extract_examples(self.wxr, page_data, root.children[0])

        expected = [
            {
                "senses": [
                    {
                        "examples": [
                            {"text": "example1A"},
                            {"text": "example1B"},
                        ],
                        "senseid": "1",
                    },
                    {
                        "examples": [{"text": "example2"}],
                        "senseid": "2",
                    },
                ]
            }
        ]
        self.assertEqual(page_data, expected)

    def test_de_extract_example_with_reference(self):
        """A ``<ref>`` is split off the example text and stored as ``ref``."""
        self.wxr.wtp.start_page("")
        root = self.wxr.wtp.parse(":[1] example1 <ref>ref1A</ref>")

        page_data = [defaultdict(list)]
        page_data[-1]["senses"] = [
            defaultdict(list, {"senseid": "1"}),
        ]

        extract_examples(self.wxr, page_data, root.children[0])

        expected = [
            {
                "senses": [
                    {
                        "examples": [
                            {
                                "text": "example1",
                                "ref": {"raw_ref": "ref1A"},
                            },
                        ],
                        "senseid": "1",
                    },
                ]
            }
        ]
        self.assertEqual(page_data, expected)

    def test_de_extract_reference(self):
        """Named template arguments end up as lower-cased reference keys."""
        self.wxr.wtp.start_page("")
        self.wxr.wtp.add_page("Vorlage:Literatur", 10, "Expanded template")
        root = self.wxr.wtp.parse("<ref>{{Literatur|Titel=title}}</ref>")

        example_data = defaultdict(str)

        extract_reference(self.wxr, example_data, root.children[0])

        expected = {
            "ref": {"raw_ref": "Expanded template", "titel": "title"}
        }
        self.assertEqual(example_data, expected)

0 comments on commit 3b85751

Please sign in to comment.