-
Notifications
You must be signed in to change notification settings - Fork 88
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Extract examples with references from German Wiktionary
This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France.
- Loading branch information
Showing
3 changed files
with
194 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
from collections import defaultdict | ||
from typing import Dict, List | ||
|
||
|
||
from wikitextprocessor import NodeKind, WikiNode | ||
from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid | ||
|
||
from wiktextract.page import clean_node | ||
from wiktextract.wxr_context import WiktextractContext | ||
|
||
|
||
def extract_examples(
    wxr: WiktextractContext,
    page_data: List[Dict],
    list_node: WikiNode,
) -> None:
    """Attach the examples found in ``list_node`` to their senses.

    Every ``LIST_ITEM`` child of ``list_node`` is treated as one example.
    ``<ref>`` HTML children are handed to ``extract_reference``; the rest of
    the item is cleaned to text and split by ``match_senseid`` into a sense id
    and the example text. The example is appended to the ``"examples"`` list
    of every sense in ``page_data[-1]["senses"]`` whose ``"senseid"`` equals
    that id.

    Args:
        wxr: Extraction context; used for text cleaning and debug messages.
        page_data: Page entries collected so far. Only the last entry is
            read, and its senses are modified in place.
        list_node: The list node whose items hold the examples.

    Returns:
        None. Results are written into ``page_data``.
    """
    for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
        example_data = defaultdict(str)

        # Handle references before cleaning the item text.
        # NOTE(review): presumably find_and_remove_child detaches the <ref>
        # nodes so their content stays out of the example text - confirm.
        ref_nodes = find_and_remove_child(
            list_item_node,
            NodeKind.HTML,
            lambda html_node: html_node.tag == "ref",
        )
        for ref_node in ref_nodes:
            extract_reference(wxr, example_data, ref_node)

        example_text = clean_node(wxr, {}, list_item_node.children)
        senseid, example_text = match_senseid(example_text)

        if example_text:
            example_data["text"] = example_text

        if not senseid:
            # Without a sense id there is nothing to attach the example to;
            # report whatever was collected so it is not lost silently.
            if example_data:
                wxr.wtp.debug(
                    f"Found example data without senseid and text: {example_data}",
                    sortid="extractor/de/examples/extract_examples/28",
                )
            continue

        # Use .get() rather than indexing: the senses may be defaultdicts,
        # where indexing a missing "senseid" would insert an empty value into
        # the sense as a side effect (and a plain dict would raise KeyError).
        matching_senses = [
            sense
            for sense in page_data[-1]["senses"]
            if sense.get("senseid") == senseid
        ]

        if not matching_senses:
            # The example is still dropped, but no longer without a trace.
            wxr.wtp.debug(
                f"Found example for unknown senseid {senseid}: {example_data}",
                sortid="extractor/de/examples/extract_examples/46",
            )

        for sense in matching_senses:
            sense.setdefault("examples", []).append(example_data)
|
||
|
||
def extract_reference(
    wxr: WiktextractContext, example_data: Dict[str, str], ref_node: WikiNode
):
    """Extract reference information from a ``<ref>`` node.

    The cleaned text of the node is stored as ``"raw_ref"``. If the node
    contains exactly one template, its name is stored as ``"titel"`` and each
    ``name=value`` argument is stored under the lower-cased argument name.
    With more than one template only a debug message is emitted and no
    template data is stored.

    Args:
        wxr: Extraction context; used for text cleaning and debug messages.
        example_data: Example entry to update. Its ``"ref"`` key is set to
            the collected reference data, replacing any earlier value.
        ref_node: The ``<ref>`` node to read.
    """
    reference_data = {}

    reference_data["raw_ref"] = clean_node(wxr, {}, ref_node.children)

    template_nodes = list(ref_node.find_child(NodeKind.TEMPLATE))

    if len(template_nodes) > 1:
        wxr.wtp.debug(
            f"Found unexpected number of templates in example: {template_nodes}",
            sortid="extractor/de/examples/extract_examples/64",
        )
    elif len(template_nodes) == 1:
        template_node = template_nodes[0]

        # Sometimes the title is dynamically generated from the template name,
        # so we preset the title. If specified in the template, it will be
        # overwritten.
        # NOTE(review): assumes largs[0] holds the template-name parts; the
        # guard skips the preset when the first part is missing or not a
        # plain string instead of raising.
        name_parts = template_node.largs[0]
        if name_parts and isinstance(name_parts[0], str):
            reference_data["titel"] = name_parts[0].strip()

        for arg in template_node.largs[1:]:
            arg_text = clean_node(wxr, {}, arg)
            # Only "name=value" arguments are kept; positional arguments and
            # arguments with an empty name or value are skipped.
            arg_name, separator, arg_value = arg_text.partition("=")
            # Strip both sides so "Titel = title" yields key "titel" and
            # value "title" rather than keeping the padding around "=".
            arg_name = arg_name.strip()
            arg_value = arg_value.strip()
            if separator and arg_name and arg_value:
                reference_data[arg_name.lower()] = arg_value

    example_data["ref"] = reference_data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
import unittest | ||
from collections import defaultdict | ||
|
||
from wikitextprocessor import Wtp | ||
|
||
from wiktextract.config import WiktionaryConfig | ||
from wiktextract.extractor.de.example import extract_examples, extract_reference | ||
|
||
from wiktextract.thesaurus import close_thesaurus_db | ||
from wiktextract.wxr_context import WiktextractContext | ||
|
||
|
||
class TestDEExample(unittest.TestCase):
    """Tests for example and reference extraction of the German extractor."""

    maxDiff = None

    def setUp(self) -> None:
        """Create a fresh German extraction context for each test."""
        wtp = Wtp(lang_code="de")
        config = WiktionaryConfig(dump_file_lang_code="de")
        self.wxr = WiktextractContext(wtp, config)

    def tearDown(self) -> None:
        """Close the database connections opened by the context."""
        wxr = self.wxr
        wxr.wtp.close_db_conn()
        close_thesaurus_db(wxr.thesaurus_db_path, wxr.thesaurus_db_conn)

    @staticmethod
    def _page_with_senses(*senseids):
        """Return page data holding one entry with a sense per given id."""
        senses = [
            defaultdict(list, {"senseid": senseid}) for senseid in senseids
        ]
        return [defaultdict(list, {"senses": senses})]

    def test_de_extract_examples(self):
        """Examples are grouped under the sense with the matching id."""
        self.wxr.wtp.start_page("")
        tree = self.wxr.wtp.parse(
            ":[1] example1A \n:[1] example1B\n:[2] example2\n:[3] example3"
        )
        page_data = self._page_with_senses("1", "2")

        extract_examples(self.wxr, page_data, tree.children[0])

        # There is no sense "3", so example3 must not show up anywhere.
        expected = [
            {
                "senses": [
                    {
                        "examples": [
                            {"text": "example1A"},
                            {"text": "example1B"},
                        ],
                        "senseid": "1",
                    },
                    {
                        "examples": [{"text": "example2"}],
                        "senseid": "2",
                    },
                ]
            }
        ]
        self.assertEqual(page_data, expected)

    def test_de_extract_example_with_reference(self):
        """A <ref> inside an example is stored separately from the text."""
        self.wxr.wtp.start_page("")
        tree = self.wxr.wtp.parse(":[1] example1 <ref>ref1A</ref>")
        page_data = self._page_with_senses("1")

        extract_examples(self.wxr, page_data, tree.children[0])

        expected = [
            {
                "senses": [
                    {
                        "examples": [
                            {
                                "text": "example1",
                                "ref": {"raw_ref": "ref1A"},
                            },
                        ],
                        "senseid": "1",
                    },
                ]
            }
        ]
        self.assertEqual(page_data, expected)

    def test_de_extract_reference(self):
        """Template arguments inside a <ref> become reference fields."""
        self.wxr.wtp.start_page("")
        self.wxr.wtp.add_page("Vorlage:Literatur", 10, "Expanded template")
        tree = self.wxr.wtp.parse("<ref>{{Literatur|Titel=title}}</ref>")
        example_data = defaultdict(str)

        extract_reference(self.wxr, example_data, tree.children[0])

        expected = {"ref": {"raw_ref": "Expanded template", "titel": "title"}}
        self.assertEqual(example_data, expected)