From 3b85751c8ce7e4ac38cd1ec1d1b4b8bf19ea9376 Mon Sep 17 00:00:00 2001 From: Empiriker Date: Thu, 5 Oct 2023 20:33:03 +0300 Subject: [PATCH] Extract examples with references from German Wiktionary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France. --- [Review notes; commentary only, not part of the commit message and not part of the diff. Purpose of this patch: it adds extract_examples() and extract_reference() in extractor/de/example.py, calls extract_examples() from parse_section() for each node yielded by level_node.find_child(NodeKind.LIST) when the section name is "Beispiele", and adds unit tests. extract_examples(): for each LIST_ITEM child it collects the HTML children whose tag is "ref" via find_and_remove_child() (presumably also detaching them, going by the helper's name; confirm in de/utils.py), hands each one to extract_reference(), cleans the remaining children to text, splits off the sense id with match_senseid(), stores non-empty text under "text", and appends the example dict to every sense in page_data[-1]["senses"] whose "senseid" equals that id. (a) An example whose sense id matches no sense is dropped without any debug output. (b) The debug message "without senseid and text" is emitted whenever no sense id was found and example_data is non-empty, i.e. also when text is present. extract_reference(): stores the cleaned ref content under "raw_ref"; when the ref has exactly one template child it presets "titel" from template_node.largs[0][0] and copies each name=value argument under the lower-cased name; with more than one template it only logs a debug message; in every case the result is stored in example_data["ref"]. (c) arg_name is lower-cased but not stripped before being used as a key, and arg_value is stored unstripped, although both are stripped for the emptiness check. (d) arg.split("=", 1) is evaluated twice for the same argument. (e) defaultdict() is created without a default factory, so it behaves like a plain dict. (f) largs[0][0].strip() presumably relies on that element being a str; TODO confirm. (g) NOTE(review): the wikitext literals in the two reference tests show no ref markup in this copy of the patch although the expected data contains a "ref" entry; confirm against the upstream commit.] src/wiktextract/extractor/de/example.py | 86 ++++++++++++++++++++ src/wiktextract/extractor/de/page.py | 4 + tests/test_de_example.py | 104 ++++++++++++++++++++++++ 3 files changed, 194 insertions(+) create mode 100644 src/wiktextract/extractor/de/example.py create mode 100644 tests/test_de_example.py diff --git a/src/wiktextract/extractor/de/example.py b/src/wiktextract/extractor/de/example.py new file mode 100644 index 000000000..155387448 --- /dev/null +++ b/src/wiktextract/extractor/de/example.py @@ -0,0 +1,86 @@ +from collections import defaultdict +from typing import Dict, List + + +from wikitextprocessor import NodeKind, WikiNode +from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid + +from wiktextract.page import clean_node +from wiktextract.wxr_context import WiktextractContext + + +def extract_examples( + wxr: WiktextractContext, + page_data: List[Dict], + list_node: WikiNode, +) -> None: + for list_item_node in list_node.find_child(NodeKind.LIST_ITEM): + example_data = defaultdict(str) + + ref_nodes = find_and_remove_child( + list_item_node, + NodeKind.HTML, + lambda html_node: html_node.tag == "ref", + ) + for ref_node in ref_nodes: + extract_reference(wxr, example_data, ref_node) + + example_text = clean_node(wxr, {}, list_item_node.children) + + senseid, example_text = match_senseid(example_text) + + if example_text: + example_data["text"] = example_text + + if senseid: + 
sense_data = [ + sense + for sense in page_data[-1]["senses"] + if sense["senseid"] == senseid + ] + + for sense in sense_data: + sense["examples"].append(example_data) + + else: + if example_data: + wxr.wtp.debug( + f"Found example data without senseid and text: {example_data}", + sortid="extractor/de/examples/extract_examples/28", + ) + + +def extract_reference( + wxr: WiktextractContext, example_data: Dict[str, str], ref_node: WikiNode +): + reference_data = defaultdict() + + reference_data["raw_ref"] = clean_node(wxr, {}, ref_node.children) + + template_nodes = list(ref_node.find_child(NodeKind.TEMPLATE)) + + if len(template_nodes) > 1: + wxr.wtp.debug( + f"Found unexpected number of templates in example: {template_nodes}", + sortid="extractor/de/examples/extract_examples/64", + ) + elif len(template_nodes) == 1: + template_node = template_nodes[0] + + # Sometimes the title is dynamically generated from the template name, + # so we preset the title. If specified in the template, it will be + # overwritten. 
+ reference_data["titel"] = template_node.largs[0][0].strip() + + for arg in template_node.largs[1:]: + arg = clean_node(wxr, {}, arg) + if not arg.strip(): + continue + splits = arg.split("=", 1) + if len(splits) != 2: + continue + arg_name, arg_value = arg.split("=", 1) + if arg_name.strip() and arg_value.strip(): + reference_data[arg_name.lower()] = arg_value + + example_data["ref"] = reference_data diff --git a/src/wiktextract/extractor/de/page.py b/src/wiktextract/extractor/de/page.py index e416a4064..0d2f1ca1a 100644 --- a/src/wiktextract/extractor/de/page.py +++ b/src/wiktextract/extractor/de/page.py @@ -10,6 +10,7 @@ from wiktextract.wxr_context import WiktextractContext from .gloss import extract_glosses +from .example import extract_examples # Templates that are used to form panels on pages and that should be ignored in # various positions @@ -70,6 +71,9 @@ def parse_section( wxr.wtp.start_subsection(section_name) if section_name == "Bedeutungen": extract_glosses(wxr, page_data, level_node) + if section_name == "Beispiele": + for list_node in level_node.find_child(NodeKind.LIST): + extract_examples(wxr, page_data, list_node) FORM_POS = { diff --git a/tests/test_de_example.py b/tests/test_de_example.py new file mode 100644 index 000000000..dd9a3086f --- /dev/null +++ b/tests/test_de_example.py @@ -0,0 +1,104 @@ +import unittest +from collections import defaultdict + +from wikitextprocessor import Wtp + +from wiktextract.config import WiktionaryConfig +from wiktextract.extractor.de.example import extract_examples, extract_reference + +from wiktextract.thesaurus import close_thesaurus_db +from wiktextract.wxr_context import WiktextractContext + + +class TestDEExample(unittest.TestCase): + maxDiff = None + + def setUp(self) -> None: + self.wxr = WiktextractContext( + Wtp(lang_code="de"), WiktionaryConfig(dump_file_lang_code="de") + ) + + def tearDown(self) -> None: + self.wxr.wtp.close_db_conn() + close_thesaurus_db( + self.wxr.thesaurus_db_path, 
self.wxr.thesaurus_db_conn + ) + + def test_de_extract_examples(self): + self.wxr.wtp.start_page("") + root = self.wxr.wtp.parse( + ":[1] example1A \n:[1] example1B\n:[2] example2\n:[3] example3" + ) + + page_data = [defaultdict(list)] + page_data[-1]["senses"] = [ + defaultdict(list, {"senseid": "1"}), + defaultdict(list, {"senseid": "2"}), + ] + + extract_examples(self.wxr, page_data, root.children[0]) + + self.assertEqual( + page_data, + [ + { + "senses": [ + { + "examples": [ + {"text": "example1A"}, + {"text": "example1B"}, + ], + "senseid": "1", + }, + { + "examples": [{"text": "example2"}], + "senseid": "2", + }, + ] + } + ], + ) + + def test_de_extract_example_with_reference(self): + self.wxr.wtp.start_page("") + root = self.wxr.wtp.parse(":[1] example1 ref1A") + + page_data = [defaultdict(list)] + page_data[-1]["senses"] = [ + defaultdict(list, {"senseid": "1"}), + ] + + extract_examples(self.wxr, page_data, root.children[0]) + + self.assertEqual( + page_data, + [ + { + "senses": [ + { + "examples": [ + { + "text": "example1", + "ref": {"raw_ref": "ref1A"}, + }, + ], + "senseid": "1", + }, + ] + } + ], + ) + + def test_de_extract_reference(self): + self.wxr.wtp.start_page("") + self.wxr.wtp.add_page("Vorlage:Literatur", 10, "Expanded template") + root = self.wxr.wtp.parse("{{Literatur|Titel=title}}") + + example_data = defaultdict(str) + + extract_reference(self.wxr, example_data, root.children[0]) + + self.assertEqual( + example_data, + {"ref": {"raw_ref": "Expanded template", "titel": "title"}}, + )