Extract examples with references from German Wiktionary

This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France.
tatuylonen · Oct 13, 2023 · 3b85751 · 3b85751
1 parent e3254f6
commit 3b85751
Show file tree

Hide file tree

Showing 3 changed files with 194 additions and 0 deletions.
diff --git a/src/wiktextract/extractor/de/example.py b/src/wiktextract/extractor/de/example.py
@@ -0,0 +1,86 @@
+from collections import defaultdict
+from typing import Dict, List
+
+
+from wikitextprocessor import NodeKind, WikiNode
+from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid
+
+from wiktextract.page import clean_node
+from wiktextract.wxr_context import WiktextractContext
+
+
+def extract_examples(
+    wxr: WiktextractContext,
+    page_data: List[Dict],
+    list_node: WikiNode,
+) -> None:
+    for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
+        example_data = defaultdict(str)
+
+        ref_nodes = find_and_remove_child(
+            list_item_node,
+            NodeKind.HTML,
+            lambda html_node: html_node.tag == "ref",
+        )
+        for ref_node in ref_nodes:
+            extract_reference(wxr, example_data, ref_node)
+
+        example_text = clean_node(wxr, {}, list_item_node.children)
+
+        senseid, example_text = match_senseid(example_text)
+
+        if example_text:
+            example_data["text"] = example_text
+
+        if senseid:
+            sense_data = [
+                sense
+                for sense in page_data[-1]["senses"]
+                if sense["senseid"] == senseid
+            ]
+
+            for sense in sense_data:
+                sense["examples"].append(example_data)
+
+        else:
+            if example_data:
+                wxr.wtp.debug(
+                    f"Found example data without senseid and text: {example_data}",
+                    sortid="extractor/de/examples/extract_examples/28",
+                )
+
+
+def extract_reference(
+    wxr: WiktextractContext, example_data: Dict[str, str], ref_node: WikiNode
+):
+    reference_data = defaultdict()
+
+    reference_data["raw_ref"] = clean_node(wxr, {}, ref_node.children)
+
+    template_nodes = list(ref_node.find_child(NodeKind.TEMPLATE))
+
+    if len(template_nodes) > 1:
+        wxr.wtp.debug(
+            f"Found unexpected number of templates in example: {template_nodes}",
+            sortid="extractor/de/examples/extract_examples/64",
+        )
+    elif len(template_nodes) == 1:
+        template_node = template_nodes[0]
+
+        # Sometimes the title is dynamically generated from the template name,
+        # so we preset the title. If specified in the template, it will be
+        # overwritten.
+        reference_data["titel"] = template_node.largs[0][0].strip()
+
+        for arg in template_node.largs[1:]:
+            arg = clean_node(wxr, {}, arg)
+            if not arg.strip():
+                continue
+            splits = arg.split("=", 1)
+            if len(splits) != 2:
+                continue
+            arg_name, arg_value = arg.split("=", 1)
+            if arg_name.strip() and arg_value.strip():
+                reference_data[arg_name.lower()] = arg_value
+
+    example_data["ref"] = reference_data
diff --git a/src/wiktextract/extractor/de/page.py b/src/wiktextract/extractor/de/page.py
@@ -10,6 +10,7 @@
 from wiktextract.wxr_context import WiktextractContext
 
 from .gloss import extract_glosses
+from .example import extract_examples
 
 # Templates that are used to form panels on pages and that should be ignored in
 # various positions
@@ -70,6 +71,9 @@ def parse_section(
         wxr.wtp.start_subsection(section_name)
         if section_name == "Bedeutungen":
             extract_glosses(wxr, page_data, level_node)
+        if section_name == "Beispiele":
+            for list_node in level_node.find_child(NodeKind.LIST):
+                extract_examples(wxr, page_data, list_node)
 
 
 FORM_POS = {

diff --git a/tests/test_de_example.py b/tests/test_de_example.py
@@ -0,0 +1,104 @@
+import unittest
+from collections import defaultdict
+
+from wikitextprocessor import Wtp
+
+from wiktextract.config import WiktionaryConfig
+from wiktextract.extractor.de.example import extract_examples, extract_reference
+
+from wiktextract.thesaurus import close_thesaurus_db
+from wiktextract.wxr_context import WiktextractContext
+
+
+class TestDEExample(unittest.TestCase):
+    maxDiff = None
+
+    def setUp(self) -> None:
+        self.wxr = WiktextractContext(
+            Wtp(lang_code="de"), WiktionaryConfig(dump_file_lang_code="de")
+        )
+
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()
+        close_thesaurus_db(
+            self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
+        )
+
+    def test_de_extract_examples(self):
+        self.wxr.wtp.start_page("")
+        root = self.wxr.wtp.parse(
+            ":[1] example1A \n:[1] example1B\n:[2] example2\n:[3] example3"
+        )
+
+        page_data = [defaultdict(list)]
+        page_data[-1]["senses"] = [
+            defaultdict(list, {"senseid": "1"}),
+            defaultdict(list, {"senseid": "2"}),
+        ]
+
+        extract_examples(self.wxr, page_data, root.children[0])
+
+        self.assertEqual(
+            page_data,
+            [
+                {
+                    "senses": [
+                        {
+                            "examples": [
+                                {"text": "example1A"},
+                                {"text": "example1B"},
+                            ],
+                            "senseid": "1",
+                        },
+                        {
+                            "examples": [{"text": "example2"}],
+                            "senseid": "2",
+                        },
+                    ]
+                }
+            ],
+        )
+
+    def test_de_extract_example_with_reference(self):
+        self.wxr.wtp.start_page("")
+        root = self.wxr.wtp.parse(":[1] example1 <ref>ref1A</ref>")
+
+        page_data = [defaultdict(list)]
+        page_data[-1]["senses"] = [
+            defaultdict(list, {"senseid": "1"}),
+        ]
+
+        extract_examples(self.wxr, page_data, root.children[0])
+
+        self.assertEqual(
+            page_data,
+            [
+                {
+                    "senses": [
+                        {
+                            "examples": [
+                                {
+                                    "text": "example1",
+                                    "ref": {"raw_ref": "ref1A"},
+                                },
+                            ],
+                            "senseid": "1",
+                        },
+                    ]
+                }
+            ],
+        )
+
+    def test_de_extract_reference(self):
+        self.wxr.wtp.start_page("")
+        self.wxr.wtp.add_page("Vorlage:Literatur", 10, "Expanded template")
+        root = self.wxr.wtp.parse("<ref>{{Literatur|Titel=title}}</ref>")
+
+        example_data = defaultdict(str)
+
+        extract_reference(self.wxr, example_data, root.children[0])
+
+        self.assertEqual(
+            example_data,
+            {"ref": {"raw_ref": "Expanded template", "titel": "title"}},
+        )