Extract glosses from Spanish Wiktionary
This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR-10-LABX-0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France.
empiriker committed Nov 24, 2023
1 parent a821647 commit e5a4a0a
Showing 4 changed files with 159 additions and 3 deletions.
60 changes: 60 additions & 0 deletions src/wiktextract/extractor/es/gloss.py
@@ -0,0 +1,60 @@
import re
from typing import List
from wiktextract.extractor.es.models import Sense, WordEntry
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext
from wikitextprocessor import WikiNode, NodeKind
from wikitextprocessor.parser import WikiNodeChildrenList


def extract_gloss(
    wxr: WiktextractContext,
    page_data: List[WordEntry],
    list_node: WikiNode,
) -> None:
    for list_item in list_node.find_child(NodeKind.LIST_ITEM):
        gloss_data = Sense(glosses=[])

        definition: WikiNodeChildrenList = []
        other: WikiNodeChildrenList = []

        # Separate the gloss text from any nested sub-lists.
        for node in list_item.definition:
            if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
                other.append(node)
            else:
                definition.append(node)

        gloss = clean_node(wxr, gloss_data, definition)
        gloss_data.glosses.append(gloss)

        # The list-item header (before the ":") carries the sense number and
        # optional tag templates, e.g. "1 Humanidades."
        gloss_note = clean_node(wxr, gloss_data, list_item.children)

        match = re.match(r"^(\d+)", gloss_note)

        if match:
            gloss_data.senseid = int(match.group(1))
            tag_string = gloss_note[len(match.group(1)) :].strip()
        else:
            tag_string = gloss_note.strip()

        # Split tags on commas or the Spanish conjunction "y".
        tags = re.split(r",|y", tag_string)
        for tag in tags:
            tag = (
                tag.strip()
                .removesuffix(".")
                .removesuffix("Main")
                .removeprefix("Main")
            )
            if tag:
                gloss_data.tags.append(tag)

        if other:
            wxr.wtp.debug(
                f"Found nodes that are not part of definition: {other}",
                sortid="extractor/es/gloss/extract_gloss/46",
            )

        page_data[-1].senses.append(gloss_data)
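
For reference, here is a minimal standalone sketch of how the sense-number/tag parsing above behaves on a cleaned list-item header such as "1 Humanidades." (the string exercised by the category test further down). It is illustrative only, not part of the commit, and it simplifies away the "Main" handling:

import re

gloss_note = "1 Humanidades."  # hypothetical output of clean_node on list_item.children
match = re.match(r"^(\d+)", gloss_note)
senseid = int(match.group(1)) if match else None
tag_string = gloss_note[len(match.group(1)):].strip() if match else gloss_note.strip()
tags = [t.strip().removesuffix(".") for t in re.split(r",|y", tag_string) if t.strip()]
print(senseid, tags)  # prints: 1 ['Humanidades']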
3 changes: 3 additions & 0 deletions src/wiktextract/extractor/es/models.py
@@ -79,6 +79,9 @@ class Sense(LoggingExtraFieldsModel):
    subsenses: list["Sense"] = Field(
        default=[], description="List of subsenses"
    )
    senseid: Optional[int] = Field(
        default=None, description="Sense number used in Wiktionary"
    )


class WordEntry(LoggingExtraFieldsModel):
11 changes: 8 additions & 3 deletions src/wiktextract/extractor/es/page.py
@@ -5,6 +5,7 @@
 
 from wikitextprocessor import NodeKind, WikiNode
 from wiktextract.datautils import append_base_data
+from wiktextract.extractor.es.gloss import extract_gloss
 from wiktextract.extractor.es.pronunciation import extract_pronunciation
 from wiktextract.extractor.es.models import WordEntry, PydanticLogger

@@ -76,9 +77,13 @@ def process_pos_block(
         ):
             # XXX: Extract forms
             pass
-        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
-            # XXX: Extract data
-            pass
+        elif (
+            isinstance(child, WikiNode)
+            and child.kind == NodeKind.LIST
+            and child.sarg == ";"
+        ):
+            extract_gloss(wxr, page_data, child)
+
         else:
             # XXX: Extract data
             pass
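
The new branch only fires for definition lists whose wikitext starts with ";" (the ";1: …" lines in the tests below), which wikitextprocessor exposes as a LIST node whose sarg is ";". A rough standalone check of that assumption, reusing only calls that already appear in this commit (the page title and example text are made up):

from wikitextprocessor import NodeKind, Wtp

wtp = Wtp(lang_code="es")
wtp.start_page("ayudar")
root = wtp.parse(";1: contribuir esfuerzo o recursos para la realización de algo.")
list_node = root.children[0]
print(list_node.kind == NodeKind.LIST, repr(list_node.sarg))  # expected: True ';'
wtp.close_db_conn()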
88 changes: 88 additions & 0 deletions tests/test_es_gloss.py
@@ -0,0 +1,88 @@
from typing import List
import unittest

from wikitextprocessor import Wtp
from wiktextract.extractor.es.gloss import extract_gloss
from wiktextract.extractor.es.models import WordEntry

from wiktextract.config import WiktionaryConfig
from wiktextract.wxr_context import WiktextractContext


class TestESGloss(unittest.TestCase):
    def setUp(self) -> None:
        self.wxr = WiktextractContext(
            Wtp(lang_code="es"),
            WiktionaryConfig(dump_file_lang_code="es"),
        )

    def tearDown(self) -> None:
        self.wxr.wtp.close_db_conn()

    def get_default_page_data(self) -> List[WordEntry]:
        return [WordEntry(word="test", lang_code="es", lang_name="Language")]

    def test_es_extract_glosses(self):
        # https://es.wiktionary.org/wiki/ayudar

        self.wxr.wtp.add_page("Plantilla:plm", 10, "Contribuir")
        self.wxr.wtp.start_page("")

        root = self.wxr.wtp.parse(
            """;1: {{plm|contribuir}} [[esfuerzo]] o [[recurso]]s para la [[realización]] de algo.
;2: Por antonomasia, [[cooperar]] a que alguno [[salir|salga]] de una [[situación]] [[dificultoso|dificultosa]]"""
        )

        page_data = self.get_default_page_data()

        extract_gloss(self.wxr, page_data, root.children[0])

        self.assertEqual(
            page_data[0].model_dump(exclude_defaults=True)["senses"],
            [
                {
                    "glosses": [
                        "Contribuir esfuerzo o recursos para la realización de algo."
                    ],
                    "senseid": 1,
                },
                {
                    "glosses": [
                        "Por antonomasia, cooperar a que alguno salga de una situación dificultosa"
                    ],
                    "senseid": 2,
                },
            ],
        )

    def test_es_extract_gloss_categories(self):
        # https://es.wiktionary.org/wiki/amor
        self.wxr.wtp.add_page("Plantilla:plm", 10, "Sentimiento")
        self.wxr.wtp.add_page(
            "Plantilla:sentimientos",
            10,
            "Humanidades. [[Categoría:ES:Sentimientos]]",
        )
        self.wxr.wtp.start_page("")

        root = self.wxr.wtp.parse(
            ";1 {{sentimientos}}: {{plm|sentimiento}} [[afectivo]] de [[atracción]], [[unión]] y [[afinidad]] que se experimenta hacia una persona, animal o cosa"
        )

        page_data = self.get_default_page_data()

        extract_gloss(self.wxr, page_data, root.children[0])

        self.assertEqual(
            page_data[0].model_dump(exclude_defaults=True)["senses"],
            [
                {
                    "glosses": [
                        "Sentimiento afectivo de atracción, unión y afinidad que se experimenta hacia una persona, animal o cosa"
                    ],
                    "senseid": 1,
"tags": ["Humanidades."],
"categories": ["ES:Sentimientos"],
}
],
)
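
These are plain unittest cases, so they can be run on their own with the standard runner, e.g. python -m unittest tests.test_es_gloss (assuming the repository's usual layout, with the tests/ directory importable from the project root).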
