Extract glosses from Spanish Wiktionary
This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR-10-LABX-0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France.
empiriker committed Nov 24, 2023
1 parent a821647 commit e5a4a0a
Showing 4 changed files with 159 additions and 3 deletions.
60 changes: 60 additions & 0 deletions src/wiktextract/extractor/es/gloss.py
@@ -0,0 +1,60 @@
import re
from typing import List
from wiktextract.extractor.es.models import Sense, WordEntry
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext
from wikitextprocessor import WikiNode, NodeKind
from wikitextprocessor.parser import WikiNodeChildrenList


def extract_gloss(
    wxr: WiktextractContext,
    page_data: List[WordEntry],
    list_node: WikiNode,
) -> None:
    for list_item in list_node.find_child(NodeKind.LIST_ITEM):
        gloss_data = Sense(glosses=[])

        definition: WikiNodeChildrenList = []
        other: WikiNodeChildrenList = []

        # Separate the gloss text from any nested sub-lists.
        for node in list_item.definition:
            if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
                other.append(node)
            else:
                definition.append(node)

        gloss = clean_node(wxr, gloss_data, definition)
        gloss_data.glosses.append(gloss)

        # The list-item header (before the ":") carries the sense number and
        # optional tag templates, e.g. "1 Humanidades."
        gloss_note = clean_node(wxr, gloss_data, list_item.children)

        match = re.match(r"^(\d+)", gloss_note)

        if match:
            gloss_data.senseid = int(match.group(1))
            tag_string = gloss_note[len(match.group(1)) :].strip()
        else:
            tag_string = gloss_note.strip()

        # Split tags on commas or the Spanish conjunction "y".
        tags = re.split(r",|y", tag_string)
        for tag in tags:
            tag = (
                tag.strip()
                .removesuffix(".")
                .removesuffix("Main")
                .removeprefix("Main")
            )
            if tag:
                gloss_data.tags.append(tag)

        if other:
            wxr.wtp.debug(
                f"Found nodes that are not part of definition: {other}",
                sortid="extractor/es/gloss/extract_gloss/46",
            )

        page_data[-1].senses.append(gloss_data)
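
For reference, here is a minimal standalone sketch of how the sense-number/tag parsing above behaves on a cleaned list-item header such as "1 Humanidades." (the string exercised by the category test further down). It is illustrative only, not part of the commit, and it simplifies away the "Main" handling:

import re

gloss_note = "1 Humanidades."  # hypothetical output of clean_node on list_item.children
match = re.match(r"^(\d+)", gloss_note)
senseid = int(match.group(1)) if match else None
tag_string = gloss_note[len(match.group(1)):].strip() if match else gloss_note.strip()
tags = [t.strip().removesuffix(".") for t in re.split(r",|y", tag_string) if t.strip()]
print(senseid, tags)  # prints: 1 ['Humanidades']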
3 changes: 3 additions & 0 deletions src/wiktextract/extractor/es/models.py
@@ -79,6 +79,9 @@ class Sense(LoggingExtraFieldsModel):
    subsenses: list["Sense"] = Field(
        default=[], description="List of subsenses"
    )
    senseid: Optional[int] = Field(
        default=None, description="Sense number used in Wiktionary"
    )


class WordEntry(LoggingExtraFieldsModel):
11 changes: 8 additions & 3 deletions src/wiktextract/extractor/es/page.py
@@ -5,6 +5,7 @@
 
 from wikitextprocessor import NodeKind, WikiNode
 from wiktextract.datautils import append_base_data
+from wiktextract.extractor.es.gloss import extract_gloss
 from wiktextract.extractor.es.pronunciation import extract_pronunciation
 from wiktextract.extractor.es.models import WordEntry, PydanticLogger

@@ -76,9 +77,13 @@ def process_pos_block(
         ):
             # XXX: Extract forms
             pass
-        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
-            # XXX: Extract data
-            pass
+        elif (
+            isinstance(child, WikiNode)
+            and child.kind == NodeKind.LIST
+            and child.sarg == ";"
+        ):
+            extract_gloss(wxr, page_data, child)
+
         else:
             # XXX: Extract data
             pass
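
The new branch only fires for definition lists whose wikitext starts with ";" (the ";1: …" lines in the tests below), which wikitextprocessor exposes as a LIST node whose sarg is ";". A rough standalone check of that assumption, reusing only calls that already appear in this commit (the page title and example text are made up):

from wikitextprocessor import NodeKind, Wtp

wtp = Wtp(lang_code="es")
wtp.start_page("ayudar")
root = wtp.parse(";1: contribuir esfuerzo o recursos para la realización de algo.")
list_node = root.children[0]
print(list_node.kind == NodeKind.LIST, repr(list_node.sarg))  # expected: True ';'
wtp.close_db_conn()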
88 changes: 88 additions & 0 deletions tests/test_es_gloss.py
@@ -0,0 +1,88 @@
from typing import List
import unittest

from wikitextprocessor import Wtp
from wiktextract.extractor.es.gloss import extract_gloss
from wiktextract.extractor.es.models import WordEntry

from wiktextract.config import WiktionaryConfig
from wiktextract.wxr_context import WiktextractContext


class TestESGloss(unittest.TestCase):
    def setUp(self) -> None:
        self.wxr = WiktextractContext(
            Wtp(lang_code="es"),
            WiktionaryConfig(dump_file_lang_code="es"),
        )

    def tearDown(self) -> None:
        self.wxr.wtp.close_db_conn()

    def get_default_page_data(self) -> List[WordEntry]:
        return [WordEntry(word="test", lang_code="es", lang_name="Language")]

    def test_es_extract_glosses(self):
        # https://es.wiktionary.org/wiki/ayudar

        self.wxr.wtp.add_page("Plantilla:plm", 10, "Contribuir")
        self.wxr.wtp.start_page("")

        root = self.wxr.wtp.parse(
            """;1: {{plm|contribuir}} [[esfuerzo]] o [[recurso]]s para la [[realización]] de algo.
;2: Por antonomasia, [[cooperar]] a que alguno [[salir|salga]] de una [[situación]] [[dificultoso|dificultosa]]"""
        )

        page_data = self.get_default_page_data()

        extract_gloss(self.wxr, page_data, root.children[0])

        self.assertEqual(
            page_data[0].model_dump(exclude_defaults=True)["senses"],
            [
                {
                    "glosses": [
                        "Contribuir esfuerzo o recursos para la realización de algo."
                    ],
                    "senseid": 1,
                },
                {
                    "glosses": [
                        "Por antonomasia, cooperar a que alguno salga de una situación dificultosa"
                    ],
                    "senseid": 2,
                },
            ],
        )

    def test_es_extract_gloss_categories(self):
        # https://es.wiktionary.org/wiki/amor
        self.wxr.wtp.add_page("Plantilla:plm", 10, "Sentimiento")
        self.wxr.wtp.add_page(
            "Plantilla:sentimientos",
            10,
            "Humanidades. [[Categoría:ES:Sentimientos]]",
        )
        self.wxr.wtp.start_page("")

        root = self.wxr.wtp.parse(
            ";1 {{sentimientos}}: {{plm|sentimiento}} [[afectivo]] de [[atracción]], [[unión]] y [[afinidad]] que se experimenta hacia una persona, animal o cosa"
        )

        page_data = self.get_default_page_data()

        extract_gloss(self.wxr, page_data, root.children[0])

        self.assertEqual(
            page_data[0].model_dump(exclude_defaults=True)["senses"],
            [
                {
                    "glosses": [
                        "Sentimiento afectivo de atracción, unión y afinidad que se experimenta hacia una persona, animal o cosa"
                    ],
                    "senseid": 1,
"tags": ["Humanidades."],
"categories": ["ES:Sentimientos"],
}
],
)
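
These are plain unittest cases, so they can be run on their own with the standard runner, e.g. python -m unittest tests.test_es_gloss (assuming the repository's usual layout, with the tests/ directory importable from the project root).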
