-
Notifications
You must be signed in to change notification settings - Fork 87
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Extract glosses from Spanish Wiktionary
This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France.
- Loading branch information
Showing
4 changed files
with
159 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
import re | ||
from typing import List | ||
from wiktextract.extractor.es.models import Sense, WordEntry | ||
from wiktextract.page import clean_node | ||
from wiktextract.wxr_context import WiktextractContext | ||
from wikitextprocessor import WikiNode, NodeKind | ||
from wikitextprocessor.parser import WikiNodeChildrenList | ||
|
||
|
||
def _split_tags(tag_string: str) -> List[str]:
    """Split the tag portion of a gloss note into individual cleaned tags.

    Tags are separated by commas or by the standalone Spanish conjunction
    "y". Each piece is stripped of surrounding whitespace, a trailing ".",
    and a leading/trailing "Main"; empty pieces are dropped.
    """
    tags = []
    # ``\by\b`` matches "y" only as a whole word, so tags that merely
    # contain the letter (e.g. "Mayúscula") are not torn apart.
    for raw_tag in re.split(r",|\by\b", tag_string):
        tag = (
            raw_tag.strip()
            .removesuffix(".")
            .removesuffix("Main")
            .removeprefix("Main")
        )
        if tag:
            tags.append(tag)
    return tags


def extract_gloss(
    wxr: WiktextractContext,
    page_data: List[WordEntry],
    list_node: WikiNode,
) -> None:
    """Extract one sense per list item and append it to the last word entry.

    For every ``LIST_ITEM`` child of ``list_node``:

    * the item's ``definition`` nodes (minus nested lists) are cleaned into
      the gloss text;
    * the item's ``children`` are cleaned into a "gloss note"; a leading
      number becomes ``senseid`` and the remainder is split into ``tags``.

    Nested lists found inside a definition are not processed; they are only
    reported through ``wxr.wtp.debug``.

    Args:
        wxr: Extraction context passed on to ``clean_node`` and used for
            debug logging.
        page_data: Word entries collected so far; the new senses are
            appended to ``page_data[-1].senses``.
        list_node: The list node whose items hold the glosses.
    """
    for list_item in list_node.find_child(NodeKind.LIST_ITEM):
        gloss_data = Sense(glosses=[])

        definition: WikiNodeChildrenList = []
        other: WikiNodeChildrenList = []

        # NOTE(review): ``definition`` is presumably None for items that
        # have no ":" part — confirm against wikitextprocessor. Treat a
        # missing definition as empty instead of raising TypeError.
        for node in list_item.definition or []:
            if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
                other.append(node)
            else:
                definition.append(node)

        gloss = clean_node(wxr, gloss_data, definition)
        gloss_data.glosses.append(gloss)

        gloss_note = clean_node(wxr, gloss_data, list_item.children)

        match = re.match(r"^(\d+)", gloss_note)

        # Attribute access is used throughout, consistent with
        # ``gloss_data.glosses`` above and ``page_data[-1].senses`` below.
        if match:
            gloss_data.senseid = int(match.group(1))
            tag_string = gloss_note[len(match.group(1)) :].strip()
        else:
            # Keep ``tags`` a list: the note text is only parsed, never
            # stored as-is on the sense.
            tag_string = gloss_note.strip()

        gloss_data.tags.extend(_split_tags(tag_string))

        if other:
            wxr.wtp.debug(
                f"Found nodes that are not part of definition: {other}",
                sortid="extractor/es/gloss/extract_gloss/46",
            )

        page_data[-1].senses.append(gloss_data)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
from typing import List | ||
import unittest | ||
|
||
from wikitextprocessor import Wtp | ||
from wiktextract.extractor.es.gloss import extract_gloss | ||
from wiktextract.extractor.es.models import WordEntry | ||
|
||
from wiktextract.config import WiktionaryConfig | ||
from wiktextract.wxr_context import WiktextractContext | ||
|
||
|
||
class TestESGloss(unittest.TestCase):
    """Checks ``extract_gloss`` against snippets of Spanish Wiktionary markup."""

    def setUp(self) -> None:
        # Fresh context per test, configured for the Spanish edition.
        wtp = Wtp(lang_code="es")
        config = WiktionaryConfig(dump_file_lang_code="es")
        self.wxr = WiktextractContext(wtp, config)

    def tearDown(self) -> None:
        self.wxr.wtp.close_db_conn()

    def get_default_page_data(self) -> List[WordEntry]:
        """Return a single placeholder word entry to collect senses into."""
        entry = WordEntry(word="test", lang_code="es", lang_name="Language")
        return [entry]

    def _extracted_senses(self, wikitext: str):
        """Parse ``wikitext``, run ``extract_gloss`` on its first child and
        return the dumped senses of the resulting entry (defaults excluded).
        """
        parsed = self.wxr.wtp.parse(wikitext)
        entries = self.get_default_page_data()
        extract_gloss(self.wxr, entries, parsed.children[0])
        dumped = entries[0].model_dump(exclude_defaults=True)
        return dumped["senses"]

    def test_es_extract_glosses(self):
        # https://es.wiktionary.org/wiki/ayudar
        self.wxr.wtp.add_page("Plantilla:plm", 10, "Contribuir")
        self.wxr.wtp.start_page("")

        wikitext = """;1: {{plm|contribuir}} [[esfuerzo]] o [[recurso]]s para la [[realización]] de algo.
;2: Por antonomasia, [[cooperar]] a que alguno [[salir|salga]] de una [[situación]] [[dificultoso|dificultosa]]"""

        first_sense = {
            "glosses": [
                "Contribuir esfuerzo o recursos para la realización de algo."
            ],
            "senseid": 1,
        }
        second_sense = {
            "glosses": [
                "Por antonomasia, cooperar a que alguno salga de una situación dificultosa"
            ],
            "senseid": 2,
        }

        self.assertEqual(
            self._extracted_senses(wikitext),
            [first_sense, second_sense],
        )

    def test_es_extract_gloss_categories(self):
        # https://es.wiktionary.org/wiki/amor
        self.wxr.wtp.add_page("Plantilla:plm", 10, "Sentimiento")
        self.wxr.wtp.add_page(
            "Plantilla:sentimientos",
            10,
            "Humanidades. [[Categoría:ES:Sentimientos]]",
        )
        self.wxr.wtp.start_page("")

        wikitext = ";1 {{sentimientos}}: {{plm|sentimiento}} [[afectivo]] de [[atracción]], [[unión]] y [[afinidad]] que se experimenta hacia una persona, animal o cosa"

        expected_sense = {
            "glosses": [
                "Sentimiento afectivo de atracción, unión y afinidad que se experimenta hacia una persona, animal o cosa"
            ],
            "senseid": 1,
            "tags": ["Humanidades"],
            "categories": ["ES:Sentimientos"],
        }

        self.assertEqual(
            self._extracted_senses(wikitext),
            [expected_sense],
        )