Skip to content

Commit

Permalink
Merge pull request #935 from xxyzz/pt
Browse files Browse the repository at this point in the history
[pt] extract synonyms, cognates and etymology sections
  • Loading branch information
xxyzz authored Dec 5, 2024
2 parents 7b9c5d7 + ecfeed2 commit 7ec39a8
Show file tree
Hide file tree
Showing 8 changed files with 267 additions and 14 deletions.
43 changes: 43 additions & 0 deletions src/wiktextract/extractor/pt/etymology.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from wikitextprocessor.parser import (
LEVEL_KIND_FLAGS,
LevelNode,
NodeKind,
WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import WordEntry


def extract_etymology_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: LevelNode,
) -> None:
    """Extract etymology texts and categories from an "Etimologia" section.

    Non-list nodes are buffered and flushed into a single text whenever a
    list node is reached (and once more after the loop); each list item
    becomes its own etymology text.  The collected texts/categories are
    attached to every entry sharing the language of the last entry in
    ``page_data``.
    """
    categories: dict = {}
    pending_nodes: list = []
    etymology_texts: list[str] = []

    def flush_pending() -> None:
        # Leading ":" markers are wikitext indentation; strip them off.
        text = clean_node(wxr, categories, pending_nodes).lstrip(": ")
        if text != "":
            etymology_texts.append(text)
        pending_nodes.clear()

    for child in level_node.children:
        if isinstance(child, WikiNode) and child.kind in LEVEL_KIND_FLAGS:
            break  # a sub-section begins; this section's content is done
        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            flush_pending()
            for list_item in child.find_child(NodeKind.LIST_ITEM):
                item_text = clean_node(wxr, categories, list_item.children)
                if item_text != "":
                    etymology_texts.append(item_text)
        else:
            pending_nodes.append(child)

    if len(pending_nodes) > 0:
        flush_pending()
    if len(page_data) > 0:
        target_lang = page_data[-1].lang_code
        for entry in page_data:
            if entry.lang_code == target_lang:
                entry.etymology_texts.extend(etymology_texts)
                entry.categories.extend(categories.get("categories", []))
93 changes: 91 additions & 2 deletions src/wiktextract/extractor/pt/linkage.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
from wikitextprocessor import LevelNode, NodeKind, WikiNode
import re

from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Linkage, WordEntry
from .tags import translate_raw_tags


def extract_expression_section(
Expand Down Expand Up @@ -34,7 +37,17 @@ def extract_expression_list_item(
elif not (isinstance(node, WikiNode) and node.kind == NodeKind.LIST):
sense_nodes.append(node)

sense_str = clean_node(wxr, None, sense_nodes)
sense_str = clean_node(
wxr,
None,
[
n
for n in sense_nodes
if not (
isinstance(n, TemplateNode) and n.template_name == "escopo2"
)
],
)
if sense_str != "":
gloss_list_item = WikiNode(NodeKind.LIST_ITEM, 0)
gloss_list_item.children = sense_nodes
Expand All @@ -48,3 +61,79 @@ def extract_expression_list_item(

if expression_data.word != "":
word_entry.expressions.append(expression_data)


def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
) -> None:
    """Walk a linkage section (synonyms, antonyms, derived terms, ...).

    A "fraseini" template sets the sense text and sense index that apply
    to the list items following it; every list item is then parsed into
    Linkage models stored on ``word_entry``'s ``linkage_type`` field.
    """
    current_sense, current_index = "", 0
    for child in level_node.children:
        is_fraseini = (
            isinstance(child, TemplateNode)
            and child.template_name == "fraseini"
        )
        if is_fraseini:
            current_sense, current_index = extract_fraseini_template(
                wxr, child
            )
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                extract_linkage_list_item(
                    wxr,
                    word_entry,
                    item,
                    linkage_type,
                    current_sense,
                    current_index,
                )


def extract_fraseini_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> tuple[str, int]:
    """Parse a "fraseini" template into ``(sense, sense_index)``.

    A trailing run of digits in the template's first argument is taken as
    the sense index; the remaining text (stripped) is the sense gloss.
    Without trailing digits the whole argument is the sense and the index
    stays 0.
    """
    first_arg = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    m = re.search(r"(\d+)$", first_arg)
    if m is None:
        return first_arg, 0
    return first_arg[: m.start()].strip(), int(m.group(1))


def extract_linkage_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    linkage_type: str,
    sense: str,
    sense_index: int,
) -> None:
    """Parse one linkage list item into Linkage models on ``word_entry``.

    Wiki links and "link preto" templates contribute linkage words; a
    bold number overrides the sense index; a parenthesised span in plain
    text overrides the sense; italic runs are collected as raw tags that
    apply to every word of the item.
    """
    collected_words: list[str] = []
    raw_tags: list[str] = []
    for child in list_item.children:
        if isinstance(child, str):
            # Plain text: a "(...)" span is an inline sense gloss.
            paren = re.search(r"\((.+)\)", child)
            if paren is not None:
                sense = paren.group(1)
            continue
        if (
            isinstance(child, TemplateNode)
            and child.template_name == "link preto"
        ):
            word = clean_node(wxr, None, child.template_parameters.get(1, ""))
            if word != "":
                collected_words.append(word)
        elif isinstance(child, WikiNode):
            if child.kind == NodeKind.LINK:
                word = clean_node(wxr, None, child)
                if word != "":
                    collected_words.append(word)
            elif child.kind == NodeKind.BOLD:
                # A bold integer marks which definition these words belong to.
                bold_text = clean_node(wxr, None, child)
                if re.fullmatch(r"\d+", bold_text):
                    sense_index = int(bold_text)
            elif child.kind == NodeKind.ITALIC:
                raw_tag = clean_node(wxr, None, child)
                if raw_tag != "":
                    raw_tags.append(raw_tag)

    for word in collected_words:
        linkage = Linkage(
            word=word, sense=sense, sense_index=sense_index, raw_tags=raw_tags
        )
        translate_raw_tags(linkage)
        getattr(word_entry, linkage_type).append(linkage)
8 changes: 8 additions & 0 deletions src/wiktextract/extractor/pt/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ class Linkage(PortugueseBaseModel):
tags: list[str] = []
raw_tags: list[str] = []
senses: list[Sense] = []
sense: str = ""
sense_index: int = Field(
default=0, ge=0, description="Number of the definition, start from 1"
)


class WordEntry(PortugueseBaseModel):
Expand All @@ -61,3 +65,7 @@ class WordEntry(PortugueseBaseModel):
raw_tags: list[str] = []
translations: list[Translation] = []
expressions: list[Linkage] = []
antonyms: list[Linkage] = []
synonyms: list[Linkage] = []
derived: list[Linkage] = []
etymology_texts: list[str] = []
16 changes: 13 additions & 3 deletions src/wiktextract/extractor/pt/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .linkage import extract_expression_section
from .etymology import extract_etymology_section
from .linkage import extract_expression_section, extract_linkage_section
from .models import Sense, WordEntry
from .pos import extract_pos_section
from .section_titles import POS_DATA
from .section_titles import LINKAGE_SECTIONS, POS_DATA
from .translation import extract_translation_section


Expand All @@ -32,14 +33,23 @@ def parse_section(
title_text,
cats.get("categories", []),
)
elif title_text == "Tradução":
elif title_text in ["Tradução", "Cognatos"]:
extract_translation_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
)
elif title_text == "Expressões":
extract_expression_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
)
elif title_text in LINKAGE_SECTIONS:
extract_linkage_section(
wxr,
page_data[-1] if len(page_data) > 0 else base_data,
level_node,
LINKAGE_SECTIONS[title_text],
)
elif title_text == "Etimologia":
extract_etymology_section(wxr, page_data, level_node)

cats = {}
for link_node in level_node.find_child(NodeKind.LINK):
Expand Down
8 changes: 8 additions & 0 deletions src/wiktextract/extractor/pt/section_titles.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
"Pronome": {"pos": "pron"},
"Substantivo": {"pos": "noun"},
"Verbo": {"pos": "verb"},
"Forma de substantivo": {"pos": "noun", "tags": ["form-of"]},
"Forma verbal": {"pos": "verb", "tags": ["form-of"]},
"Locução substantiva": {"pos": "phrase", "tags": ["substantive"]},
"Locução adjetiva": {"pos": "phrase", "tags": ["adjectival"]},
Expand All @@ -24,3 +25,10 @@
"Sigla": {"pos": "abbrev", "tags": ["abbreviation"]},
"Símbolo": {"pos": "symbol"},
}


# Maps Portuguese Wiktionary linkage section titles to the name of the
# WordEntry list field where the extracted Linkage models are stored.
LINKAGE_SECTIONS = {
    "Antônimos": "antonyms",
    "Sinônimos": "synonyms",
    "Verbetes derivados": "derived",
}
19 changes: 10 additions & 9 deletions src/wiktextract/extractor/pt/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,16 +125,17 @@ def extract_trad_template(
if arg not in t_node.template_parameters:
break
tr_str = clean_node(wxr, None, t_node.template_parameters.get(arg, ""))
translations.append(
Translation(
word=tr_str,
lang=lang_name,
lang_code=lang_code,
roman=roman,
sense=sense,
sense_index=sense_index,
if tr_str != "":
translations.append(
Translation(
word=tr_str,
lang=lang_name,
lang_code=lang_code,
roman=roman,
sense=sense,
sense_index=sense_index,
)
)
)
return translations


Expand Down
49 changes: 49 additions & 0 deletions tests/test_pt_etymology.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from unittest import TestCase

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.pt.page import parse_page
from wiktextract.wxr_context import WiktextractContext


class TestPtEtymology(TestCase):
    """Tests for the Portuguese Wiktionary etymology-section extractor."""

    maxDiff = None

    def setUp(self) -> None:
        # Build a minimal Portuguese extraction context; no language-code
        # filtering so every entry on the test page is captured.
        conf = WiktionaryConfig(
            dump_file_lang_code="pt",
            capture_language_codes=None,
        )
        self.wxr = WiktextractContext(
            Wtp(
                lang_code="pt",
                parser_function_aliases=conf.parser_function_aliases,
            ),
            conf,
        )

    def test_list(self):
        """Template output and a ":*" list item each become one etymology
        text, and the template's category link is captured."""
        self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
        # "etimo2" expands to a ":"-indented line containing a category
        # link plus the etymology text itself.
        self.wxr.wtp.add_page(
            "Predefinição:etimo2",
            10,
            """:[[Categoria:Entrada de étimo latino (Português)]]Do [[latim]] ''[[oculus#la|oculus]]''<small><sup> ([[:la:oculus|<span title="ver no Wikcionário em latim">la</span>]])</sup></small>.""",
        )
        data = parse_page(
            self.wxr,
            "olho",
            """={{-pt-}}=
==Substantivo==
# órgão
==Etimologia==
{{etimo2|la|oculus|pt}}
:* '''Datação''': [[w:século XIII|século XIII]]""",
        )
        # The leading ":" of the template expansion is stripped; the list
        # item is extracted as a separate text.
        self.assertEqual(
            data[0]["etymology_texts"],
            ["Do latim oculus⁽ˡᵃ⁾.", "Datação: século XIII"],
        )
        self.assertEqual(
            data[0]["categories"], ["Entrada de étimo latino (Português)"]
        )
45 changes: 45 additions & 0 deletions tests/test_pt_linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,3 +68,48 @@ def test_expression(self):
},
],
)

    def test_synonyms(self):
        """A bold number sets sense_index and a parenthesised span sets the
        sense text for the linked synonym words."""
        self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
        data = parse_page(
            self.wxr,
            "olho",
            """={{-pt-}}=
==Substantivo==
# órgão
===Sinônimos===
* De '''5''' (furo, na agulha, para a passagem de linhas ou fios): [[buraco]]""",
        )
        self.assertEqual(
            data[0]["synonyms"],
            [
                {
                    "word": "buraco",
                    "sense": "furo, na agulha, para a passagem de linhas ou fios",
                    "sense_index": 5,
                }
            ],
        )

    def test_link_preto(self):
        """A "fraseini" template supplies the sense for following items; a
        "link preto" template supplies the word and italic text becomes a
        raw tag."""
        self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
        data = parse_page(
            self.wxr,
            "olho",
            """={{-pt-}}=
==Substantivo==
# órgão
===Verbetes derivados===
{{fraseini|Nomes de animais derivados de ''olho''}}
* {{link preto|olho-branco}} (''[[species:Zosteropidae|Zosteropidae]]'')""",
        )
        self.assertEqual(
            data[0]["derived"],
            [
                {
                    "word": "olho-branco",
                    "sense": "Nomes de animais derivados de olho",
                    "raw_tags": ["Zosteropidae"],
                }
            ],
        )

0 comments on commit 7ec39a8

Please sign in to comment.