Skip to content

Commit

Permalink
[pt] extract etymology section
Browse files Browse the repository at this point in the history
  • Loading branch information
xxyzz committed Dec 5, 2024
1 parent b7d9464 commit ecfeed2
Show file tree
Hide file tree
Showing 6 changed files with 107 additions and 9 deletions.
43 changes: 43 additions & 0 deletions src/wiktextract/extractor/pt/etymology.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from wikitextprocessor.parser import (
LEVEL_KIND_FLAGS,
LevelNode,
NodeKind,
WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import WordEntry


def extract_etymology_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: LevelNode,
) -> None:
    """Collect etymology texts from a section and attach them to entries.

    The direct children of ``level_node`` are visited in order, stopping at
    the first ``WikiNode`` whose kind is in ``LEVEL_KIND_FLAGS``. Consecutive
    non-list nodes are cleaned together into a single text (leading ":" and
    space characters are stripped), while each item of a list node yields its
    own text. Empty texts are dropped.

    The resulting texts, plus any categories ``clean_node`` stored under the
    "categories" key, are appended to every entry in ``page_data`` whose
    ``lang_code`` equals the ``lang_code`` of the last entry.
    """
    collected_cats = {}
    pending = []  # non-list nodes gathered since the previous flush
    texts = []

    def flush_pending() -> None:
        # Clean the buffered nodes as one paragraph and keep it if non-empty.
        paragraph = clean_node(wxr, collected_cats, pending).lstrip(": ")
        if paragraph:
            texts.append(paragraph)

    for child in level_node.children:
        is_wiki_node = isinstance(child, WikiNode)
        if is_wiki_node and child.kind in LEVEL_KIND_FLAGS:
            break
        if not (is_wiki_node and child.kind == NodeKind.LIST):
            pending.append(child)
            continue
        # A list ends the current paragraph; its items become separate texts.
        flush_pending()
        pending.clear()
        for item in child.find_child(NodeKind.LIST_ITEM):
            item_text = clean_node(wxr, collected_cats, item.children)
            if item_text:
                texts.append(item_text)

    if pending:
        flush_pending()

    if not page_data:
        return
    target_lang = page_data[-1].lang_code
    category_names = collected_cats.get("categories", [])
    for entry in page_data:
        if entry.lang_code == target_lang:
            entry.etymology_texts.extend(texts)
            entry.categories.extend(category_names)
1 change: 1 addition & 0 deletions src/wiktextract/extractor/pt/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,3 +68,4 @@ class WordEntry(PortugueseBaseModel):
antonyms: list[Linkage] = []
synonyms: list[Linkage] = []
derived: list[Linkage] = []
etymology_texts: list[str] = []
3 changes: 3 additions & 0 deletions src/wiktextract/extractor/pt/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .etymology import extract_etymology_section
from .linkage import extract_expression_section, extract_linkage_section
from .models import Sense, WordEntry
from .pos import extract_pos_section
Expand Down Expand Up @@ -47,6 +48,8 @@ def parse_section(
level_node,
LINKAGE_SECTIONS[title_text],
)
elif title_text == "Etimologia":
extract_etymology_section(wxr, page_data, level_node)

cats = {}
for link_node in level_node.find_child(NodeKind.LINK):
Expand Down
1 change: 1 addition & 0 deletions src/wiktextract/extractor/pt/section_titles.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
"Pronome": {"pos": "pron"},
"Substantivo": {"pos": "noun"},
"Verbo": {"pos": "verb"},
"Forma de substantivo": {"pos": "noun", "tags": ["form-of"]},
"Forma verbal": {"pos": "verb", "tags": ["form-of"]},
"Locução substantiva": {"pos": "phrase", "tags": ["substantive"]},
"Locução adjetiva": {"pos": "phrase", "tags": ["adjectival"]},
Expand Down
19 changes: 10 additions & 9 deletions src/wiktextract/extractor/pt/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,16 +125,17 @@ def extract_trad_template(
if arg not in t_node.template_parameters:
break
tr_str = clean_node(wxr, None, t_node.template_parameters.get(arg, ""))
translations.append(
Translation(
word=tr_str,
lang=lang_name,
lang_code=lang_code,
roman=roman,
sense=sense,
sense_index=sense_index,
if tr_str != "":
translations.append(
Translation(
word=tr_str,
lang=lang_name,
lang_code=lang_code,
roman=roman,
sense=sense,
sense_index=sense_index,
)
)
)
return translations


Expand Down
49 changes: 49 additions & 0 deletions tests/test_pt_etymology.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from unittest import TestCase

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.pt.page import parse_page
from wiktextract.wxr_context import WiktextractContext


class TestPtEtymology(TestCase):
    """Etymology section tests for the Portuguese (pt) extractor."""

    maxDiff = None

    def setUp(self) -> None:
        """Build a fresh extraction context configured for "pt"."""
        config = WiktionaryConfig(
            dump_file_lang_code="pt", capture_language_codes=None
        )
        wtp = Wtp(
            lang_code="pt",
            parser_function_aliases=config.parser_function_aliases,
        )
        self.wxr = WiktextractContext(wtp, config)

    def test_list(self):
        """A template line followed by a list yields two etymology texts."""
        wtp = self.wxr.wtp
        wtp.add_page("Predefinição:-pt-", 10, "Português")
        etimo2_expansion = """:[[Categoria:Entrada de étimo latino (Português)]]Do [[latim]] ''[[oculus#la|oculus]]''<small><sup> ([[:la:oculus|<span title="ver no Wikcionário em latim">la</span>]])</sup></small>."""
        wtp.add_page("Predefinição:etimo2", 10, etimo2_expansion)
        page_text = """={{-pt-}}=
==Substantivo==
# órgão
==Etimologia==
{{etimo2|la|oculus|pt}}
:* '''Datação''': [[w:século XIII|século XIII]]"""

        entries = parse_page(self.wxr, "olho", page_text)
        first_entry = entries[0]

        expected_texts = ["Do latim oculus⁽ˡᵃ⁾.", "Datação: século XIII"]
        self.assertEqual(first_entry["etymology_texts"], expected_texts)
        expected_categories = ["Entrada de étimo latino (Português)"]
        self.assertEqual(first_entry["categories"], expected_categories)

0 comments on commit ecfeed2

Please sign in to comment.