Skip to content

Commit

Permalink
[pt] extract example list
Browse files Browse the repository at this point in the history
  • Loading branch information
xxyzz committed Dec 2, 2024
1 parent c23b79c commit fe6bb0c
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 5 deletions.
7 changes: 7 additions & 0 deletions src/wiktextract/extractor/pt/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,19 @@ class PortugueseBaseModel(BaseModel):
)


class Example(PortugueseBaseModel):
    """A usage example attached to a sense."""

    # Text of the example sentence; stays "" until the extractor fills it in.
    text: str = ""
    # presumably a translation of `text` — nothing in view sets it; confirm
    translation: str = ""
    # presumably the source/citation of the example — nothing in view sets it; confirm
    ref: str = ""


class Sense(PortugueseBaseModel):
    """One sense of a word entry: its glosses plus associated labels and examples."""

    # NOTE(review): the `[]` defaults rely on the base model copying defaults
    # per instance (pydantic behaviour) — confirm PortugueseBaseModel derives
    # from pydantic's BaseModel.
    glosses: list[str] = []
    tags: list[str] = []
    # Labels kept as cleaned page text (e.g. arguments of the "escopo" templates).
    raw_tags: list[str] = []
    categories: list[str] = []
    topics: list[str] = []
    # Examples gathered from the "*" sub-list nested under the gloss list item.
    examples: list[Example] = []


class WordEntry(PortugueseBaseModel):
Expand Down
21 changes: 17 additions & 4 deletions src/wiktextract/extractor/pt/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Sense, WordEntry
from .models import Example, Sense, WordEntry
from .section_titles import POS_DATA


Expand Down Expand Up @@ -30,11 +30,11 @@ def extract_pos_section(
def extract_gloss_list_item(
wxr: WiktextractContext,
word_entry: WordEntry,
list_item_node: WikiNode,
list_item: WikiNode,
) -> None:
gloss_nodes = []
sense = Sense()
for node in list_item_node.children:
for node in list_item.children:
if isinstance(node, TemplateNode):
if node.template_name == "escopo":
extract_escopo_template(wxr, sense, node)
Expand All @@ -43,7 +43,9 @@ def extract_gloss_list_item(
else:
gloss_nodes.append(node)
elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
pass
if node.sarg.endswith("*"):
for next_list_item in node.find_child(NodeKind.LIST_ITEM):
extract_example_list_item(wxr, sense, next_list_item)
else:
gloss_nodes.append(node)

Expand Down Expand Up @@ -80,3 +82,14 @@ def extract_escopo2_template(
sense.raw_tags.append(
clean_node(wxr, None, t_node.template_parameters[arg])
)


def extract_example_list_item(
    wxr: WiktextractContext,
    sense: Sense,
    list_item: WikiNode,
) -> None:
    """Add the example found in one list item to ``sense.examples``.

    The children of ``list_item`` are rendered to plain text with
    ``clean_node`` (which is also handed ``sense``). A list item whose
    rendered text is empty is ignored, so no blank example is recorded.
    """
    cleaned_text = clean_node(wxr, sense, list_item.children)
    if cleaned_text == "":
        # Nothing usable in this list item; leave the sense untouched.
        return
    sense.examples.append(Example(text=cleaned_text))
4 changes: 3 additions & 1 deletion tests/test_pt_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ def test_escopo(self):
"cão",
"""={{-pt-}}=
=={{Substantivo|pt}}==
# {{escopo|pt|Brasil|popular}} [[gênio]] do [[mal]] em geral ("capeta")""",
# {{escopo|pt|Brasil|popular}} [[gênio]] do [[mal]] em geral ("capeta")
#* ''O '''cão''' em forma de gente.''""",
)
self.assertEqual(
data,
Expand All @@ -66,6 +67,7 @@ def test_escopo(self):
],
"glosses": ['gênio do mal em geral ("capeta")'],
"raw_tags": ["Brasil", "popular"],
"examples": [{"text": "O cão em forma de gente."}],
}
],
"word": "cão",
Expand Down

0 comments on commit fe6bb0c

Please sign in to comment.