Merge pull request #932 from xxyzz/pt
[pt] add parser function aliases configurations
xxyzz authored Dec 2, 2024
2 parents 94ba7e1 + fe6bb0c commit 7008139
Showing 8 changed files with 173 additions and 9 deletions.
2 changes: 2 additions & 0 deletions src/wiktextract/config.py
@@ -64,6 +64,7 @@ class WiktionaryConfig:
"save_ns_names",
"extract_ns_names",
"allowed_html_tags",
"parser_function_aliases",
)

def __init__(
@@ -128,6 +129,7 @@ def __init__(
# these are extracted namespaces
self.extract_ns_names = ["Main"]
self.allowed_html_tags: dict[str, HTMLTagData] = {}
self.parser_function_aliases: dict[str, str] = {}
self.load_edition_settings()

def merge_return(self, ret: CollatedErrorReturnData):
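The new `parser_function_aliases` attribute defaults to an empty dict; the per-edition value is expected to come from the settings file loaded by `load_edition_settings()` (called in `__init__` above, implementation not shown in this diff). A hypothetical sketch of that lookup, assuming the per-edition overrides live under `src/wiktextract/data/<edition>/config.json` as in the new file below:

```python
# Hypothetical sketch only; the actual load_edition_settings() implementation
# is not part of this diff. It assumes per-edition overrides are stored in
# src/wiktextract/data/<edition>/config.json, matching the new file below.
import json
from pathlib import Path


def load_edition_overrides(data_dir: Path, edition: str) -> dict:
    """Return the per-edition settings dict, or {} if the edition has none."""
    config_path = data_dir / edition / "config.json"
    if not config_path.exists():
        return {}
    with config_path.open(encoding="utf-8") as f:
        return json.load(f)


# Example usage (hypothetical):
# overrides = load_edition_overrides(Path("src/wiktextract/data"), "pt")
# overrides.get("parser_function_aliases", {})  # -> {"#se": "#if", ...}
```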
9 changes: 9 additions & 0 deletions src/wiktextract/data/pt/config.json
@@ -0,0 +1,9 @@
{
"parser_function_aliases": {
"#se": "#if",
"#seigual": "#ifeq",
"#seerro": "#iferror",
"#seexiste": "#ifexist",
"#seexpr": "#ifexpr"
}
}
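These entries map the Portuguese-localized parser function names onto the English built-ins that wikitextprocessor implements, so wikitext such as `{{#se: ... }}` on pt.wiktionary.org expands the same way as `{{#if: ... }}`. A minimal sketch of how such an alias table can be applied; the `resolve_parser_function` helper below is hypothetical, not the library's actual dispatch code:

```python
# Minimal illustrative sketch; resolve_parser_function is a hypothetical
# helper, not wikitextprocessor's actual parser function dispatch.
PT_PARSER_FUNCTION_ALIASES = {
    "#se": "#if",
    "#seigual": "#ifeq",
    "#seerro": "#iferror",
    "#seexiste": "#ifexist",
    "#seexpr": "#ifexpr",
}


def resolve_parser_function(name: str, aliases: dict[str, str]) -> str:
    """Map a localized parser function name to its canonical English form."""
    return aliases.get(name, name)


assert resolve_parser_function("#seigual", PT_PARSER_FUNCTION_ALIASES) == "#ifeq"
assert resolve_parser_function("#if", PT_PARSER_FUNCTION_ALIASES) == "#if"
```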
4 changes: 3 additions & 1 deletion src/wiktextract/extractor/de/pronunciation.py
@@ -81,6 +81,8 @@ def extract_audio_template(
for link_node in expanded_node.find_child(NodeKind.LINK):
link_str = clean_node(wxr, None, link_node)
if "(" in link_str:
sound.raw_tags.append(link_str[link_str.index("(") + 1:].strip(")"))
sound.raw_tags.append(
link_str[link_str.index("(") + 1 :].strip(")")
)
clean_node(wxr, sound, expanded_node)
return sound
7 changes: 7 additions & 0 deletions src/wiktextract/extractor/pt/models.py
@@ -10,12 +10,19 @@ class PortugueseBaseModel(BaseModel):
)


class Example(PortugueseBaseModel):
text: str = ""
translation: str = ""
ref: str = ""


class Sense(PortugueseBaseModel):
glosses: list[str] = []
tags: list[str] = []
raw_tags: list[str] = []
categories: list[str] = []
topics: list[str] = []
examples: list[Example] = []


class WordEntry(PortugueseBaseModel):
14 changes: 11 additions & 3 deletions src/wiktextract/extractor/pt/page.py
@@ -4,7 +4,6 @@
LEVEL_KIND_FLAGS,
LevelNode,
NodeKind,
WikiNode,
)

from ...page import clean_node
@@ -23,7 +22,14 @@ def parse_section(
cats = {}
title_text = clean_node(wxr, cats, level_node.largs)
if title_text in POS_DATA:
extract_pos_section(wxr, page_data, base_data, level_node, title_text)
extract_pos_section(
wxr,
page_data,
base_data,
level_node,
title_text,
cats.get("categories", []),
)


def parse_page(
@@ -35,7 +41,8 @@ def parse_page(
tree = wxr.wtp.parse(page_text)
page_data: list[WordEntry] = []
for level1_node in tree.find_child(NodeKind.LEVEL1):
lang_name = clean_node(wxr, None, level1_node.largs)
lang_cats = {}
lang_name = clean_node(wxr, lang_cats, level1_node.largs)
lang_code = "unknown"
for lang_template in level1_node.find_content(NodeKind.TEMPLATE):
lang_code = lang_template.template_name.strip("-")
@@ -51,6 +58,7 @@
lang_code=lang_code,
lang=lang_name,
pos="unknown",
categories=lang_cats.get("categories", []),
)
for next_level_node in level1_node.find_child(LEVEL_KIND_FLAGS):
parse_section(wxr, page_data, base_data, next_level_node)
65 changes: 61 additions & 4 deletions src/wiktextract/extractor/pt/pos.py
@@ -1,8 +1,8 @@
from wikitextprocessor import LevelNode, NodeKind, WikiNode
from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Sense, WordEntry
from .models import Example, Sense, WordEntry
from .section_titles import POS_DATA


@@ -12,12 +12,14 @@ def extract_pos_section(
base_data: WordEntry,
level_node: LevelNode,
pos_title: str,
categories: list[str],
) -> None:
page_data.append(base_data.model_copy(deep=True))
page_data[-1].pos_title = pos_title
pos_data = POS_DATA[pos_title]
page_data[-1].pos = pos_data["pos"]
page_data[-1].tags.extend(pos_data.get("tags", []))
page_data[-1].categories.extend(categories)

for list_index, list_node in level_node.find_child(NodeKind.LIST, True):
if list_node.sarg.startswith("#") and list_node.sarg.endswith("#"):
@@ -28,11 +30,66 @@ def extract_pos_section(
def extract_gloss_list_item(
wxr: WiktextractContext,
word_entry: WordEntry,
list_item_node: WikiNode,
list_item: WikiNode,
) -> None:
gloss_nodes = list(list_item_node.invert_find_child(NodeKind.LIST))
gloss_nodes = []
sense = Sense()
for node in list_item.children:
if isinstance(node, TemplateNode):
if node.template_name == "escopo":
extract_escopo_template(wxr, sense, node)
elif node.template_name == "escopo2":
extract_escopo2_template(wxr, sense, node)
else:
gloss_nodes.append(node)
elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
if node.sarg.endswith("*"):
for next_list_item in node.find_child(NodeKind.LIST_ITEM):
extract_example_list_item(wxr, sense, next_list_item)
else:
gloss_nodes.append(node)

gloss_str = clean_node(wxr, sense, gloss_nodes)
if len(gloss_str) > 0:
sense.glosses.append(gloss_str)
word_entry.senses.append(sense)


def extract_escopo_template(
wxr: WiktextractContext,
sense: Sense,
t_node: TemplateNode,
) -> None:
# https://pt.wiktionary.org/wiki/Predefinição:escopo
for arg in range(2, 9):
if arg not in t_node.template_parameters:
break
sense.raw_tags.append(
clean_node(wxr, None, t_node.template_parameters[arg])
)
clean_node(wxr, sense, t_node)


def extract_escopo2_template(
wxr: WiktextractContext,
sense: Sense,
t_node: TemplateNode,
) -> None:
# https://pt.wiktionary.org/wiki/Predefinição:escopo2
for arg in range(1, 4):
    if arg not in t_node.template_parameters:
break
sense.raw_tags.append(
clean_node(wxr, None, t_node.template_parameters[arg])
)


def extract_example_list_item(
wxr: WiktextractContext,
sense: Sense,
list_item: WikiNode,
) -> None:
example = Example()
example.text = clean_node(wxr, sense, list_item.children)
if example.text != "":
sense.examples.append(example)
5 changes: 4 additions & 1 deletion src/wiktextract/wiktwords.py
@@ -395,8 +395,11 @@ def main():
wtp = Wtp(
db_path=args.db_path,
lang_code=args.dump_file_language_code,
template_override_funcs=template_override_fns,
template_override_funcs=template_override_fns
if args.dump_file_language_code == "en"
else {},
extension_tags=conf.allowed_html_tags,
parser_function_aliases=conf.parser_function_aliases,
quiet=args.quiet,
)
wxr = WiktextractContext(wtp, conf)
76 changes: 76 additions & 0 deletions tests/test_pt_gloss.py
@@ -0,0 +1,76 @@
from unittest import TestCase

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.pt.page import parse_page
from wiktextract.wxr_context import WiktextractContext


class TestPtGloss(TestCase):
maxDiff = None

def setUp(self) -> None:
conf = WiktionaryConfig(
dump_file_lang_code="pt",
capture_language_codes=None,
)
self.wxr = WiktextractContext(
Wtp(
lang_code="pt",
parser_function_aliases=conf.parser_function_aliases,
),
conf,
)

def test_escopo(self):
self.wxr.wtp.add_page(
"Predefinição:-pt-",
10,
"Português[[Categoria:!Entrada (Português)]]",
)
self.wxr.wtp.add_page(
"Predefinição:Substantivo",
10,
"Substantivo[[Categoria:Substantivo (Português)]]",
)
self.wxr.wtp.add_page(
"Predefinição:escopo",
10,
"""(''<span style="color:navy;">[[Categoria:Português brasileiro]]Brasil e&nbsp;[[Categoria:Coloquialismo (Português)]]popular</span>'')""",
)
data = parse_page(
self.wxr,
"cão",
"""={{-pt-}}=
=={{Substantivo|pt}}==
# {{escopo|pt|Brasil|popular}} [[gênio]] do [[mal]] em geral ("capeta")
#* ''O '''cão''' em forma de gente.''""",
)
self.assertEqual(
data,
[
{
"lang": "Português",
"lang_code": "pt",
"pos": "noun",
"pos_title": "Substantivo",
"categories": [
"!Entrada (Português)",
"Substantivo (Português)",
],
"senses": [
{
"categories": [
"Português brasileiro",
"Coloquialismo (Português)",
],
"glosses": ['gênio do mal em geral ("capeta")'],
"raw_tags": ["Brasil", "popular"],
"examples": [{"text": "O cão em forma de gente."}],
}
],
"word": "cão",
}
],
)
