[pt] improve linkage and pos section code, extract pronunciation section #936

Merged · 4 commits · Dec 6, 2024
55 changes: 36 additions & 19 deletions src/wiktextract/extractor/pt/linkage.py
@@ -107,29 +107,46 @@ def extract_linkage_list_item(
linkage_words = []
raw_tags = []
for node in list_item.children:
if isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
word = clean_node(wxr, None, node)
if word != "":
linkage_words.append(word)
elif isinstance(node, WikiNode) and node.kind == NodeKind.BOLD:
bold_str = clean_node(wxr, None, node)
if re.fullmatch(r"\d+", bold_str):
sense_index = int(bold_str)
if isinstance(node, TemplateNode):
match node.template_name:
case "link preto":
word = clean_node(
wxr, None, node.template_parameters.get(1, "")
)
if word != "":
linkage_words.append(word)
case "escopo2":
from .pos import extract_escopo2_template

raw_tags.extend(extract_escopo2_template(wxr, node))
elif isinstance(node, WikiNode):
match node.kind:
case NodeKind.LINK:
word = clean_node(wxr, None, node)
if word != "" and not word.startswith("Wikisaurus:"):
linkage_words.append(word)
case NodeKind.BOLD:
bold_str = clean_node(wxr, None, node)
if re.fullmatch(r"\d+", bold_str):
sense_index = int(bold_str)
case NodeKind.ITALIC:
raw_tag = clean_node(wxr, None, node)
if raw_tag != "":
raw_tags.append(raw_tag)
case NodeKind.LIST:
for child_list_item in node.find_child(NodeKind.LIST_ITEM):
extract_linkage_list_item(
wxr,
word_entry,
child_list_item,
linkage_type,
sense,
sense_index,
)
elif isinstance(node, str):
m = re.search(r"\((.+)\)", node)
if m is not None:
sense = m.group(1)
elif (
isinstance(node, TemplateNode)
and node.template_name == "link preto"
):
word = clean_node(wxr, None, node.template_parameters.get(1, ""))
if word != "":
linkage_words.append(word)
elif isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
raw_tag = clean_node(wxr, None, node)
if raw_tag != "":
raw_tags.append(raw_tag)

for word in linkage_words:
linkage = Linkage(
14 changes: 14 additions & 0 deletions src/wiktextract/extractor/pt/models.py
@@ -52,6 +52,19 @@ class Linkage(PortugueseBaseModel):
)


class Sound(PortugueseBaseModel):
ipa: str = Field(default="", description="International Phonetic Alphabet")
audio: str = Field(default="", description="Audio file name")
wav_url: str = ""
oga_url: str = ""
ogg_url: str = ""
mp3_url: str = ""
opus_url: str = ""
flac_url: str = ""
tags: list[str] = []
raw_tags: list[str] = []


class WordEntry(PortugueseBaseModel):
model_config = ConfigDict(title="Portuguese Wiktionary")
word: str = Field(description="Word string", min_length=1)
@@ -69,3 +82,4 @@
synonyms: list[Linkage] = []
derived: list[Linkage] = []
etymology_texts: list[str] = []
sounds: list[Sound] = []
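
The new `Sound` model only has defaulted fields, so it can be instantiated piecemeal as sound data is found. A minimal sketch of how it might be populated, assuming pydantic v2 and the field names shown in the diff above (the IPA string and raw tag are invented sample values):

```python
# Hypothetical usage of the new Sound model; the values are made up,
# only the field names come from the diff above.
from wiktextract.extractor.pt.models import Sound

sound = Sound(ipa="/ˈka.zɐ/", raw_tags=["Portugal"])
# Every field has a default, so only the populated ones survive this dump.
print(sound.model_dump(exclude_defaults=True))
# roughly: {'ipa': '/ˈka.zɐ/', 'raw_tags': ['Portugal']}
```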
32 changes: 26 additions & 6 deletions src/wiktextract/extractor/pt/page.py
@@ -12,6 +12,7 @@
from .linkage import extract_expression_section, extract_linkage_section
from .models import Sense, WordEntry
from .pos import extract_pos_section
from .pronunciation import extract_pronunciation_section
from .section_titles import LINKAGE_SECTIONS, POS_DATA
from .translation import extract_translation_section

@@ -23,7 +24,7 @@ def parse_section(
level_node: LevelNode,
) -> None:
cats = {}
title_text = clean_node(wxr, cats, level_node.largs)
title_text = clean_node(wxr, cats, level_node.largs).strip("⁰¹²³⁴⁵⁶⁷⁸⁹")
if title_text in POS_DATA:
extract_pos_section(
wxr,
@@ -50,16 +51,35 @@ def parse_section(
)
elif title_text == "Etimologia":
extract_etymology_section(wxr, page_data, level_node)
elif title_text == "Pronúncia":
extract_pronunciation_section(wxr, page_data, level_node)

if title_text not in POS_DATA:
save_section_cats(
cats.get("categories", []), page_data, level_node, True
)
cats = {}
for link_node in level_node.find_child(NodeKind.LINK):
clean_node(wxr, cats, link_node)
for data in page_data:
if data.lang_code == page_data[-1].lang_code:
data.categories.extend(cats.get("categories", []))
save_section_cats(cats.get("categories", []), page_data, level_node, False)

if title_text != "Pronúncia":
for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
parse_section(wxr, page_data, base_data, next_level)


for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
parse_section(wxr, page_data, base_data, next_level)
def save_section_cats(
cats: list[str],
page_data: list[WordEntry],
level_node: LevelNode,
from_title: bool,
) -> None:
if not from_title or (from_title and level_node.kind == NodeKind.LEVEL2):
for data in page_data:
if data.lang_code == page_data[-1].lang_code:
data.categories.extend(cats)
elif len(page_data) > 0:
page_data[-1].categories.extend(cats)


def parse_page(
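
A side note on the `parse_section` change above: pt.wiktionary section headings sometimes carry superscript index digits, and the new `.strip("⁰¹²³⁴⁵⁶⁷⁸⁹")` removes them before the title is looked up in `POS_DATA`. A tiny illustration with a made-up heading:

```python
# "Substantivo¹" is a hypothetical heading; only leading/trailing
# superscript digits are stripped, the rest of the title is untouched.
title_text = "Substantivo¹".strip("⁰¹²³⁴⁵⁶⁷⁸⁹")
print(title_text)  # -> Substantivo
```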
31 changes: 21 additions & 10 deletions src/wiktextract/extractor/pt/pos.py
@@ -53,7 +53,7 @@ def extract_gloss_list_item(
if node.template_name == "escopo":
extract_escopo_template(wxr, sense, node)
elif node.template_name == "escopo2":
extract_escopo2_template(wxr, sense, node)
sense.raw_tags.extend(extract_escopo2_template(wxr, node))
else:
gloss_nodes.append(node)
elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
@@ -80,24 +80,25 @@ def extract_escopo_template(
for arg in range(2, 9):
if arg not in t_node.template_parameters:
break
sense.raw_tags.append(
clean_node(wxr, None, t_node.template_parameters[arg])
)
raw_tag = clean_node(wxr, None, t_node.template_parameters[arg])
if raw_tag != "":
sense.raw_tags.append(raw_tag)
clean_node(wxr, sense, t_node)


def extract_escopo2_template(
wxr: WiktextractContext,
sense: Sense,
t_node: TemplateNode,
) -> None:
) -> list[str]:
# https://pt.wiktionary.org/wiki/Predefinição:escopo2
raw_tags = []
for arg in range(1, 4):
if arg not in t_node.template_parameters:
break
sense.raw_tags.append(
clean_node(wxr, None, t_node.template_parameters[arg])
)
raw_tag = clean_node(wxr, None, t_node.template_parameters[arg])
if raw_tag != "":
raw_tags.append(raw_tag)
return raw_tags


def extract_example_list_item(
@@ -106,8 +107,13 @@ def extract_example_list_item(
list_item: WikiNode,
) -> None:
example = Example()
ref_nodes = []
for node in list_item.children:
if isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
if (
isinstance(node, WikiNode)
and node.kind == NodeKind.ITALIC
and example.text == ""
):
example.text = clean_node(wxr, None, node)
elif isinstance(node, HTMLNode) and node.tag == "small":
example.translation = clean_node(wxr, None, node)
@@ -131,5 +137,10 @@ def extract_example_list_item(
example.text = clean_node(
wxr, sense, node.template_parameters.get(1, "")
)
else:
ref_nodes.append(node)

if example.text != "":
if example.ref == "":
example.ref = clean_node(wxr, sense, ref_nodes).strip(":() \n")
sense.examples.append(example)
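
Since `extract_escopo2_template` now returns the raw tags instead of writing into a `Sense`, it can be shared with the linkage extractor. A rough sketch of its parameter loop, using a plain dict in place of `TemplateNode.template_parameters` so it runs without a parser context (the argument values are invented; the real function also passes each argument through `clean_node`):

```python
# Stand-in for the parameters of e.g. {{escopo2|Brasil|informal}};
# the tag values here are invented examples.
template_parameters = {1: "Brasil", 2: "informal"}

raw_tags: list[str] = []
for arg in range(1, 4):  # escopo2 accepts up to three positional arguments
    if arg not in template_parameters:
        break
    raw_tag = str(template_parameters[arg]).strip()
    if raw_tag != "":
        raw_tags.append(raw_tag)

print(raw_tags)  # -> ['Brasil', 'informal']
```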
73 changes: 73 additions & 0 deletions src/wiktextract/extractor/pt/pronunciation.py
@@ -0,0 +1,73 @@
from wikitextprocessor.parser import (
LEVEL_KIND_FLAGS,
LevelNode,
NodeKind,
WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Sound, WordEntry
from .tags import translate_raw_tags


def extract_pronunciation_section(
wxr: WiktextractContext,
page_data: list[WordEntry],
level_node: LevelNode,
) -> None:
raw_tags = []
sounds = []
title_text = clean_node(wxr, None, level_node.largs)
if title_text not in ["", "Pronúncia"]:
raw_tags.append(title_text)

for list_node in level_node.find_child(NodeKind.LIST):
for list_item in list_node.find_child(NodeKind.LIST_ITEM):
sounds.extend(
extract_pronunciation_list_item(wxr, list_item, raw_tags)
)

for child_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
extract_pronunciation_section(wxr, page_data, child_level_node)

for data in page_data:
if data.lang_code == page_data[-1].lang_code:
for sound in sounds:
translate_raw_tags(sound)
data.sounds.append(sound)


def extract_pronunciation_list_item(
wxr: WiktextractContext, list_item: WikiNode, raw_tags: list[str]
) -> list[Sound]:
sounds = []
for index, node in enumerate(list_item.children):
if isinstance(node, str) and ":" in node:
raw_tag = clean_node(wxr, None, list_item.children[:index])
sound_value = clean_node(
wxr,
None,
[node[node.index(":") + 1 :]]
+ [
n
for n in list_item.children[index + 1 :]
if not (isinstance(n, WikiNode) and n.kind == NodeKind.LIST)
],
)
if sound_value != "":
sound = Sound(ipa=sound_value, raw_tags=raw_tags)
if raw_tag == "X-SAMPA":
sound.tags.append("X-SAMPA")
sounds.append(sound)
elif raw_tag != "":
raw_tags.append(raw_tag)
elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
for child_list_item in node.find_child(NodeKind.LIST_ITEM):
sounds.extend(
extract_pronunciation_list_item(
wxr, child_list_item, raw_tags
)
)

return sounds
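
The list-item parser above splits each pronunciation line on the first colon: the text before it is a label (either an accumulated raw tag or the `X-SAMPA` tag), and the text after it is the transcription. A simplified, flattened sketch of that idea on plain strings (the sample lines are invented; the real function walks `WikiNode` children, recurses into nested lists, and cleans each piece with `clean_node`):

```python
# Invented sample lines in the shape of a "Pronúncia" list: a bare label
# item, then "label: transcription" items.
items = ["Brasil:", "AFI: /ˈka.za/", 'X-SAMPA: /"ka.za/']

raw_tags: list[str] = []  # labels without a value apply to later items
sounds: list[dict] = []
for item in items:
    label, _, value = item.partition(":")
    label, value = label.strip(), value.strip()
    if value != "":
        sound = {"ipa": value, "tags": [], "raw_tags": list(raw_tags)}
        if label == "X-SAMPA":
            sound["tags"].append("X-SAMPA")
        sounds.append(sound)
    elif label != "":
        raw_tags.append(label)

print(sounds)
# roughly: [{'ipa': '/ˈka.za/', 'tags': [], 'raw_tags': ['Brasil']},
#           {'ipa': '/"ka.za/', 'tags': ['X-SAMPA'], 'raw_tags': ['Brasil']}]
```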
2 changes: 2 additions & 0 deletions src/wiktextract/extractor/pt/section_titles.py
@@ -30,5 +30,7 @@
LINKAGE_SECTIONS = {
"Antônimos": "antonyms",
"Sinônimos": "synonyms",
"Sinónimos/Sinônimos": "synonyms",
"Sinónimos": "synonyms",
"Verbetes derivados": "derived",
}
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/pt/translation.py
@@ -87,7 +87,7 @@ def extract_translation_list_item(
)
)
elif isinstance(node, str) and re.search(r"\(.+\)", node) is not None:
roman = node.strip("() ")
roman = node.strip("() \n")
for tr_data in translations:
tr_data.roman = roman
elif (