Merge pull request #936 from xxyzz/pt
[pt] improve linkage and pos section code, extract pronunciation section
xxyzz authored Dec 6, 2024
2 parents 7ec39a8 + d2ca145 commit bb46d54
Showing 11 changed files with 396 additions and 118 deletions.
55 changes: 36 additions & 19 deletions src/wiktextract/extractor/pt/linkage.py
@@ -107,29 +107,46 @@ def extract_linkage_list_item(
linkage_words = []
raw_tags = []
for node in list_item.children:
if isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
word = clean_node(wxr, None, node)
if word != "":
linkage_words.append(word)
elif isinstance(node, WikiNode) and node.kind == NodeKind.BOLD:
bold_str = clean_node(wxr, None, node)
if re.fullmatch(r"\d+", bold_str):
sense_index = int(bold_str)
if isinstance(node, TemplateNode):
match node.template_name:
case "link preto":
word = clean_node(
wxr, None, node.template_parameters.get(1, "")
)
if word != "":
linkage_words.append(word)
case "escopo2":
from .pos import extract_escopo2_template

raw_tags.extend(extract_escopo2_template(wxr, node))
elif isinstance(node, WikiNode):
match node.kind:
case NodeKind.LINK:
word = clean_node(wxr, None, node)
if word != "" and not word.startswith("Wikisaurus:"):
linkage_words.append(word)
case NodeKind.BOLD:
bold_str = clean_node(wxr, None, node)
if re.fullmatch(r"\d+", bold_str):
sense_index = int(bold_str)
case NodeKind.ITALIC:
raw_tag = clean_node(wxr, None, node)
if raw_tag != "":
raw_tags.append(raw_tag)
case NodeKind.LIST:
for child_list_item in node.find_child(NodeKind.LIST_ITEM):
extract_linkage_list_item(
wxr,
word_entry,
child_list_item,
linkage_type,
sense,
sense_index,
)
elif isinstance(node, str):
m = re.search(r"\((.+)\)", node)
if m is not None:
sense = m.group(1)
elif (
isinstance(node, TemplateNode)
and node.template_name == "link preto"
):
word = clean_node(wxr, None, node.template_parameters.get(1, ""))
if word != "":
linkage_words.append(word)
elif isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
raw_tag = clean_node(wxr, None, node)
if raw_tag != "":
raw_tags.append(raw_tag)

for word in linkage_words:
linkage = Linkage(
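For a hedged sense of what the refactored list-item handler above covers, consider a hypothetical synonyms item (illustrative wikitext, not taken from this change):

    * '''1''' ''informal'' [[casa]], {{link preto|lar}}
    *: [[moradia]]

The bold digit would set sense_index, the italic text would be collected as a raw tag, the plain link and the first argument of the "link preto" template would both become linkage words (links starting with "Wikisaurus:" are now skipped), and the nested item would be processed recursively with the same sense and sense index.
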
14 changes: 14 additions & 0 deletions src/wiktextract/extractor/pt/models.py
@@ -52,6 +52,19 @@ class Linkage(PortugueseBaseModel):
)


class Sound(PortugueseBaseModel):
ipa: str = Field(default="", description="International Phonetic Alphabet")
audio: str = Field(default="", description="Audio file name")
wav_url: str = ""
oga_url: str = ""
ogg_url: str = ""
mp3_url: str = ""
opus_url: str = ""
flac_url: str = ""
tags: list[str] = []
raw_tags: list[str] = []


class WordEntry(PortugueseBaseModel):
model_config = ConfigDict(title="Portuguese Wiktionary")
word: str = Field(description="Word string", min_length=1)
@@ -69,3 +82,4 @@ class WordEntry(PortugueseBaseModel):
synonyms: list[Linkage] = []
derived: list[Linkage] = []
etymology_texts: list[str] = []
sounds: list[Sound] = []
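
A minimal sketch of how the new model is meant to be used (the field values are invented for illustration):

    sound = Sound(ipa="/ˈka.zɐ/", raw_tags=["Brasil"])  # hypothetical values
    word_entry.sounds.append(sound)

The audio and *_url fields default to empty strings and are reserved for an audio file name and its transcoded URLs.
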
32 changes: 26 additions & 6 deletions src/wiktextract/extractor/pt/page.py
@@ -12,6 +12,7 @@
from .linkage import extract_expression_section, extract_linkage_section
from .models import Sense, WordEntry
from .pos import extract_pos_section
from .pronunciation import extract_pronunciation_section
from .section_titles import LINKAGE_SECTIONS, POS_DATA
from .translation import extract_translation_section

@@ -23,7 +24,7 @@ def parse_section(
level_node: LevelNode,
) -> None:
cats = {}
title_text = clean_node(wxr, cats, level_node.largs)
title_text = clean_node(wxr, cats, level_node.largs).strip("⁰¹²³⁴⁵⁶⁷⁸⁹")
if title_text in POS_DATA:
extract_pos_section(
wxr,
@@ -50,16 +51,35 @@ def parse_section(
)
elif title_text == "Etimologia":
extract_etymology_section(wxr, page_data, level_node)
elif title_text == "Pronúncia":
extract_pronunciation_section(wxr, page_data, level_node)

if title_text not in POS_DATA:
save_section_cats(
cats.get("categories", []), page_data, level_node, True
)
cats = {}
for link_node in level_node.find_child(NodeKind.LINK):
clean_node(wxr, cats, link_node)
for data in page_data:
if data.lang_code == page_data[-1].lang_code:
data.categories.extend(cats.get("categories", []))
save_section_cats(cats.get("categories", []), page_data, level_node, False)

if title_text != "Pronúncia":
for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
parse_section(wxr, page_data, base_data, next_level)


for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
parse_section(wxr, page_data, base_data, next_level)
def save_section_cats(
cats: list[str],
page_data: list[WordEntry],
level_node: LevelNode,
from_title: bool,
) -> None:
if not from_title or (from_title and level_node.kind == NodeKind.LEVEL2):
for data in page_data:
if data.lang_code == page_data[-1].lang_code:
data.categories.extend(cats)
elif len(page_data) > 0:
page_data[-1].categories.extend(cats)


def parse_page(
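Two details above are easy to miss. First, superscript digits are now stripped from section titles before the POS_DATA lookup, so a heading such as "Substantivo¹" (hypothetical example) matches the plain "Substantivo" entry:

    "Substantivo¹".strip("⁰¹²³⁴⁵⁶⁷⁸⁹")  # -> "Substantivo"

Second, the new save_section_cats helper shares title categories with every entry of the current language only for level-2 sections (otherwise they go to the most recent entry), while categories gathered from bare link nodes are always shared across entries of the current language.
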
31 changes: 21 additions & 10 deletions src/wiktextract/extractor/pt/pos.py
@@ -53,7 +53,7 @@ def extract_gloss_list_item(
if node.template_name == "escopo":
extract_escopo_template(wxr, sense, node)
elif node.template_name == "escopo2":
extract_escopo2_template(wxr, sense, node)
sense.raw_tags.extend(extract_escopo2_template(wxr, node))
else:
gloss_nodes.append(node)
elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
@@ -80,24 +80,25 @@ def extract_escopo_template(
for arg in range(2, 9):
if arg not in t_node.template_parameters:
break
sense.raw_tags.append(
clean_node(wxr, None, t_node.template_parameters[arg])
)
raw_tag = clean_node(wxr, None, t_node.template_parameters[arg])
if raw_tag != "":
sense.raw_tags.append(raw_tag)
clean_node(wxr, sense, t_node)


def extract_escopo2_template(
wxr: WiktextractContext,
sense: Sense,
t_node: TemplateNode,
) -> None:
) -> list[str]:
# https://pt.wiktionary.org/wiki/Predefinição:escopo2
raw_tags = []
for arg in range(1, 4):
if arg not in t_node.template_parameters:
break
sense.raw_tags.append(
clean_node(wxr, None, t_node.template_parameters[arg])
)
raw_tag = clean_node(wxr, None, t_node.template_parameters[arg])
if raw_tag != "":
raw_tags.append(raw_tag)
return raw_tags


def extract_example_list_item(
Expand All @@ -106,8 +107,13 @@ def extract_example_list_item(
list_item: WikiNode,
) -> None:
example = Example()
ref_nodes = []
for node in list_item.children:
if isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
if (
isinstance(node, WikiNode)
and node.kind == NodeKind.ITALIC
and example.text == ""
):
example.text = clean_node(wxr, None, node)
elif isinstance(node, HTMLNode) and node.tag == "small":
example.translation = clean_node(wxr, None, node)
@@ -131,5 +137,10 @@
example.text = clean_node(
wxr, sense, node.template_parameters.get(1, "")
)
else:
ref_nodes.append(node)

if example.text != "":
if example.ref == "":
example.ref = clean_node(wxr, sense, ref_nodes).strip(":() \n")
sense.examples.append(example)
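
Since extract_escopo2_template now returns the raw tags instead of writing them into a Sense, both call sites follow the same pattern; a hedged sketch of the call (the example values are invented):

    sense.raw_tags.extend(extract_escopo2_template(wxr, t_node))  # e.g. ["Brasil", "informal"]

This is what lets linkage.py reuse the helper for the "escopo2" template without constructing a Sense.
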
73 changes: 73 additions & 0 deletions src/wiktextract/extractor/pt/pronunciation.py
@@ -0,0 +1,73 @@
from wikitextprocessor.parser import (
LEVEL_KIND_FLAGS,
LevelNode,
NodeKind,
WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Sound, WordEntry
from .tags import translate_raw_tags


def extract_pronunciation_section(
wxr: WiktextractContext,
page_data: list[WordEntry],
level_node: LevelNode,
) -> None:
raw_tags = []
sounds = []
title_text = clean_node(wxr, None, level_node.largs)
if title_text not in ["", "Pronúncia"]:
raw_tags.append(title_text)

for list_node in level_node.find_child(NodeKind.LIST):
for list_item in list_node.find_child(NodeKind.LIST_ITEM):
sounds.extend(
extract_pronunciation_list_item(wxr, list_item, raw_tags)
)

for child_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
extract_pronunciation_section(wxr, page_data, child_level_node)

for data in page_data:
if data.lang_code == page_data[-1].lang_code:
for sound in sounds:
translate_raw_tags(sound)
data.sounds.append(sound)


def extract_pronunciation_list_item(
wxr: WiktextractContext, list_item: WikiNode, raw_tags: list[str]
) -> list[Sound]:
sounds = []
for index, node in enumerate(list_item.children):
if isinstance(node, str) and ":" in node:
raw_tag = clean_node(wxr, None, list_item.children[:index])
sound_value = clean_node(
wxr,
None,
[node[node.index(":") + 1 :]]
+ [
n
for n in list_item.children[index + 1 :]
if not (isinstance(n, WikiNode) and n.kind == NodeKind.LIST)
],
)
if sound_value != "":
sound = Sound(ipa=sound_value, raw_tags=raw_tags)
if raw_tag == "X-SAMPA":
sound.tags.append("X-SAMPA")
sounds.append(sound)
elif raw_tag != "":
raw_tags.append(raw_tag)
elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
for child_list_item in node.find_child(NodeKind.LIST_ITEM):
sounds.extend(
extract_pronunciation_list_item(
wxr, child_list_item, raw_tags
)
)

return sounds
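
To make the new extractor's intent concrete, here is a hedged input/output sketch; the wikitext and values are invented, not taken from a real page. Assume a "Pronúncia" section with a regional sub-heading and a list item whose label is a link that cleans to "X-SAMPA":

    ====Brasil====
    * AFI: /ˈka.zɐ/
    * [[X-SAMPA]]: /"ka.z6/

The sub-heading is added to the shared raw tags, each non-empty value after a colon becomes a Sound, and the X-SAMPA line also receives the "X-SAMPA" tag, so before translate_raw_tags runs the result would be roughly:

    Sound(ipa="/ˈka.zɐ/", raw_tags=["Brasil"])
    Sound(ipa='/"ka.z6/', tags=["X-SAMPA"], raw_tags=["Brasil"])
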
2 changes: 2 additions & 0 deletions src/wiktextract/extractor/pt/section_titles.py
@@ -30,5 +30,7 @@
LINKAGE_SECTIONS = {
"Antônimos": "antonyms",
"Sinônimos": "synonyms",
"Sinónimos/Sinônimos": "synonyms",
"Sinónimos": "synonyms",
"Verbetes derivados": "derived",
}
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/pt/translation.py
@@ -87,7 +87,7 @@ def extract_translation_list_item(
)
)
elif isinstance(node, str) and re.search(r"\(.+\)", node) is not None:
roman = node.strip("() ")
roman = node.strip("() \n")
for tr_data in translations:
tr_data.roman = roman
elif (
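The only change here adds "\n" to the strip set; a hedged micro-example with an invented string: previously the trailing newline blocked the closing parenthesis from being stripped, whereas now

    "(do inglês)\n".strip("() \n")  # -> "do inglês"

removes both the parentheses and the newline before the text is stored as roman.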
