diff --git a/src/wiktextract/extractor/pt/linkage.py b/src/wiktextract/extractor/pt/linkage.py
index 492a76d1..95afe165 100644
--- a/src/wiktextract/extractor/pt/linkage.py
+++ b/src/wiktextract/extractor/pt/linkage.py
@@ -107,29 +107,46 @@ def extract_linkage_list_item(
linkage_words = []
raw_tags = []
for node in list_item.children:
- if isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
- word = clean_node(wxr, None, node)
- if word != "":
- linkage_words.append(word)
- elif isinstance(node, WikiNode) and node.kind == NodeKind.BOLD:
- bold_str = clean_node(wxr, None, node)
- if re.fullmatch(r"\d+", bold_str):
- sense_index = int(bold_str)
+ if isinstance(node, TemplateNode):
+ match node.template_name:
+ case "link preto":
+ word = clean_node(
+ wxr, None, node.template_parameters.get(1, "")
+ )
+ if word != "":
+ linkage_words.append(word)
+ case "escopo2":
+ from .pos import extract_escopo2_template
+
+ raw_tags.extend(extract_escopo2_template(wxr, node))
+ elif isinstance(node, WikiNode):
+ match node.kind:
+ case NodeKind.LINK:
+ word = clean_node(wxr, None, node)
+ if word != "" and not word.startswith("Wikisaurus:"):
+ linkage_words.append(word)
+ case NodeKind.BOLD:
+ bold_str = clean_node(wxr, None, node)
+ if re.fullmatch(r"\d+", bold_str):
+ sense_index = int(bold_str)
+ case NodeKind.ITALIC:
+ raw_tag = clean_node(wxr, None, node)
+ if raw_tag != "":
+ raw_tags.append(raw_tag)
+ case NodeKind.LIST:
+ for child_list_item in node.find_child(NodeKind.LIST_ITEM):
+ extract_linkage_list_item(
+ wxr,
+ word_entry,
+ child_list_item,
+ linkage_type,
+ sense,
+ sense_index,
+ )
elif isinstance(node, str):
m = re.search(r"\((.+)\)", node)
if m is not None:
sense = m.group(1)
- elif (
- isinstance(node, TemplateNode)
- and node.template_name == "link preto"
- ):
- word = clean_node(wxr, None, node.template_parameters.get(1, ""))
- if word != "":
- linkage_words.append(word)
- elif isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
- raw_tag = clean_node(wxr, None, node)
- if raw_tag != "":
- raw_tags.append(raw_tag)
for word in linkage_words:
linkage = Linkage(
diff --git a/src/wiktextract/extractor/pt/models.py b/src/wiktextract/extractor/pt/models.py
index 98435499..e51e3755 100644
--- a/src/wiktextract/extractor/pt/models.py
+++ b/src/wiktextract/extractor/pt/models.py
@@ -52,6 +52,19 @@ class Linkage(PortugueseBaseModel):
)
+class Sound(PortugueseBaseModel):
+ # Pronunciation record; WordEntry.sounds holds a list of these.
+ # Transcription text of the pronunciation.
+ ipa: str = Field(default="", description="International Phonetic Alphabet")
+ audio: str = Field(default="", description="Audio file name")
+ # NOTE(review): presumably URLs of the audio file in each format; nothing
+ # visible in this change assigns them - confirm against the audio code.
+ wav_url: str = ""
+ oga_url: str = ""
+ ogg_url: str = ""
+ mp3_url: str = ""
+ opus_url: str = ""
+ flac_url: str = ""
+ # Normalised tags, e.g. "X-SAMPA" for a transcription labelled that way.
+ tags: list[str] = []
+ # Labels taken verbatim from the page (section titles, list-item labels).
+ raw_tags: list[str] = []
+
+
class WordEntry(PortugueseBaseModel):
model_config = ConfigDict(title="Portuguese Wiktionary")
word: str = Field(description="Word string", min_length=1)
@@ -69,3 +82,4 @@ class WordEntry(PortugueseBaseModel):
synonyms: list[Linkage] = []
derived: list[Linkage] = []
etymology_texts: list[str] = []
+ sounds: list[Sound] = []
diff --git a/src/wiktextract/extractor/pt/page.py b/src/wiktextract/extractor/pt/page.py
index 0e1fc3af..73c95fc4 100644
--- a/src/wiktextract/extractor/pt/page.py
+++ b/src/wiktextract/extractor/pt/page.py
@@ -12,6 +12,7 @@
from .linkage import extract_expression_section, extract_linkage_section
from .models import Sense, WordEntry
from .pos import extract_pos_section
+from .pronunciation import extract_pronunciation_section
from .section_titles import LINKAGE_SECTIONS, POS_DATA
from .translation import extract_translation_section
@@ -23,7 +24,7 @@ def parse_section(
level_node: LevelNode,
) -> None:
cats = {}
- title_text = clean_node(wxr, cats, level_node.largs)
+ title_text = clean_node(wxr, cats, level_node.largs).strip("⁰¹²³⁴⁵⁶⁷⁸⁹")
if title_text in POS_DATA:
extract_pos_section(
wxr,
@@ -50,16 +51,35 @@ def parse_section(
)
elif title_text == "Etimologia":
extract_etymology_section(wxr, page_data, level_node)
+ elif title_text == "Pronúncia":
+ extract_pronunciation_section(wxr, page_data, level_node)
+ if title_text not in POS_DATA:
+ save_section_cats(
+ cats.get("categories", []), page_data, level_node, True
+ )
cats = {}
for link_node in level_node.find_child(NodeKind.LINK):
clean_node(wxr, cats, link_node)
- for data in page_data:
- if data.lang_code == page_data[-1].lang_code:
- data.categories.extend(cats.get("categories", []))
+ save_section_cats(cats.get("categories", []), page_data, level_node, False)
+
+ if title_text != "Pronúncia":
+ for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
+ parse_section(wxr, page_data, base_data, next_level)
+
- for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
- parse_section(wxr, page_data, base_data, next_level)
+def save_section_cats(
+    cats: list[str],
+    page_data: list[WordEntry],
+    level_node: LevelNode,
+    from_title: bool,
+) -> None:
+    """Attach category names collected from a section to word entries.
+
+    Categories that come from a section title below level 2 go to the last
+    entry only.  All other categories (section body links, or a level-2
+    title) go to every entry sharing the last entry's language code.
+    Nothing happens when `page_data` is empty.
+    """
+    if not page_data:
+        return
+
+    if from_title and level_node.kind != NodeKind.LEVEL2:
+        page_data[-1].categories.extend(cats)
+        return
+
+    current_lang_code = page_data[-1].lang_code
+    for entry in page_data:
+        if entry.lang_code == current_lang_code:
+            entry.categories.extend(cats)
def parse_page(
diff --git a/src/wiktextract/extractor/pt/pos.py b/src/wiktextract/extractor/pt/pos.py
index cbb5e632..de75f36e 100644
--- a/src/wiktextract/extractor/pt/pos.py
+++ b/src/wiktextract/extractor/pt/pos.py
@@ -53,7 +53,7 @@ def extract_gloss_list_item(
if node.template_name == "escopo":
extract_escopo_template(wxr, sense, node)
elif node.template_name == "escopo2":
- extract_escopo2_template(wxr, sense, node)
+ sense.raw_tags.extend(extract_escopo2_template(wxr, node))
else:
gloss_nodes.append(node)
elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
@@ -80,24 +80,25 @@ def extract_escopo_template(
for arg in range(2, 9):
if arg not in t_node.template_parameters:
break
- sense.raw_tags.append(
- clean_node(wxr, None, t_node.template_parameters[arg])
- )
+ raw_tag = clean_node(wxr, None, t_node.template_parameters[arg])
+ if raw_tag != "":
+ sense.raw_tags.append(raw_tag)
clean_node(wxr, sense, t_node)
def extract_escopo2_template(
wxr: WiktextractContext,
- sense: Sense,
t_node: TemplateNode,
-) -> None:
+) -> list[str]:
# https://pt.wiktionary.org/wiki/Predefinição:escopo2
+ # Returns the cleaned, non-empty values of positional arguments 1-3 as raw
+ # tags; the caller decides which object's raw_tags receives them.
+ raw_tags = []
for arg in range(1, 4):
if arg not in t_node.template_parameters:
+ # Stop at the first missing positional argument.
break
- sense.raw_tags.append(
- clean_node(wxr, None, t_node.template_parameters[arg])
- )
+ raw_tag = clean_node(wxr, None, t_node.template_parameters[arg])
+ if raw_tag != "":
+ raw_tags.append(raw_tag)
+ return raw_tags
def extract_example_list_item(
@@ -106,8 +107,13 @@ def extract_example_list_item(
list_item: WikiNode,
) -> None:
example = Example()
+ ref_nodes = []
for node in list_item.children:
- if isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
+ if (
+ isinstance(node, WikiNode)
+ and node.kind == NodeKind.ITALIC
+ and example.text == ""
+ ):
example.text = clean_node(wxr, None, node)
elif isinstance(node, HTMLNode) and node.tag == "small":
example.translation = clean_node(wxr, None, node)
@@ -131,5 +137,10 @@ def extract_example_list_item(
example.text = clean_node(
wxr, sense, node.template_parameters.get(1, "")
)
+ else:
+ ref_nodes.append(node)
+
if example.text != "":
+ if example.ref == "":
+ example.ref = clean_node(wxr, sense, ref_nodes).strip(":() \n")
sense.examples.append(example)
diff --git a/src/wiktextract/extractor/pt/pronunciation.py b/src/wiktextract/extractor/pt/pronunciation.py
new file mode 100644
index 00000000..b15a8217
--- /dev/null
+++ b/src/wiktextract/extractor/pt/pronunciation.py
@@ -0,0 +1,73 @@
+from wikitextprocessor.parser import (
+ LEVEL_KIND_FLAGS,
+ LevelNode,
+ NodeKind,
+ WikiNode,
+)
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
+from .models import Sound, WordEntry
+from .tags import translate_raw_tags
+
+
+def extract_pronunciation_section(
+    wxr: WiktextractContext,
+    page_data: list[WordEntry],
+    level_node: LevelNode,
+) -> None:
+    """Collect sounds from a pronunciation section and its subsections.
+
+    The section's list items are parsed into `Sound` objects.  Child level
+    nodes are then handled by a recursive call, which appends their sounds
+    first.  Finally this section's own sounds are appended to every entry in
+    `page_data` whose language code equals that of the last entry.
+    """
+    heading = clean_node(wxr, None, level_node.largs)
+    # Any heading other than the generic one labels the sounds found below.
+    raw_tags = [] if heading in ["", "Pronúncia"] else [heading]
+
+    sounds = []
+    for list_node in level_node.find_child(NodeKind.LIST):
+        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+            sounds += extract_pronunciation_list_item(
+                wxr, list_item, raw_tags
+            )
+
+    for sub_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
+        extract_pronunciation_section(wxr, page_data, sub_level_node)
+
+    for entry in page_data:
+        if entry.lang_code != page_data[-1].lang_code:
+            continue
+        for sound in sounds:
+            translate_raw_tags(sound)
+            entry.sounds.append(sound)
+
+
+def extract_pronunciation_list_item(
+    wxr: WiktextractContext, list_item: WikiNode, raw_tags: list[str]
+) -> list[Sound]:
+    """Extract `Sound` objects from one pronunciation list item.
+
+    A text node containing ":" splits the item into a label (the nodes
+    before it) and a value (the text after the first ":" plus the following
+    non-list siblings).  A non-empty value becomes a `Sound` carrying the
+    current raw tags; a label without a value becomes an extra raw tag for
+    the nested list items that follow.
+
+    `raw_tags` is never modified.  Labels found here are added to a private
+    copy, so they apply to this item's nested lists without leaking into
+    sibling items that the caller processes with the same list object.
+
+    Returns the sounds found in this item and in its nested lists.
+    """
+    sounds = []
+    # Rebound, never mutated, so the caller's list stays untouched.
+    item_tags = raw_tags
+    for index, node in enumerate(list_item.children):
+        if isinstance(node, str) and ":" in node:
+            raw_tag = clean_node(wxr, None, list_item.children[:index])
+            # Nested lists are parsed separately below, keep them out of
+            # the value text.
+            value_nodes = [node[node.index(":") + 1 :]]
+            value_nodes.extend(
+                n
+                for n in list_item.children[index + 1 :]
+                if not (isinstance(n, WikiNode) and n.kind == NodeKind.LIST)
+            )
+            sound_value = clean_node(wxr, None, value_nodes)
+            if sound_value != "":
+                sound = Sound(ipa=sound_value, raw_tags=list(item_tags))
+                if raw_tag == "X-SAMPA":
+                    sound.tags.append("X-SAMPA")
+                sounds.append(sound)
+            elif raw_tag != "":
+                item_tags = [*item_tags, raw_tag]
+        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
+            for child_list_item in node.find_child(NodeKind.LIST_ITEM):
+                sounds.extend(
+                    extract_pronunciation_list_item(
+                        wxr, child_list_item, item_tags
+                    )
+                )
+
+    return sounds
diff --git a/src/wiktextract/extractor/pt/section_titles.py b/src/wiktextract/extractor/pt/section_titles.py
index 56bc41eb..f65b817b 100644
--- a/src/wiktextract/extractor/pt/section_titles.py
+++ b/src/wiktextract/extractor/pt/section_titles.py
@@ -30,5 +30,7 @@
+# Linkage section headings mapped to a linkage kind. The values match list
+# fields on WordEntry such as "synonyms" and "derived" (presumably used as
+# the field name - confirm in the caller). Spelling variants of one heading
+# share a value.
LINKAGE_SECTIONS = {
"Antônimos": "antonyms",
"Sinônimos": "synonyms",
+ "Sinónimos/Sinônimos": "synonyms",
+ "Sinónimos": "synonyms",
"Verbetes derivados": "derived",
}
diff --git a/src/wiktextract/extractor/pt/translation.py b/src/wiktextract/extractor/pt/translation.py
index 1cc7189f..c2251c92 100644
--- a/src/wiktextract/extractor/pt/translation.py
+++ b/src/wiktextract/extractor/pt/translation.py
@@ -87,7 +87,7 @@ def extract_translation_list_item(
)
)
elif isinstance(node, str) and re.search(r"\(.+\)", node) is not None:
- roman = node.strip("() ")
+ roman = node.strip("() \n")
for tr_data in translations:
tr_data.roman = roman
elif (
diff --git a/tests/test_pt_example.py b/tests/test_pt_example.py
new file mode 100644
index 00000000..1def5fae
--- /dev/null
+++ b/tests/test_pt_example.py
@@ -0,0 +1,129 @@
+from unittest import TestCase
+
+from wikitextprocessor import Wtp
+
+from wiktextract.config import WiktionaryConfig
+from wiktextract.extractor.pt.page import parse_page
+from wiktextract.wxr_context import WiktextractContext
+
+
+class TestPtExample(TestCase):
+    """Example extraction tests for the Portuguese Wiktionary extractor."""
+
+    maxDiff = None
+
+    def setUp(self) -> None:
+        conf = WiktionaryConfig(
+            dump_file_lang_code="pt",
+            capture_language_codes=None,
+        )
+        self.wxr = WiktextractContext(
+            Wtp(
+                lang_code="pt",
+                parser_function_aliases=conf.parser_function_aliases,
+            ),
+            conf,
+        )
+
+    def test_tradex_template(self):
+        """`tradex` yields the example text, translation and category."""
+        self.wxr.wtp.add_page("Predefinição:-ryu-", 10, "Okinawano")
+        self.wxr.wtp.add_page("Predefinição:Substantivo", 10, "Substantivo")
+        self.wxr.wtp.add_page(
+            "Predefinição:tradex",
+            10,
+            """[[Categoria:Entrada com exemplo traduzido (Okinawano)|a]]''沖縄ぬ'''政治''' (うちなーぬしーじ)'' ('''governo''' de Okinawa)""",
+        )
+        data = parse_page(
+            self.wxr,
+            "政治",
+            """={{-ryu-}}=
+=={{Substantivo|ryu}}==
+# [[governo]]
+#*{{tradex|ryu|沖縄ぬ'''政治''' (うちなーぬしーじ)|'''governo''' de Okinawa}}""",
+        )
+        self.assertEqual(
+            data[0]["senses"][0],
+            {
+                "categories": ["Entrada com exemplo traduzido (Okinawano)"],
+                "glosses": ["governo"],
+                "examples": [
+                    {
+                        "text": "沖縄ぬ政治 (うちなーぬしーじ)",
+                        "translation": "governo de Okinawa",
+                    }
+                ],
+            },
+        )
+
+    def test_small_tag_in_example(self):
+        """Text after the italic example is stored as its translation."""
+        # NOTE(review): the test name refers to a "small" tag, but none is
+        # visible in the wikitext below - confirm the markup is intact.
+        self.wxr.wtp.add_page("Predefinição:-en-", 10, "Inglês")
+        data = parse_page(
+            self.wxr,
+            "book",
+            """={{-en-}}=
+==Substantivo==
+'''book'''
+# [[livro]]
+#* ''My life is an open '''book'''. (I have no secrets.)'': Minha vida é um livro aberto. (Não tenho segredos.)""",
+        )
+        self.assertEqual(
+            data[0]["senses"][0],
+            {
+                "glosses": ["livro"],
+                "examples": [
+                    {
+                        "text": "My life is an open book. (I have no secrets.)",
+                        "translation": "Minha vida é um livro aberto. (Não tenho segredos.)",
+                    }
+                ],
+            },
+        )
+
+    def test_OESP_template(self):
+        """A source template after the example becomes its `ref`."""
+        self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
+        self.wxr.wtp.add_page(
+            "Predefinição:OESP",
+            10,
+            "(notícia do jornal ''O Estado de S. Paulo'' de 08 de abril de 2008)",
+        )
+        data = parse_page(
+            self.wxr,
+            "livro",
+            """={{-pt-}}=
+==Substantivo==
+# objeto
+#* ''Com verba pública, '''livro''' técnico ainda é restrito.'' {{OESP|2008|abril|08}}""",
+        )
+        self.assertEqual(
+            data[0]["senses"][0],
+            {
+                "glosses": ["objeto"],
+                "examples": [
+                    {
+                        "text": "Com verba pública, livro técnico ainda é restrito.",
+                        "ref": "notícia do jornal O Estado de S. Paulo de 08 de abril de 2008",
+                    }
+                ],
+            },
+        )
+
+    def test_double_italic_nodes(self):
+        """Only the first italic node is the text; the rest is the `ref`."""
+        self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
+        # The heading ends in a superscript digit: section titles only have
+        # superscript digits stripped, so an ASCII "1" would keep the title
+        # from matching the part-of-speech table.
+        data = parse_page(
+            self.wxr,
+            "diabo",
+            """={{-pt-}}=
+==Substantivo¹==
+# espírito
+#* ''“O '''diabo''' é o pai do rock!”.'' (passagem da composição ''“Rock do Diabo”'' de Raul Seixas/Paulo Coelho, 1975)""",
+        )
+        self.assertEqual(
+            data[0]["senses"][0],
+            {
+                "glosses": ["espírito"],
+                "examples": [
+                    {
+                        "text": "“O diabo é o pai do rock!”.",
+                        "ref": "passagem da composição “Rock do Diabo” de Raul Seixas/Paulo Coelho, 1975",
+                    }
+                ],
+            },
+        )
diff --git a/tests/test_pt_gloss.py b/tests/test_pt_gloss.py
index 10a12671..8e5d0e4a 100644
--- a/tests/test_pt_gloss.py
+++ b/tests/test_pt_gloss.py
@@ -74,85 +74,3 @@ def test_escopo(self):
}
],
)
-
- def test_tradex_template(self):
- self.wxr.wtp.add_page("Predefinição:-ryu-", 10, "Okinawano")
- self.wxr.wtp.add_page("Predefinição:Substantivo", 10, "Substantivo")
- self.wxr.wtp.add_page(
- "Predefinição:tradex",
- 10,
- """[[Categoria:Entrada com exemplo traduzido (Okinawano)|a]]''沖縄ぬ'''政治''' (うちなーぬしーじ)'' ('''governo''' de Okinawa)""",
- )
- data = parse_page(
- self.wxr,
- "政治",
- """={{-ryu-}}=
-=={{Substantivo|ryu}}==
-# [[governo]]
-#*{{tradex|ryu|沖縄ぬ'''政治''' (うちなーぬしーじ)|'''governo''' de Okinawa}}""",
- )
- self.assertEqual(
- data[0]["senses"][0],
- {
- "categories": ["Entrada com exemplo traduzido (Okinawano)"],
- "glosses": ["governo"],
- "examples": [
- {
- "text": "沖縄ぬ政治 (うちなーぬしーじ)",
- "translation": "governo de Okinawa",
- }
- ],
- },
- )
-
- def test_small_tag_in_example(self):
- self.wxr.wtp.add_page("Predefinição:-en-", 10, "Inglês")
- data = parse_page(
- self.wxr,
- "book",
- """={{-en-}}=
-==Substantivo==
-'''book'''
-# [[livro]]
-#* ''My life is an open '''book'''. (I have no secrets.)'': Minha vida é um livro aberto. (Não tenho segredos.)""",
- )
- self.assertEqual(
- data[0]["senses"][0],
- {
- "glosses": ["livro"],
- "examples": [
- {
- "text": "My life is an open book. (I have no secrets.)",
- "translation": "Minha vida é um livro aberto. (Não tenho segredos.)",
- }
- ],
- },
- )
-
- def test_OESP_template(self):
- self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
- self.wxr.wtp.add_page(
- "Predefinição:OESP",
- 10,
- "(notícia do jornal ''O Estado de S. Paulo'' de 08 de abril de 2008)",
- )
- data = parse_page(
- self.wxr,
- "livro",
- """={{-pt-}}=
-==Substantivo==
-# objeto
-#* ''Com verba pública, '''livro''' técnico ainda é restrito.'' {{OESP|2008|abril|08}}""",
- )
- self.assertEqual(
- data[0]["senses"][0],
- {
- "glosses": ["objeto"],
- "examples": [
- {
- "text": "Com verba pública, livro técnico ainda é restrito.",
- "ref": "notícia do jornal O Estado de S. Paulo de 08 de abril de 2008",
- }
- ],
- },
- )
diff --git a/tests/test_pt_linkage.py b/tests/test_pt_linkage.py
index be96be59..a4cc1a7b 100644
--- a/tests/test_pt_linkage.py
+++ b/tests/test_pt_linkage.py
@@ -113,3 +113,35 @@ def test_link_preto(self):
}
],
)
+
+    def test_nested_list(self):
+        """Nested synonym lists inherit the sense of their parent item."""
+        self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
+        wikitext = """={{-pt-}}=
+==Substantivo==
+# animal
+===Sinônimos===
+* De '''1''' (animal mamífero, carnívoro e quadrúpede):
+** [[cachorro]]
+** {{escopo2|Brasil|RS}} [[cusco]]
+*De '''3''' (gênio do mal):
+** vide [[Wikisaurus:diabo]]"""
+        page_data = parse_page(self.wxr, "cão", wikitext)
+
+        # The Wikisaurus link under sense 3 must not produce a synonym.
+        parent_sense = "animal mamífero, carnívoro e quadrúpede"
+        expected_synonyms = [
+            {"word": "cachorro", "sense": parent_sense, "sense_index": 1},
+            {
+                "word": "cusco",
+                "sense": parent_sense,
+                "sense_index": 1,
+                "raw_tags": ["Brasil", "RS"],
+            },
+        ]
+        self.assertEqual(page_data[0]["synonyms"], expected_synonyms)
diff --git a/tests/test_pt_sound.py b/tests/test_pt_sound.py
new file mode 100644
index 00000000..a384690f
--- /dev/null
+++ b/tests/test_pt_sound.py
@@ -0,0 +1,62 @@
+from unittest import TestCase
+
+from wikitextprocessor import Wtp
+
+from wiktextract.config import WiktionaryConfig
+from wiktextract.extractor.pt.page import parse_page
+from wiktextract.wxr_context import WiktextractContext
+
+
+class TestPtSound(TestCase):
+    """Pronunciation extraction tests for the Portuguese extractor."""
+
+    maxDiff = None
+
+    def setUp(self) -> None:
+        config = WiktionaryConfig(
+            dump_file_lang_code="pt", capture_language_codes=None
+        )
+        wtp = Wtp(
+            lang_code="pt",
+            parser_function_aliases=config.parser_function_aliases,
+        )
+        self.wxr = WiktextractContext(wtp, config)
+
+    def test_subsection(self):
+        """Subsection title and list label both end up in `raw_tags`."""
+        template_pages = {
+            "Predefinição:-pt-": "Português",
+            "Predefinição:pronúncia": """Pronúncia[[Categoria:Entrada com pronúncia (Português)|olho]]""",
+            "Predefinição:AFI": "{{{1}}}",
+        }
+        for page_title, page_body in template_pages.items():
+            self.wxr.wtp.add_page(page_title, 10, page_body)
+
+        wikitext = """={{-pt-}}=
+==Substantivo==
+# órgão
+=={{pronúncia|pt}}==
+===Brasil===
+* '''Forma verbal''':
+** [[AFI]]: {{AFI|/ˈɔ.ʎʊ/}}
+** [[X-SAMPA]]: /"O.LU/"""
+        page_data = parse_page(self.wxr, "olho", wikitext)
+
+        shared_raw_tags = ["Brasil", "Forma verbal"]
+        expected_sounds = [
+            {"ipa": "/ˈɔ.ʎʊ/", "raw_tags": shared_raw_tags},
+            {
+                "ipa": '/"O.LU/',
+                "raw_tags": shared_raw_tags,
+                "tags": ["X-SAMPA"],
+            },
+        ]
+        self.assertEqual(page_data[0]["sounds"], expected_sounds)
+        # The category emitted by the section-title template is kept.
+        self.assertEqual(
+            page_data[0]["categories"], ["Entrada com pronúncia (Português)"]
+        )