diff --git a/src/wiktextract/extractor/pt/linkage.py b/src/wiktextract/extractor/pt/linkage.py index 492a76d1..95afe165 100644 --- a/src/wiktextract/extractor/pt/linkage.py +++ b/src/wiktextract/extractor/pt/linkage.py @@ -107,29 +107,46 @@ def extract_linkage_list_item( linkage_words = [] raw_tags = [] for node in list_item.children: - if isinstance(node, WikiNode) and node.kind == NodeKind.LINK: - word = clean_node(wxr, None, node) - if word != "": - linkage_words.append(word) - elif isinstance(node, WikiNode) and node.kind == NodeKind.BOLD: - bold_str = clean_node(wxr, None, node) - if re.fullmatch(r"\d+", bold_str): - sense_index = int(bold_str) + if isinstance(node, TemplateNode): + match node.template_name: + case "link preto": + word = clean_node( + wxr, None, node.template_parameters.get(1, "") + ) + if word != "": + linkage_words.append(word) + case "escopo2": + from .pos import extract_escopo2_template + + raw_tags.extend(extract_escopo2_template(wxr, node)) + elif isinstance(node, WikiNode): + match node.kind: + case NodeKind.LINK: + word = clean_node(wxr, None, node) + if word != "" and not word.startswith("Wikisaurus:"): + linkage_words.append(word) + case NodeKind.BOLD: + bold_str = clean_node(wxr, None, node) + if re.fullmatch(r"\d+", bold_str): + sense_index = int(bold_str) + case NodeKind.ITALIC: + raw_tag = clean_node(wxr, None, node) + if raw_tag != "": + raw_tags.append(raw_tag) + case NodeKind.LIST: + for child_list_item in node.find_child(NodeKind.LIST_ITEM): + extract_linkage_list_item( + wxr, + word_entry, + child_list_item, + linkage_type, + sense, + sense_index, + ) elif isinstance(node, str): m = re.search(r"\((.+)\)", node) if m is not None: sense = m.group(1) - elif ( - isinstance(node, TemplateNode) - and node.template_name == "link preto" - ): - word = clean_node(wxr, None, node.template_parameters.get(1, "")) - if word != "": - linkage_words.append(word) - elif isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC: - raw_tag = clean_node(wxr, None, node) - if raw_tag != "": - raw_tags.append(raw_tag) for word in linkage_words: linkage = Linkage( diff --git a/src/wiktextract/extractor/pt/models.py b/src/wiktextract/extractor/pt/models.py index 98435499..e51e3755 100644 --- a/src/wiktextract/extractor/pt/models.py +++ b/src/wiktextract/extractor/pt/models.py @@ -52,6 +52,19 @@ class Linkage(PortugueseBaseModel): ) +class Sound(PortugueseBaseModel): + ipa: str = Field(default="", description="International Phonetic Alphabet") + audio: str = Field(default="", description="Audio file name") + wav_url: str = "" + oga_url: str = "" + ogg_url: str = "" + mp3_url: str = "" + opus_url: str = "" + flac_url: str = "" + tags: list[str] = [] + raw_tags: list[str] = [] + + class WordEntry(PortugueseBaseModel): model_config = ConfigDict(title="Portuguese Wiktionary") word: str = Field(description="Word string", min_length=1) @@ -69,3 +82,4 @@ class WordEntry(PortugueseBaseModel): synonyms: list[Linkage] = [] derived: list[Linkage] = [] etymology_texts: list[str] = [] + sounds: list[Sound] = [] diff --git a/src/wiktextract/extractor/pt/page.py b/src/wiktextract/extractor/pt/page.py index 0e1fc3af..73c95fc4 100644 --- a/src/wiktextract/extractor/pt/page.py +++ b/src/wiktextract/extractor/pt/page.py @@ -12,6 +12,7 @@ from .linkage import extract_expression_section, extract_linkage_section from .models import Sense, WordEntry from .pos import extract_pos_section +from .pronunciation import extract_pronunciation_section from .section_titles import LINKAGE_SECTIONS, POS_DATA from .translation import extract_translation_section @@ -23,7 +24,7 @@ def parse_section( level_node: LevelNode, ) -> None: cats = {} - title_text = clean_node(wxr, cats, level_node.largs) + title_text = clean_node(wxr, cats, level_node.largs).strip("⁰¹²³⁴⁵⁶⁷⁸⁹") if title_text in POS_DATA: extract_pos_section( wxr, @@ -50,16 +51,35 @@ def parse_section( ) elif title_text == "Etimologia": extract_etymology_section(wxr, page_data, level_node) + elif title_text == "Pronúncia": + extract_pronunciation_section(wxr, page_data, level_node) + if title_text not in POS_DATA: + save_section_cats( + cats.get("categories", []), page_data, level_node, True + ) cats = {} for link_node in level_node.find_child(NodeKind.LINK): clean_node(wxr, cats, link_node) - for data in page_data: - if data.lang_code == page_data[-1].lang_code: - data.categories.extend(cats.get("categories", [])) + save_section_cats(cats.get("categories", []), page_data, level_node, False) + + if title_text != "Pronúncia": + for next_level in level_node.find_child(LEVEL_KIND_FLAGS): + parse_section(wxr, page_data, base_data, next_level) + - for next_level in level_node.find_child(LEVEL_KIND_FLAGS): - parse_section(wxr, page_data, base_data, next_level) +def save_section_cats( + cats: list[str], + page_data: list[WordEntry], + level_node: LevelNode, + from_title: bool, +) -> None: + if not from_title or (from_title and level_node.kind == NodeKind.LEVEL2): + for data in page_data: + if data.lang_code == page_data[-1].lang_code: + data.categories.extend(cats) + elif len(page_data) > 0: + page_data[-1].categories.extend(cats) def parse_page( diff --git a/src/wiktextract/extractor/pt/pos.py b/src/wiktextract/extractor/pt/pos.py index cbb5e632..de75f36e 100644 --- a/src/wiktextract/extractor/pt/pos.py +++ b/src/wiktextract/extractor/pt/pos.py @@ -53,7 +53,7 @@ def extract_gloss_list_item( if node.template_name == "escopo": extract_escopo_template(wxr, sense, node) elif node.template_name == "escopo2": - extract_escopo2_template(wxr, sense, node) + sense.raw_tags.extend(extract_escopo2_template(wxr, node)) else: gloss_nodes.append(node) elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST: @@ -80,24 +80,25 @@ def extract_escopo_template( for arg in range(2, 9): if arg not in t_node.template_parameters: break - sense.raw_tags.append( - clean_node(wxr, None, t_node.template_parameters[arg]) - ) + raw_tag = clean_node(wxr, None, t_node.template_parameters[arg]) + if raw_tag != "": + sense.raw_tags.append(raw_tag) clean_node(wxr, sense, t_node) def extract_escopo2_template( wxr: WiktextractContext, - sense: Sense, t_node: TemplateNode, -) -> None: +) -> list[str]: # https://pt.wiktionary.org/wiki/Predefinição:escopo2 + raw_tags = [] for arg in range(1, 4): if arg not in t_node.template_parameters: break - sense.raw_tags.append( - clean_node(wxr, None, t_node.template_parameters[arg]) - ) + raw_tag = clean_node(wxr, None, t_node.template_parameters[arg]) + if raw_tag != "": + raw_tags.append(raw_tag) + return raw_tags def extract_example_list_item( @@ -106,8 +107,13 @@ def extract_example_list_item( list_item: WikiNode, ) -> None: example = Example() + ref_nodes = [] for node in list_item.children: - if isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC: + if ( + isinstance(node, WikiNode) + and node.kind == NodeKind.ITALIC + and example.text == "" + ): example.text = clean_node(wxr, None, node) elif isinstance(node, HTMLNode) and node.tag == "small": example.translation = clean_node(wxr, None, node) @@ -131,5 +137,10 @@ def extract_example_list_item( example.text = clean_node( wxr, sense, node.template_parameters.get(1, "") ) + else: + ref_nodes.append(node) + if example.text != "": + if example.ref == "": + example.ref = clean_node(wxr, sense, ref_nodes).strip(":() \n") sense.examples.append(example) diff --git a/src/wiktextract/extractor/pt/pronunciation.py b/src/wiktextract/extractor/pt/pronunciation.py new file mode 100644 index 00000000..b15a8217 --- /dev/null +++ b/src/wiktextract/extractor/pt/pronunciation.py @@ -0,0 +1,73 @@ +from wikitextprocessor.parser import ( + LEVEL_KIND_FLAGS, + LevelNode, + NodeKind, + WikiNode, +) + +from ...page import clean_node +from ...wxr_context import WiktextractContext +from .models import Sound, WordEntry +from .tags import translate_raw_tags + + +def extract_pronunciation_section( + wxr: WiktextractContext, + page_data: list[WordEntry], + level_node: LevelNode, +) -> None: + raw_tags = [] + sounds = [] + title_text = clean_node(wxr, None, level_node.largs) + if title_text not in ["", "Pronúncia"]: + raw_tags.append(title_text) + + for list_node in level_node.find_child(NodeKind.LIST): + for list_item in list_node.find_child(NodeKind.LIST_ITEM): + sounds.extend( + extract_pronunciation_list_item(wxr, list_item, raw_tags) + ) + + for child_level_node in level_node.find_child(LEVEL_KIND_FLAGS): + extract_pronunciation_section(wxr, page_data, child_level_node) + + for data in page_data: + if data.lang_code == page_data[-1].lang_code: + for sound in sounds: + translate_raw_tags(sound) + data.sounds.append(sound) + + +def extract_pronunciation_list_item( + wxr: WiktextractContext, list_item: WikiNode, raw_tags: list[str] +) -> list[Sound]: + sounds = [] + for index, node in enumerate(list_item.children): + if isinstance(node, str) and ":" in node: + raw_tag = clean_node(wxr, None, list_item.children[:index]) + sound_value = clean_node( + wxr, + None, + [node[node.index(":") + 1 :]] + + [ + n + for n in list_item.children[index + 1 :] + if not (isinstance(n, WikiNode) and n.kind == NodeKind.LIST) + ], + ) + if sound_value != "": + sound = Sound(ipa=sound_value, raw_tags=raw_tags) + if raw_tag == "X-SAMPA": + sound.tags.append("X-SAMPA") + sounds.append(sound) + elif raw_tag != "": + raw_tags.append(raw_tag) + elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST: + for child_list_item in node.find_child(NodeKind.LIST_ITEM): + sounds.extend( + extract_pronunciation_list_item( + wxr, child_list_item, raw_tags + ) + ) + + return sounds diff --git a/src/wiktextract/extractor/pt/section_titles.py b/src/wiktextract/extractor/pt/section_titles.py index 56bc41eb..f65b817b 100644 --- a/src/wiktextract/extractor/pt/section_titles.py +++ b/src/wiktextract/extractor/pt/section_titles.py @@ -30,5 +30,7 @@ LINKAGE_SECTIONS = { "Antônimos": "antonyms", "Sinônimos": "synonyms", + "Sinónimos/Sinônimos": "synonyms", + "Sinónimos": "synonyms", "Verbetes derivados": "derived", } diff --git a/src/wiktextract/extractor/pt/translation.py b/src/wiktextract/extractor/pt/translation.py index 1cc7189f..c2251c92 100644 --- a/src/wiktextract/extractor/pt/translation.py +++ b/src/wiktextract/extractor/pt/translation.py @@ -87,7 +87,7 @@ def extract_translation_list_item( ) ) elif isinstance(node, str) and re.search(r"\(.+\)", node) is not None: - roman = node.strip("() ") + roman = node.strip("() \n") for tr_data in translations: tr_data.roman = roman elif ( diff --git a/tests/test_pt_example.py b/tests/test_pt_example.py new file mode 100644 index 00000000..1def5fae --- /dev/null +++ b/tests/test_pt_example.py @@ -0,0 +1,129 @@ +from unittest import TestCase + +from wikitextprocessor import Wtp + +from wiktextract.config import WiktionaryConfig +from wiktextract.extractor.pt.page import parse_page +from wiktextract.wxr_context import WiktextractContext + + +class TestPtExample(TestCase): + maxDiff = None + + def setUp(self) -> None: + conf = WiktionaryConfig( + dump_file_lang_code="pt", + capture_language_codes=None, + ) + self.wxr = WiktextractContext( + Wtp( + lang_code="pt", + parser_function_aliases=conf.parser_function_aliases, + ), + conf, + ) + + def test_tradex_template(self): + self.wxr.wtp.add_page("Predefinição:-ryu-", 10, "Okinawano") + self.wxr.wtp.add_page("Predefinição:Substantivo", 10, "Substantivo") + self.wxr.wtp.add_page( + "Predefinição:tradex", + 10, + """[[Categoria:Entrada com exemplo traduzido (Okinawano)|a]]''沖縄ぬ'''政治''' (うちなーぬしーじ)'' ('''governo''' de Okinawa)""", + ) + data = parse_page( + self.wxr, + "政治", + """={{-ryu-}}= +=={{Substantivo|ryu}}== +# [[governo]] +#*{{tradex|ryu|沖縄ぬ'''政治''' (うちなーぬしーじ)|'''governo''' de Okinawa}}""", + ) + self.assertEqual( + data[0]["senses"][0], + { + "categories": ["Entrada com exemplo traduzido (Okinawano)"], + "glosses": ["governo"], + "examples": [ + { + "text": "沖縄ぬ政治 (うちなーぬしーじ)", + "translation": "governo de Okinawa", + } + ], + }, + ) + + def test_small_tag_in_example(self): + self.wxr.wtp.add_page("Predefinição:-en-", 10, "Inglês") + data = parse_page( + self.wxr, + "book", + """={{-en-}}= +==Substantivo== +'''book''' +# [[livro]] +#* ''My life is an open '''book'''. (I have no secrets.)'': Minha vida é um livro aberto. (Não tenho segredos.)""", + ) + self.assertEqual( + data[0]["senses"][0], + { + "glosses": ["livro"], + "examples": [ + { + "text": "My life is an open book. (I have no secrets.)", + "translation": "Minha vida é um livro aberto. (Não tenho segredos.)", + } + ], + }, + ) + + def test_OESP_template(self): + self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português") + self.wxr.wtp.add_page( + "Predefinição:OESP", + 10, + "(notícia do jornal ''O Estado de S. Paulo'' de 08 de abril de 2008)", + ) + data = parse_page( + self.wxr, + "livro", + """={{-pt-}}= +==Substantivo== +# objeto +#* ''Com verba pública, '''livro''' técnico ainda é restrito.'' {{OESP|2008|abril|08}}""", + ) + self.assertEqual( + data[0]["senses"][0], + { + "glosses": ["objeto"], + "examples": [ + { + "text": "Com verba pública, livro técnico ainda é restrito.", + "ref": "notícia do jornal O Estado de S. Paulo de 08 de abril de 2008", + } + ], + }, + ) + + def test_double_italic_nodes(self): + self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português") + data = parse_page( + self.wxr, + "diabo", + """={{-pt-}}= +==Substantivo1== +# espírito +#* ''“O '''diabo''' é o pai do rock!”.'' (passagem da composição ''“Rock do Diabo”'' de Raul Seixas/Paulo Coelho, 1975)""", + ) + self.assertEqual( + data[0]["senses"][0], + { + "glosses": ["espírito"], + "examples": [ + { + "text": "“O diabo é o pai do rock!”.", + "ref": "passagem da composição “Rock do Diabo” de Raul Seixas/Paulo Coelho, 1975", + } + ], + }, + ) diff --git a/tests/test_pt_gloss.py b/tests/test_pt_gloss.py index 10a12671..8e5d0e4a 100644 --- a/tests/test_pt_gloss.py +++ b/tests/test_pt_gloss.py @@ -74,85 +74,3 @@ def test_escopo(self): } ], ) - - def test_tradex_template(self): - self.wxr.wtp.add_page("Predefinição:-ryu-", 10, "Okinawano") - self.wxr.wtp.add_page("Predefinição:Substantivo", 10, "Substantivo") - self.wxr.wtp.add_page( - "Predefinição:tradex", - 10, - """[[Categoria:Entrada com exemplo traduzido (Okinawano)|a]]''沖縄ぬ'''政治''' (うちなーぬしーじ)'' ('''governo''' de Okinawa)""", - ) - data = parse_page( - self.wxr, - "政治", - """={{-ryu-}}= -=={{Substantivo|ryu}}== -# [[governo]] -#*{{tradex|ryu|沖縄ぬ'''政治''' (うちなーぬしーじ)|'''governo''' de Okinawa}}""", - ) - self.assertEqual( - data[0]["senses"][0], - { - "categories": ["Entrada com exemplo traduzido (Okinawano)"], - "glosses": ["governo"], - "examples": [ - { - "text": "沖縄ぬ政治 (うちなーぬしーじ)", - "translation": "governo de Okinawa", - } - ], - }, - ) - - def test_small_tag_in_example(self): - self.wxr.wtp.add_page("Predefinição:-en-", 10, "Inglês") - data = parse_page( - self.wxr, - "book", - """={{-en-}}= -==Substantivo== -'''book''' -# [[livro]] -#* ''My life is an open '''book'''. (I have no secrets.)'': Minha vida é um livro aberto. (Não tenho segredos.)""", - ) - self.assertEqual( - data[0]["senses"][0], - { - "glosses": ["livro"], - "examples": [ - { - "text": "My life is an open book. (I have no secrets.)", - "translation": "Minha vida é um livro aberto. (Não tenho segredos.)", - } - ], - }, - ) - - def test_OESP_template(self): - self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português") - self.wxr.wtp.add_page( - "Predefinição:OESP", - 10, - "(notícia do jornal ''O Estado de S. Paulo'' de 08 de abril de 2008)", - ) - data = parse_page( - self.wxr, - "livro", - """={{-pt-}}= -==Substantivo== -# objeto -#* ''Com verba pública, '''livro''' técnico ainda é restrito.'' {{OESP|2008|abril|08}}""", - ) - self.assertEqual( - data[0]["senses"][0], - { - "glosses": ["objeto"], - "examples": [ - { - "text": "Com verba pública, livro técnico ainda é restrito.", - "ref": "notícia do jornal O Estado de S. Paulo de 08 de abril de 2008", - } - ], - }, - ) diff --git a/tests/test_pt_linkage.py b/tests/test_pt_linkage.py index be96be59..a4cc1a7b 100644 --- a/tests/test_pt_linkage.py +++ b/tests/test_pt_linkage.py @@ -113,3 +113,35 @@ def test_link_preto(self): } ], ) + + def test_nested_list(self): + self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português") + data = parse_page( + self.wxr, + "cão", + """={{-pt-}}= +==Substantivo== +# animal +===Sinônimos=== +* De '''1''' (animal mamífero, carnívoro e quadrúpede): +** [[cachorro]] +** {{escopo2|Brasil|RS}} [[cusco]] +*De '''3''' (gênio do mal): +** vide [[Wikisaurus:diabo]]""", + ) + self.assertEqual( + data[0]["synonyms"], + [ + { + "word": "cachorro", + "sense": "animal mamífero, carnívoro e quadrúpede", + "sense_index": 1, + }, + { + "word": "cusco", + "sense": "animal mamífero, carnívoro e quadrúpede", + "sense_index": 1, + "raw_tags": ["Brasil", "RS"], + }, + ], + ) diff --git a/tests/test_pt_sound.py b/tests/test_pt_sound.py new file mode 100644 index 00000000..a384690f --- /dev/null +++ b/tests/test_pt_sound.py @@ -0,0 +1,62 @@ +from unittest import TestCase + +from wikitextprocessor import Wtp + +from wiktextract.config import WiktionaryConfig +from wiktextract.extractor.pt.page import parse_page +from wiktextract.wxr_context import WiktextractContext + + +class TestPtSound(TestCase): + maxDiff = None + + def setUp(self) -> None: + conf = WiktionaryConfig( + dump_file_lang_code="pt", + capture_language_codes=None, + ) + self.wxr = WiktextractContext( + Wtp( + lang_code="pt", + parser_function_aliases=conf.parser_function_aliases, + ), + conf, + ) + + def test_subsection(self): + self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português") + self.wxr.wtp.add_page( + "Predefinição:pronúncia", + 10, + """Pronúncia[[Categoria:Entrada com pronúncia (Português)|olho]]""", + ) + self.wxr.wtp.add_page("Predefinição:AFI", 10, "{{{1}}}") + data = parse_page( + self.wxr, + "olho", + """={{-pt-}}= +==Substantivo== +# órgão +=={{pronúncia|pt}}== +===Brasil=== +* '''Forma verbal''': +** [[AFI]]: {{AFI|/ˈɔ.ʎʊ/}} +** [[X-SAMPA]]: /"O.LU/""", + ) + self.assertEqual( + data[0]["sounds"], + [ + { + "ipa": "/ˈɔ.ʎʊ/", + "raw_tags": ["Brasil", "Forma verbal"], + }, + { + "ipa": '/"O.LU/', + "raw_tags": ["Brasil", "Forma verbal"], + "tags": ["X-SAMPA"], + }, + ], + ) + self.assertEqual( + data[0]["categories"], ["Entrada com pronúncia (Português)"] + )