From dea2b26b7a780f56a3bdcd6356012a81be0eb559 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Mon, 2 Dec 2024 14:32:26 +0800 Subject: [PATCH 1/3] [pt] add parser function aliases configurations --- src/wiktextract/config.py | 2 ++ src/wiktextract/data/pt/config.json | 9 +++++++++ src/wiktextract/extractor/de/pronunciation.py | 4 +++- src/wiktextract/wiktwords.py | 5 ++++- 4 files changed, 18 insertions(+), 2 deletions(-) create mode 100644 src/wiktextract/data/pt/config.json diff --git a/src/wiktextract/config.py b/src/wiktextract/config.py index e2a34de8..c21686b1 100644 --- a/src/wiktextract/config.py +++ b/src/wiktextract/config.py @@ -64,6 +64,7 @@ class WiktionaryConfig: "save_ns_names", "extract_ns_names", "allowed_html_tags", + "parser_function_aliases", ) def __init__( @@ -128,6 +129,7 @@ def __init__( # these are extracted namespaces self.extract_ns_names = ["Main"] self.allowed_html_tags: dict[str, HTMLTagData] = {} + self.parser_function_aliases: dict[str, str] = {} self.load_edition_settings() def merge_return(self, ret: CollatedErrorReturnData): diff --git a/src/wiktextract/data/pt/config.json b/src/wiktextract/data/pt/config.json new file mode 100644 index 00000000..db73a37b --- /dev/null +++ b/src/wiktextract/data/pt/config.json @@ -0,0 +1,9 @@ +{ + "parser_function_aliases": { + "#se": "#if", + "#seigual": "#ifeq", + "#seerro": "#iferror", + "#seexiste": "#ifexist", + "#seexpr": "#ifexpr" + } +} diff --git a/src/wiktextract/extractor/de/pronunciation.py b/src/wiktextract/extractor/de/pronunciation.py index db461eae..d55d5323 100644 --- a/src/wiktextract/extractor/de/pronunciation.py +++ b/src/wiktextract/extractor/de/pronunciation.py @@ -81,6 +81,8 @@ def extract_audio_template( for link_node in expanded_node.find_child(NodeKind.LINK): link_str = clean_node(wxr, None, link_node) if "(" in link_str: - sound.raw_tags.append(link_str[link_str.index("(") + 1:].strip(")")) + sound.raw_tags.append( + link_str[link_str.index("(") + 1 :].strip(")") + ) 
clean_node(wxr, sound, expanded_node) return sound diff --git a/src/wiktextract/wiktwords.py b/src/wiktextract/wiktwords.py index 408b577d..6fe467eb 100755 --- a/src/wiktextract/wiktwords.py +++ b/src/wiktextract/wiktwords.py @@ -395,8 +395,11 @@ def main(): wtp = Wtp( db_path=args.db_path, lang_code=args.dump_file_language_code, - template_override_funcs=template_override_fns, + template_override_funcs=template_override_fns + if args.dump_file_language_code == "en" + else {}, extension_tags=conf.allowed_html_tags, + parser_function_aliases=conf.parser_function_aliases, quiet=args.quiet, ) wxr = WiktextractContext(wtp, conf) From c23b79c36eef2e4f0064bb1e88da1bf01be31343 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Mon, 2 Dec 2024 16:10:22 +0800 Subject: [PATCH 2/3] [pt] extract gloss tag and topic template "escopo" and "escopo2" --- src/wiktextract/extractor/pt/page.py | 14 ++++-- src/wiktextract/extractor/pt/pos.py | 48 +++++++++++++++++- tests/test_pt_gloss.py | 74 ++++++++++++++++++++++++++++ 3 files changed, 131 insertions(+), 5 deletions(-) create mode 100644 tests/test_pt_gloss.py diff --git a/src/wiktextract/extractor/pt/page.py b/src/wiktextract/extractor/pt/page.py index e953f7bb..a626b08f 100644 --- a/src/wiktextract/extractor/pt/page.py +++ b/src/wiktextract/extractor/pt/page.py @@ -4,7 +4,6 @@ LEVEL_KIND_FLAGS, LevelNode, NodeKind, - WikiNode, ) from ...page import clean_node @@ -23,7 +22,14 @@ def parse_section( cats = {} title_text = clean_node(wxr, cats, level_node.largs) if title_text in POS_DATA: - extract_pos_section(wxr, page_data, base_data, level_node, title_text) + extract_pos_section( + wxr, + page_data, + base_data, + level_node, + title_text, + cats.get("categories", []), + ) def parse_page( @@ -35,7 +41,8 @@ def parse_page( tree = wxr.wtp.parse(page_text) page_data: list[WordEntry] = [] for level1_node in tree.find_child(NodeKind.LEVEL1): - lang_name = clean_node(wxr, None, level1_node.largs) + lang_cats = {} + lang_name = clean_node(wxr, 
lang_cats, level1_node.largs) lang_code = "unknown" for lang_template in level1_node.find_content(NodeKind.TEMPLATE): lang_code = lang_template.template_name.strip("-") @@ -51,6 +58,7 @@ def parse_page( lang_code=lang_code, lang=lang_name, pos="unknown", + categories=lang_cats.get("categories", []), ) for next_level_node in level1_node.find_child(LEVEL_KIND_FLAGS): parse_section(wxr, page_data, base_data, next_level_node) diff --git a/src/wiktextract/extractor/pt/pos.py b/src/wiktextract/extractor/pt/pos.py index 83b8dc33..703c4695 100644 --- a/src/wiktextract/extractor/pt/pos.py +++ b/src/wiktextract/extractor/pt/pos.py @@ -1,4 +1,4 @@ -from wikitextprocessor import LevelNode, NodeKind, WikiNode +from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode from ...page import clean_node from ...wxr_context import WiktextractContext @@ -12,12 +12,14 @@ def extract_pos_section( base_data: WordEntry, level_node: LevelNode, pos_title: str, + categories: list[str], ) -> None: page_data.append(base_data.model_copy(deep=True)) page_data[-1].pos_title = pos_title pos_data = POS_DATA[pos_title] page_data[-1].pos = pos_data["pos"] page_data[-1].tags.extend(pos_data.get("tags", [])) + page_data[-1].categories.extend(categories) for list_index, list_node in level_node.find_child(NodeKind.LIST, True): if list_node.sarg.startswith("#") and list_node.sarg.endswith("#"): @@ -30,9 +32,51 @@ def extract_gloss_list_item( word_entry: WordEntry, list_item_node: WikiNode, ) -> None: - gloss_nodes = list(list_item_node.invert_find_child(NodeKind.LIST)) + gloss_nodes = [] sense = Sense() + for node in list_item_node.children: + if isinstance(node, TemplateNode): + if node.template_name == "escopo": + extract_escopo_template(wxr, sense, node) + elif node.template_name == "escopo2": + extract_escopo2_template(wxr, sense, node) + else: + gloss_nodes.append(node) + elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST: + pass + else: + gloss_nodes.append(node) + 
gloss_str = clean_node(wxr, sense, gloss_nodes) if len(gloss_str) > 0: sense.glosses.append(gloss_str) word_entry.senses.append(sense) + + +def extract_escopo_template( + wxr: WiktextractContext, + sense: Sense, + t_node: TemplateNode, +) -> None: + # https://pt.wiktionary.org/wiki/Predefinição:escopo + for arg in range(2, 9): + if arg not in t_node.template_parameters: + break + sense.raw_tags.append( + clean_node(wxr, None, t_node.template_parameters[arg]) + ) + clean_node(wxr, sense, t_node) + + +def extract_escopo2_template( + wxr: WiktextractContext, + sense: Sense, + t_node: TemplateNode, +) -> None: + # https://pt.wiktionary.org/wiki/Predefinição:escopo2 + for arg in range(1, 4): + if arg not in t_node.template_parameters: + break + sense.raw_tags.append( + clean_node(wxr, None, t_node.template_parameters[arg]) + ) diff --git a/tests/test_pt_gloss.py b/tests/test_pt_gloss.py new file mode 100644 index 00000000..5120bdbc --- /dev/null +++ b/tests/test_pt_gloss.py @@ -0,0 +1,74 @@ +from unittest import TestCase + +from wikitextprocessor import Wtp + +from wiktextract.config import WiktionaryConfig +from wiktextract.extractor.pt.page import parse_page +from wiktextract.wxr_context import WiktextractContext + + +class TestPtGloss(TestCase): + maxDiff = None + + def setUp(self) -> None: + conf = WiktionaryConfig( + dump_file_lang_code="pt", + capture_language_codes=None, + ) + self.wxr = WiktextractContext( + Wtp( + lang_code="pt", + parser_function_aliases=conf.parser_function_aliases, + ), + conf, + ) + + def test_escopo(self): + self.wxr.wtp.add_page( + "Predefinição:-pt-", + 10, + "Português[[Categoria:!Entrada (Português)]]", + ) + self.wxr.wtp.add_page( + "Predefinição:Substantivo", + 10, + "Substantivo[[Categoria:Substantivo (Português)]]", + ) + self.wxr.wtp.add_page( + "Predefinição:escopo", + 10, + """(''[[Categoria:Português brasileiro]]Brasil e [[Categoria:Coloquialismo (Português)]]popular'')""", + ) + data = parse_page( + self.wxr, + "cão", + """={{-pt-}}= 
+=={{Substantivo|pt}}== +# {{escopo|pt|Brasil|popular}} [[gênio]] do [[mal]] em geral ("capeta")""", + ) + self.assertEqual( + data, + [ + { + "lang": "Português", + "lang_code": "pt", + "pos": "noun", + "pos_title": "Substantivo", + "categories": [ + "!Entrada (Português)", + "Substantivo (Português)", + ], + "senses": [ + { + "categories": [ + "Português brasileiro", + "Coloquialismo (Português)", + ], + "glosses": ['gênio do mal em geral ("capeta")'], + "raw_tags": ["Brasil", "popular"], + } + ], + "word": "cão", + } + ], + ) From fe6bb0c176ba28eea4a3ce48f091b3d4850eb2eb Mon Sep 17 00:00:00 2001 From: xxyzz Date: Mon, 2 Dec 2024 17:02:11 +0800 Subject: [PATCH 3/3] [pt] extract example list --- src/wiktextract/extractor/pt/models.py | 7 +++++++ src/wiktextract/extractor/pt/pos.py | 21 +++++++++++++++++---- tests/test_pt_gloss.py | 4 +++- 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/src/wiktextract/extractor/pt/models.py b/src/wiktextract/extractor/pt/models.py index 609324d7..72251087 100644 --- a/src/wiktextract/extractor/pt/models.py +++ b/src/wiktextract/extractor/pt/models.py @@ -10,12 +10,19 @@ class PortugueseBaseModel(BaseModel): ) +class Example(PortugueseBaseModel): + text: str = "" + translation: str = "" + ref: str = "" + + class Sense(PortugueseBaseModel): glosses: list[str] = [] tags: list[str] = [] raw_tags: list[str] = [] categories: list[str] = [] topics: list[str] = [] + examples: list[Example] = [] class WordEntry(PortugueseBaseModel): diff --git a/src/wiktextract/extractor/pt/pos.py b/src/wiktextract/extractor/pt/pos.py index 703c4695..a422706e 100644 --- a/src/wiktextract/extractor/pt/pos.py +++ b/src/wiktextract/extractor/pt/pos.py @@ -2,7 +2,7 @@ from ...page import clean_node from ...wxr_context import WiktextractContext -from .models import Sense, WordEntry +from .models import Example, Sense, WordEntry from .section_titles import POS_DATA @@ -30,11 +30,11 @@ def extract_pos_section( def extract_gloss_list_item( wxr: 
WiktextractContext, word_entry: WordEntry, - list_item_node: WikiNode, + list_item: WikiNode, ) -> None: gloss_nodes = [] sense = Sense() - for node in list_item_node.children: + for node in list_item.children: if isinstance(node, TemplateNode): if node.template_name == "escopo": extract_escopo_template(wxr, sense, node) @@ -43,7 +43,9 @@ def extract_gloss_list_item( else: gloss_nodes.append(node) elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST: - pass + if node.sarg.endswith("*"): + for next_list_item in node.find_child(NodeKind.LIST_ITEM): + extract_example_list_item(wxr, sense, next_list_item) else: gloss_nodes.append(node) @@ -80,3 +82,14 @@ def extract_escopo2_template( sense.raw_tags.append( clean_node(wxr, None, t_node.template_parameters[arg]) ) + + +def extract_example_list_item( + wxr: WiktextractContext, + sense: Sense, + list_item: WikiNode, +) -> None: + example = Example() + example.text = clean_node(wxr, sense, list_item.children) + if example.text != "": + sense.examples.append(example) diff --git a/tests/test_pt_gloss.py b/tests/test_pt_gloss.py index 5120bdbc..8e5d0e4a 100644 --- a/tests/test_pt_gloss.py +++ b/tests/test_pt_gloss.py @@ -44,7 +44,8 @@ def test_escopo(self): "cão", """={{-pt-}}= =={{Substantivo|pt}}== -# {{escopo|pt|Brasil|popular}} [[gênio]] do [[mal]] em geral ("capeta")""", +# {{escopo|pt|Brasil|popular}} [[gênio]] do [[mal]] em geral ("capeta") +#* ''O '''cão''' em forma de gente.''""", ) self.assertEqual( data, @@ -66,6 +67,7 @@ def test_escopo(self): ], "glosses": ['gênio do mal em geral ("capeta")'], "raw_tags": ["Brasil", "popular"], + "examples": [{"text": "O cão em forma de gente."}], } ], "word": "cão",