From 14b24fba18661ef199b9907ed55e2bab6058b1fb Mon Sep 17 00:00:00 2001 From: xxyzz Date: Tue, 17 Dec 2024 10:23:27 +0800 Subject: [PATCH 1/5] [it] extract plain text tags in linkage list --- src/wiktextract/extractor/it/linkage.py | 4 +++- tests/test_it_linkage.py | 16 ++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/src/wiktextract/extractor/it/linkage.py b/src/wiktextract/extractor/it/linkage.py index 75abd5d5..484a84df 100644 --- a/src/wiktextract/extractor/it/linkage.py +++ b/src/wiktextract/extractor/it/linkage.py @@ -43,7 +43,9 @@ def extract_linkage_list_item( elif isinstance(node, str): for word_str in node.split(","): word_str = word_str.strip() - if word_str != "": + if word_str.startswith("(") and word_str.endswith(")"): + raw_tags.append(word_str.strip("()")) + elif word_str != "": linkages.append(Linkage(word=word_str, raw_tags=raw_tags)) raw_tags.clear() diff --git a/tests/test_it_linkage.py b/tests/test_it_linkage.py index 3aabea24..ea968969 100644 --- a/tests/test_it_linkage.py +++ b/tests/test_it_linkage.py @@ -42,3 +42,19 @@ def test_synonyms(self): {"word": "intenso"}, ], ) + + def test_text_tag(self): + self.wxr.wtp.add_page("Template:-it-", 10, "Italiano") + data = parse_page( + self.wxr, + "cane", + """== {{-it-}} == +===Sostantivo=== +# [[animale]] +===Iperonimi=== +* (dominio) [[eucariote]]""", + ) + self.assertEqual( + data[0]["hypernyms"], + [{"word": "eucariote", "raw_tags": ["dominio"]}], + ) From 22517f2aabb52ca4a48c3b0b53110ed3ef9124b4 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Tue, 17 Dec 2024 10:46:25 +0800 Subject: [PATCH 2/5] [it] extract proverb section --- src/wiktextract/extractor/it/linkage.py | 23 ++++++++++++++++++++++- src/wiktextract/extractor/it/models.py | 1 + tests/test_it_linkage.py | 21 +++++++++++++++++++++ 3 files changed, 44 insertions(+), 1 deletion(-) diff --git a/src/wiktextract/extractor/it/linkage.py b/src/wiktextract/extractor/it/linkage.py index 484a84df..0314fd99 100644 --- 
a/src/wiktextract/extractor/it/linkage.py +++ b/src/wiktextract/extractor/it/linkage.py @@ -14,7 +14,11 @@ def extract_linkage_section( linkages = [] for list_node in level_node.find_child(NodeKind.LIST): for list_item in list_node.find_child(NodeKind.LIST_ITEM): - linkages.extend(extract_linkage_list_item(wxr, list_item)) + linkages.extend( + extract_proverb_list_item(wxr, list_item) + if linkage_type == "proverbs" + else extract_linkage_list_item(wxr, list_item) + ) for data in page_data: if data.lang_code == page_data[-1].lang_code: @@ -50,3 +54,20 @@ def extract_linkage_list_item( raw_tags.clear() return linkages + + +def extract_proverb_list_item( + wxr: WiktextractContext, list_item: WikiNode +) -> list[Linkage]: + proverb = Linkage(word="") + for index, node in enumerate(list_item.children): + if isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC: + proverb.word = clean_node(wxr, None, node) + elif isinstance(node, str) and ":" in node: + proverb.sense = clean_node( + wxr, + None, + [node[node.index(":") + 1 :]] + list_item.children[index + 1 :], + ) + break + return [proverb] if proverb.word != "" else [] diff --git a/src/wiktextract/extractor/it/models.py b/src/wiktextract/extractor/it/models.py index 7ba272a7..44cf2f7f 100644 --- a/src/wiktextract/extractor/it/models.py +++ b/src/wiktextract/extractor/it/models.py @@ -72,6 +72,7 @@ class Linkage(ItalianBaseModel): word: str tags: list[str] = [] raw_tags: list[str] = [] + sense: str = "" class WordEntry(ItalianBaseModel): diff --git a/tests/test_it_linkage.py b/tests/test_it_linkage.py index ea968969..f1cf5c23 100644 --- a/tests/test_it_linkage.py +++ b/tests/test_it_linkage.py @@ -58,3 +58,24 @@ def test_text_tag(self): data[0]["hypernyms"], [{"word": "eucariote", "raw_tags": ["dominio"]}], ) + + def test_proverbs(self): + self.wxr.wtp.add_page("Template:-it-", 10, "Italiano") + data = parse_page( + self.wxr, + "cane", + """== {{-it-}} == +===Sostantivo=== +# [[animale]] +===Proverbi e modi di 
dire=== +* ''Menare il '''can''' per l'aia'': tergiversare, prendere tempo""", + ) + self.assertEqual( + data[0]["proverbs"], + [ + { + "word": "Menare il can per l'aia", + "sense": "tergiversare, prendere tempo", + } + ], + ) From 1645b6f12403a1422b40fe4c7bcddd8f7c06dd12 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Tue, 17 Dec 2024 12:17:43 +0800 Subject: [PATCH 3/5] [it] override "Template:-verb-" its contents are inside "noinclude" tag --- src/wiktextract/data/overrides/it.json | 5 +++++ src/wiktextract/extractor/it/translation.py | 1 + tests/test_it_etymology.py | 2 +- tests/test_it_translation.py | 2 +- 4 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/wiktextract/data/overrides/it.json b/src/wiktextract/data/overrides/it.json index e02978bd..42fd7447 100644 --- a/src/wiktextract/data/overrides/it.json +++ b/src/wiktextract/data/overrides/it.json @@ -8,5 +8,10 @@ "body": "===Note / Riferimenti===\n", "namespace_id": 10, "need_pre_expand": true + }, + "Template:-verb-": { + "body": "{{Sezione voce|Immagine=Open_book_01.svg|Dimensione=30px|Sezione=verbo|Sezione al plurale=verbi|Genere=m|Lingua={{{1|}}}}}{{#invoke:Categorizzazione verbi italiani|main|{{{1|}}}}}", + "namespace_id": 10, + "need_pre_expand": true } } diff --git a/src/wiktextract/extractor/it/translation.py b/src/wiktextract/extractor/it/translation.py index 8467177c..e1cde868 100644 --- a/src/wiktextract/extractor/it/translation.py +++ b/src/wiktextract/extractor/it/translation.py @@ -13,6 +13,7 @@ def extract_translation_section( page_data: list[WordEntry], level_node: LevelNode, ) -> None: + # https://it.wiktionary.org/wiki/Aiuto:Traduzioni sense = "" translations = [] cats = {} diff --git a/tests/test_it_etymology.py b/tests/test_it_etymology.py index e228a363..110bc18a 100644 --- a/tests/test_it_etymology.py +++ b/tests/test_it_etymology.py @@ -7,7 +7,7 @@ from wiktextract.wxr_context import WiktextractContext -class TestItGloss(TestCase): +class TestItEtymology(TestCase): maxDiff 
= None def setUp(self) -> None: diff --git a/tests/test_it_translation.py b/tests/test_it_translation.py index 5566da77..c91ee5e6 100644 --- a/tests/test_it_translation.py +++ b/tests/test_it_translation.py @@ -7,7 +7,7 @@ from wiktextract.wxr_context import WiktextractContext -class TestItGloss(TestCase): +class TestItTranslation(TestCase): maxDiff = None def setUp(self) -> None: From b7ab69f1ec2eded41ab5968c9420b348692f021d Mon Sep 17 00:00:00 2001 From: xxyzz Date: Tue, 17 Dec 2024 15:34:57 +0800 Subject: [PATCH 4/5] [it] handle pos subsection templates --- src/wiktextract/extractor/it/pos.py | 37 +++++++++- tests/test_it_gloss.py | 104 ++++++++++++++++++++++++++++ 2 files changed, 140 insertions(+), 1 deletion(-) diff --git a/src/wiktextract/extractor/it/pos.py b/src/wiktextract/extractor/it/pos.py index f863ff48..c8b59530 100644 --- a/src/wiktextract/extractor/it/pos.py +++ b/src/wiktextract/extractor/it/pos.py @@ -7,8 +7,24 @@ from .section_titles import POS_DATA from .tag_form_line import extract_tag_form_line_nodes +# https://it.wiktionary.org/wiki/Categoria:Template_per_i_verbi +POS_SUBSECTION_TEMPLATES = frozenset( + [ + "-participio passato-", + "-participio presente-", + "Ausiliare", + "Deponente", + "Intransitivo", + "Medio", + "Passivo", + "Reciproco", + "Riflessivo", + "Transitivo", + ] +) -def extract_pos_section( + +def add_new_pos_data( wxr: WiktextractContext, page_data: list[WordEntry], base_data: WordEntry, @@ -23,6 +39,15 @@ def extract_pos_section( for link_node in level_node.find_child(NodeKind.LINK): clean_node(wxr, page_data[-1], link_node) + +def extract_pos_section( + wxr: WiktextractContext, + page_data: list[WordEntry], + base_data: WordEntry, + level_node: LevelNode, + pos_title: str, +) -> None: + add_new_pos_data(wxr, page_data, base_data, level_node, pos_title) first_gloss_list_index = len(level_node.children) for index, node in enumerate(level_node.children): if ( @@ -35,6 +60,16 @@ def extract_pos_section( 
extract_gloss_list_item(wxr, page_data[-1], list_item) if index < first_gloss_list_index: first_gloss_list_index = index + elif ( + isinstance(node, TemplateNode) + and node.template_name in POS_SUBSECTION_TEMPLATES + ): + if len(page_data[-1].senses) > 0: + add_new_pos_data( + wxr, page_data, base_data, level_node, pos_title + ) + raw_tag = clean_node(wxr, page_data[-1], node).strip("= \n") + page_data[-1].raw_tags.append(raw_tag) extract_tag_form_line_nodes( wxr, page_data[-1], level_node.children[:first_gloss_list_index] diff --git a/tests/test_it_gloss.py b/tests/test_it_gloss.py index 2863cbfd..35774147 100644 --- a/tests/test_it_gloss.py +++ b/tests/test_it_gloss.py @@ -54,3 +54,107 @@ def test_gloss_list(self): } ], ) + + def test_double_pos_subsection_templates(self): + self.wxr.wtp.add_page("Template:-la-", 10, "Latino") + self.wxr.wtp.add_page( + "Template:Intransitivo", + 10, + """====[[intransitivo|Intransitivo]]==== +[[Categoria:Verbi intransitivi_in_latino]]""", + ) + self.wxr.wtp.add_page( + "Template:Deponente", + 10, + """====[[deponente|Deponente]]==== +[[Categoria:Verbi deponenti_in_latino]]""", + ) + data = parse_page( + self.wxr, + "aboriscor", + """== {{-la-}} == +===[[Image:Open_book_01.svg|30px|]]''[[verbo|Verbo]]''=== +[[Categoria:Verbi in latino]] +{{Intransitivo|la}} +{{Deponente|la}} +'''ăbŏriscor''' + +# [[venir]] [[meno]]""", + ) + self.assertEqual( + data, + [ + { + "lang": "Latino", + "lang_code": "la", + "word": "aboriscor", + "pos": "verb", + "pos_title": "Verbo", + "categories": [ + "Verbi in latino", + "Verbi intransitivi_in_latino", + "Verbi deponenti_in_latino", + ], + "senses": [{"glosses": ["venir meno"]}], + "raw_tags": ["Intransitivo", "Deponente"], + } + ], + ) + + def test_subsecton_template_add_new_word_entry(self): + self.wxr.wtp.add_page("Template:-it-", 10, "Italiano") + self.wxr.wtp.add_page( + "Template:Ausiliare", + 10, + """====[[ausiliare|Ausiliare]]==== +[[Categoria:Verbi ausiliari_in_italiano]]""", + ) + 
self.wxr.wtp.add_page( + "Template:Intransitivo", + 10, + """====[[intransitivo|Intransitivo]]==== +[[Categoria:Verbi intransitivi_in_latino]]""", + ) + data = parse_page( + self.wxr, + "essere", + """== {{-it-}} == +===[[Image:Open_book_01.svg|30px|]]''[[verbo|Verbo]]''=== +[[Categoria:Verbi in italiano]] +{{Ausiliare|it}} +# serve per la coniugazione + +{{Intransitivo|it}} +# Questo verbo serve per dire""", + ) + self.assertEqual( + data, + [ + { + "lang": "Italiano", + "lang_code": "it", + "word": "essere", + "pos": "verb", + "pos_title": "Verbo", + "categories": [ + "Verbi in italiano", + "Verbi ausiliari_in_italiano", + ], + "senses": [{"glosses": ["serve per la coniugazione"]}], + "raw_tags": ["Ausiliare"], + }, + { + "lang": "Italiano", + "lang_code": "it", + "word": "essere", + "pos": "verb", + "pos_title": "Verbo", + "categories": [ + "Verbi in italiano", + "Verbi intransitivi_in_latino", + ], + "senses": [{"glosses": ["Questo verbo serve per dire"]}], + "raw_tags": ["Intransitivo"], + }, + ], + ) From d533e569fdd61a8bad6bc6d26079aa738f097161 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Tue, 17 Dec 2024 16:58:51 +0800 Subject: [PATCH 5/5] [it] handle example list with more than one italic nodes layout --- src/wiktextract/extractor/it/example.py | 44 ++++++++++++-- src/wiktextract/extractor/it/pos.py | 12 ++-- tests/test_it_example.py | 76 +++++++++++++++++++++++++ 3 files changed, 122 insertions(+), 10 deletions(-) diff --git a/src/wiktextract/extractor/it/example.py b/src/wiktextract/extractor/it/example.py index d8ce4ad6..10b56d58 100644 --- a/src/wiktextract/extractor/it/example.py +++ b/src/wiktextract/extractor/it/example.py @@ -14,12 +14,15 @@ def extract_example_list_item( text_nodes = [] roman = "" translation = "" + ref = "" + has_zh_tradsem = False for index, node in enumerate(list_item.children): if ( isinstance(node, TemplateNode) and node.template_name == "zh-tradsem" ): examples.extend(extract_zh_tradsem(wxr, node)) + has_zh_tradsem = True elif 
isinstance(node, WikiNode): match node.kind: case NodeKind.ITALIC: @@ -39,17 +42,38 @@ def extract_example_list_item( case _ if lang_code in ["zh", "ja"]: if before_italic: text_nodes.append(node) - elif ( - isinstance(node, str) and lang_code in ["zh", "ja"] and "-" in node - ): + elif isinstance(node, str) and "-" in node: + for t_node in list_item.find_child(NodeKind.TEMPLATE): + if t_node.template_name == "Term": + ref = clean_node(wxr, None, t_node).strip("()") + break translation = clean_node( wxr, sense, wxr.wtp.node_to_wikitext( [node[node.index("-") + 1 :]] - + list_item.children[index + 1 :] + + [ + n + for n in list_item.children[index + 1 :] + if not ( + isinstance(n, TemplateNode) + and n.template_name == "Term" + ) + ] ), ) + if not has_zh_tradsem and len(examples) > 1: + examples.clear() + examples.append( + Example( + text=clean_node( + wxr, + None, + list_item.children[:index] + + [node[: node.index("-")]], + ) + ) + ) break elif lang_code in ["zh", "ja"] and len(examples) == 0 and before_italic: text_nodes.append(node) @@ -69,11 +93,23 @@ def extract_example_list_item( ) examples.append(example) + if not has_zh_tradsem and len(examples) > 1: + examples.clear() + examples.append( + Example( + text=clean_node( + wxr, None, list(list_item.invert_find_child(NodeKind.LIST)) + ) + ) + ) + for example in examples: if roman != "": example.roman = roman if translation != "": example.translation = translation + if ref != "": + example.ref = ref if example.text != "": sense.examples.append(example) diff --git a/src/wiktextract/extractor/it/pos.py b/src/wiktextract/extractor/it/pos.py index c8b59530..bded7d5c 100644 --- a/src/wiktextract/extractor/it/pos.py +++ b/src/wiktextract/extractor/it/pos.py @@ -91,12 +91,7 @@ def extract_gloss_list_item( else: gloss_nodes.append(t_str) elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST: - if node.sarg.endswith("*"): - for example_list_item in node.find_child(NodeKind.LIST_ITEM): - 
extract_example_list_item( - wxr, sense, example_list_item, word_entry.lang_code - ) - elif ( + if ( node.sarg.endswith(":") and len(sense.examples) > 0 and sense.examples[-1].translation == "" @@ -105,6 +100,11 @@ def extract_gloss_list_item( sense.examples[-1].translation = clean_node( wxr, sense, tr_list_item.children ) + elif node.sarg.endswith(("*", ":")): + for example_list_item in node.find_child(NodeKind.LIST_ITEM): + extract_example_list_item( + wxr, sense, example_list_item, word_entry.lang_code + ) else: gloss_nodes.append(node) gloss_str = clean_node(wxr, sense, gloss_nodes) diff --git a/tests/test_it_example.py b/tests/test_it_example.py index 11b6747c..e8079358 100644 --- a/tests/test_it_example.py +++ b/tests/test_it_example.py @@ -139,3 +139,79 @@ def test_zh_tradsem(self): } ], ) + + def test_double_italic_nodes_with_translation(self): + self.wxr.wtp.add_page("Template:-en-", 10, "Inglese") + data = parse_page( + self.wxr, + "water", + """== {{-en-}} == +===Sostantivo=== +# acqua +#: ''May I have a glass of '''water'''?'' - ''Posso avere un bicchiere d''''acqua'''''?""", + ) + self.assertEqual( + data[0]["senses"], + [ + { + "glosses": ["acqua"], + "examples": [ + { + "text": "May I have a glass of water?", + "translation": "Posso avere un bicchiere d'acqua?", + } + ], + } + ], + ) + + def test_double_italic_nodes_no_translation(self): + self.wxr.wtp.add_page("Template:-it-", 10, "Italiano") + data = parse_page( + self.wxr, + "essere", + """== {{-it-}} == +===Sostantivo=== +#chi [[esiste]] +#* ''gli '''esseri''' viventi''; ''gli '''esseri''' animati''""", + ) + self.assertEqual( + data[0]["senses"], + [ + { + "glosses": ["chi esiste"], + "examples": [ + {"text": "gli esseri viventi; gli esseri animati"} + ], + } + ], + ) + + def test_term_ref_template(self): + self.wxr.wtp.add_page("Template:-la-", 10, "Latino") + self.wxr.wtp.add_page("Template:Term", 10, "({{{1}}})") + data = parse_page( + self.wxr, + "libero", + """== {{-la-}} == +===Verbo=== +# 
[[assolvere]], [[liberare]] dalle [[accuse]], [[giudicare]] [[innocente]] +#* ''et eum omni [[ignominia]] '''liberat''''' - e lo [[assolve]] da ogni [[ignominia]] {{Term|[[:w:Marco Tullio Cicerone|Cicerone]], [[:w:Pro Cluentio|Pro Cluentio]], [[:s:la:Pro_Aulo_Cluentio_Habito|XLVII, 132]]}}""", + ) + self.assertEqual( + data[0]["senses"], + [ + { + "glosses": [ + "assolvere, liberare dalle accuse, giudicare innocente" + ], + "examples": [ + { + "text": "et eum omni ignominia liberat", + "translation": "e lo assolve da ogni ignominia", + "ref": "Cicerone, Pro Cluentio, XLVII, 132", + } + ], + } + ], + )