From 049ee8dac80bfd3f865e368d4631773d3a041c75 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Thu, 12 Dec 2024 15:01:07 +0800 Subject: [PATCH 1/3] [it] extract Chinese and Japanese example lists --- src/wiktextract/extractor/it/example.py | 92 +++++++++++++++++++++--- src/wiktextract/extractor/it/models.py | 6 ++ src/wiktextract/extractor/it/pos.py | 13 +++- tests/test_it_example.py | 96 +++++++++++++++++++++++++ 4 files changed, 197 insertions(+), 10 deletions(-) diff --git a/src/wiktextract/extractor/it/example.py b/src/wiktextract/extractor/it/example.py index 6117b854..d8ce4ad6 100644 --- a/src/wiktextract/extractor/it/example.py +++ b/src/wiktextract/extractor/it/example.py @@ -1,24 +1,98 @@ -from wikitextprocessor import NodeKind, WikiNode +from wikitextprocessor import NodeKind, TemplateNode, WikiNode from ...page import clean_node from ...wxr_context import WiktextractContext +from ..ruby import extract_ruby from .models import Example, Sense def extract_example_list_item( - wxr: WiktextractContext, sense: Sense, list_item: WikiNode + wxr: WiktextractContext, sense: Sense, list_item: WikiNode, lang_code: str ) -> None: - example = Example() - for node in list_item.children: - if isinstance(node, WikiNode): + examples = [] + before_italic = True + text_nodes = [] + roman = "" + translation = "" + for index, node in enumerate(list_item.children): + if ( + isinstance(node, TemplateNode) + and node.template_name == "zh-tradsem" + ): + examples.extend(extract_zh_tradsem(wxr, node)) + elif isinstance(node, WikiNode): match node.kind: case NodeKind.ITALIC: - example.text = clean_node(wxr, sense, node) + if lang_code in ["zh", "ja"]: + if before_italic: + roman = clean_node(wxr, sense, node) + before_italic = False + else: + examples.append( + Example(text=clean_node(wxr, sense, node)) + ) case NodeKind.LIST: for tr_list_item in node.find_child(NodeKind.LIST_ITEM): - example.translation = clean_node( + translation = clean_node( wxr, sense, tr_list_item.children ) + case _ if lang_code in ["zh", "ja"]: + if before_italic: + text_nodes.append(node) + elif ( + isinstance(node, str) and lang_code in ["zh", "ja"] and "-" in node + ): + translation = clean_node( + wxr, + sense, + wxr.wtp.node_to_wikitext( + [node[node.index("-") + 1 :]] + + list_item.children[index + 1 :] + ), + ) + break + elif lang_code in ["zh", "ja"] and len(examples) == 0 and before_italic: + text_nodes.append(node) + + if lang_code in ["zh", "ja"] and len(examples) == 0 and len(text_nodes) > 0: + expanded_nodes = wxr.wtp.parse( + wxr.wtp.node_to_wikitext(text_nodes), expand_all=True + ) + example = Example() + example.ruby, node_without_ruby = extract_ruby( + wxr, expanded_nodes.children + ) + example.text = ( + clean_node(wxr, sense, node_without_ruby) + .replace(" ", "") + .strip("(") + ) + examples.append(example) + + for example in examples: + if roman != "": + example.roman = roman + if translation != "": + example.translation = translation + if example.text != "": + sense.examples.append(example) + + +def extract_zh_tradsem( + wxr: WiktextractContext, t_node: TemplateNode +) -> list[Example]: + # https://it.wiktionary.org/wiki/Template:zh-tradsem + examples = [] + for arg_index in [1, 2]: + arg_value = clean_node( + wxr, None, t_node.template_parameters.get(arg_index, "") + ).replace(" ", "") + if arg_value != "": + example = Example(text=arg_value) + if arg_index == 1: + example.tags.append("Traditional Chinese") + elif arg_index == 2: + example.tags.append("Simplified Chinese") + examples.append(example) - if 
example.text != "": - sense.examples.append(example) + return examples diff --git a/src/wiktextract/extractor/it/models.py b/src/wiktextract/extractor/it/models.py index 113da01c..7b374e96 100644 --- a/src/wiktextract/extractor/it/models.py +++ b/src/wiktextract/extractor/it/models.py @@ -14,6 +14,12 @@ class Example(ItalianBaseModel): text: str = "" translation: str = "" ref: str = "" + ruby: list[tuple[str, ...]] = Field( + default=[], description="Japanese Kanji and furigana" + ) + roman: str = "" + tags: list[str] = [] + raw_tags: list[str] = [] class Sense(ItalianBaseModel): diff --git a/src/wiktextract/extractor/it/pos.py b/src/wiktextract/extractor/it/pos.py index 590cbd56..91f8a9bc 100644 --- a/src/wiktextract/extractor/it/pos.py +++ b/src/wiktextract/extractor/it/pos.py @@ -47,7 +47,18 @@ def extract_gloss_list_item( elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST: if node.sarg.endswith("*"): for example_list_item in node.find_child(NodeKind.LIST_ITEM): - extract_example_list_item(wxr, sense, example_list_item) + extract_example_list_item( + wxr, sense, example_list_item, word_entry.lang_code + ) + elif ( + node.sarg.endswith(":") + and len(sense.examples) > 0 + and sense.examples[-1].translation == "" + ): + for tr_list_item in node.find_child(NodeKind.LIST_ITEM): + sense.examples[-1].translation = clean_node( + wxr, sense, tr_list_item.children + ) else: gloss_nodes.append(node) gloss_str = clean_node(wxr, sense, gloss_nodes) diff --git a/tests/test_it_example.py b/tests/test_it_example.py index ae66a81e..11b6747c 100644 --- a/tests/test_it_example.py +++ b/tests/test_it_example.py @@ -43,3 +43,99 @@ def test_list_example(self): } ], ) + + def test_all_in_one_line(self): + self.wxr.wtp.add_page("Template:-zh-", 10, "Cinese") + data = parse_page( + self.wxr, + "幼虫", + """== {{-zh-}} == +===Sostantivo=== +# larva +#* [[苍蝇]] [[的]]'''幼虫''' ''cāngyíng de '''yòuchóng''''' - [[larva]] di [[mosca]], [[bigattino]]""", + ) + self.assertEqual( + data[0]["senses"], + [ + { + "glosses": ["larva"], + "examples": [ + { + "text": "苍蝇的幼虫", + "roman": "cāngyíng de yòuchóng", + "translation": "larva di mosca, bigattino", + } + ], + } + ], + ) + + def test_ja_r(self): + self.wxr.wtp.add_page("Template:-ja-", 10, "Giapponese") + self.wxr.wtp.add_page( + "Template:ja-r", + 10, + """{{#switch:{{{1}}} +| 今 = [[今#Giapponese| (いま)]] +| 行く = [[行く#Giapponese| ()]] +| よ = [[よ#Giapponese|]] +}}""", + ) + data = parse_page( + self.wxr, + "行く", + """== {{-ja-}} == +===Verbo=== +# andare +#* {{ja-r|今|いま|rom=-}}'''{{ja-r|行く|いく|rom=-}}'''{{ja-r|よ|rom=-}}! 
(''ima '''iku''' yo!'') +#: ''sto '''andando'''!''""", + ) + self.assertEqual( + data[0]["senses"], + [ + { + "glosses": ["andare"], + "examples": [ + { + "text": "今行くよ!", + "roman": "ima iku yo!", + "translation": "sto andando!", + "ruby": [("今", "いま"), ("行", "い")], + } + ], + } + ], + ) + + def test_zh_tradsem(self): + self.wxr.wtp.add_page("Template:-zh-", 10, "Cinese") + data = parse_page( + self.wxr, + "可能", + """== {{-zh-}} == +===Aggettivo=== +# probabile +#* {{zh-tradsem|[[一]] [[個]]'''可能'''[[的]] [[事件]]|[[一]] [[个]]'''可能'''[[的]] [[事件]]}} ''yī ge '''kěnéng''' de shìjiàn'' - un [[evento]] [[possibile]]""", + ) + self.assertEqual( + data[0]["senses"], + [ + { + "glosses": ["probabile"], + "examples": [ + { + "text": "一個可能的事件", + "roman": "yī ge kěnéng de shìjiàn", + "translation": "un evento possibile", + "tags": ["Traditional Chinese"], + }, + { + "text": "一个可能的事件", + "roman": "yī ge kěnéng de shìjiàn", + "translation": "un evento possibile", + "tags": ["Simplified Chinese"], + }, + ], + } + ], + ) From 22ee0bfd400c29c9392ef927580d461bfbc0466d Mon Sep 17 00:00:00 2001 From: xxyzz Date: Thu, 12 Dec 2024 16:39:25 +0800 Subject: [PATCH 2/3] [it] extract translation section --- src/wiktextract/extractor/it/models.py | 14 ++++ src/wiktextract/extractor/it/page.py | 3 + src/wiktextract/extractor/it/translation.py | 85 +++++++++++++++++++++ tests/test_it_translation.py | 54 +++++++++++++ 4 files changed, 156 insertions(+) create mode 100644 src/wiktextract/extractor/it/translation.py create mode 100644 tests/test_it_translation.py diff --git a/src/wiktextract/extractor/it/models.py b/src/wiktextract/extractor/it/models.py index 7b374e96..b3e99345 100644 --- a/src/wiktextract/extractor/it/models.py +++ b/src/wiktextract/extractor/it/models.py @@ -30,6 +30,19 @@ class Sense(ItalianBaseModel): examples: list[Example] = [] +class Translation(ItalianBaseModel): + lang_code: str = Field( + default="", + description="Wiktionary language code of the translation term", + ) + lang: str = Field(default="", description="Translation language name") + word: str = Field(default="", description="Translation term") + sense: str = Field(default="", description="Translation gloss") + tags: list[str] = [] + raw_tags: list[str] = [] + roman: str = "" + + class WordEntry(ItalianBaseModel): model_config = ConfigDict(title="Italian Wiktionary") word: str = Field(description="Word string", min_length=1) @@ -41,3 +54,4 @@ class WordEntry(ItalianBaseModel): categories: list[str] = [] tags: list[str] = [] raw_tags: list[str] = [] + translations: list[Translation] = [] diff --git a/src/wiktextract/extractor/it/page.py b/src/wiktextract/extractor/it/page.py index 3be347cd..46b2b224 100644 --- a/src/wiktextract/extractor/it/page.py +++ b/src/wiktextract/extractor/it/page.py @@ -7,6 +7,7 @@ from .models import Sense, WordEntry from .pos import extract_pos_section from .section_titles import POS_DATA +from .translation import extract_translation_section def parse_section( @@ -18,6 +19,8 @@ def parse_section( title_text = clean_node(wxr, None, level_node.largs) if title_text in POS_DATA: extract_pos_section(wxr, page_data, base_data, level_node, title_text) + elif title_text == "Traduzione": + extract_translation_section(wxr, page_data, level_node) for next_level in level_node.find_child(LEVEL_KIND_FLAGS): parse_section(wxr, page_data, base_data, next_level) diff --git a/src/wiktextract/extractor/it/translation.py b/src/wiktextract/extractor/it/translation.py new file mode 100644 index 00000000..8467177c --- /dev/null +++ 
b/src/wiktextract/extractor/it/translation.py @@ -0,0 +1,85 @@ +import re + +from mediawiki_langcodes import name_to_code +from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode + +from ...page import clean_node +from ...wxr_context import WiktextractContext +from .models import Translation, WordEntry + + +def extract_translation_section( + wxr: WiktextractContext, + page_data: list[WordEntry], + level_node: LevelNode, +) -> None: + sense = "" + translations = [] + cats = {} + for node in level_node.children: + if isinstance(node, TemplateNode) and node.template_name == "Trad1": + sense = clean_node(wxr, cats, node.template_parameters.get(1, "")) + elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST: + for list_item in node.find_child(NodeKind.LIST_ITEM): + translations.extend( + extract_translation_list_item(wxr, list_item, sense) + ) + + for data in page_data: + if data.lang_code == page_data[-1].lang_code: + data.translations.extend(translations) + data.categories.extend(cats.get("categories", [])) + + +TR_GENDER_TAGS = { + "c": "common", + "f": "feminine", + "m": "masculine", + "n": "neuter", +} + + +def extract_translation_list_item( + wxr: WiktextractContext, list_item: WikiNode, sense: str +) -> list[Translation]: + translations = [] + lang_name = "unknown" + lang_code = "unknown" + before_colon = True + for index, node in enumerate(list_item.children): + if before_colon and isinstance(node, str) and ":" in node: + before_colon = False + lang_name = clean_node(wxr, None, list_item.children[:index]) + for n in list_item.children[:index]: + if isinstance(n, TemplateNode): + lang_code = n.template_name + break + if lang_code == "unknown": + new_lang_code = name_to_code(lang_name, "it") + if new_lang_code != "": + lang_code = new_lang_code + elif not before_colon and isinstance(node, WikiNode): + match node.kind: + case NodeKind.LINK: + word = clean_node(wxr, None, node) + if word != "": + translations.append( + Translation( + word=word, + sense=sense, + lang=lang_name, + lang_code=lang_code, + ) + ) + case NodeKind.ITALIC: + raw_tag = clean_node(wxr, None, node) + if raw_tag in TR_GENDER_TAGS and len(translations) > 0: + translations[-1].tags.append(TR_GENDER_TAGS[raw_tag]) + elif raw_tag != "" and len(translations) > 0: + translations[-1].raw_tags.append(raw_tag) + elif not before_colon and isinstance(node, str): + m = re.search(r"\((.+)\)", node) + if m is not None and len(translations) > 0: + translations[-1].roman = m.group(1) + + return translations diff --git a/tests/test_it_translation.py b/tests/test_it_translation.py new file mode 100644 index 00000000..5566da77 --- /dev/null +++ b/tests/test_it_translation.py @@ -0,0 +1,54 @@ +from unittest import TestCase + +from wikitextprocessor import Wtp + +from wiktextract.config import WiktionaryConfig +from wiktextract.extractor.it.page import parse_page +from wiktextract.wxr_context import WiktextractContext + + +class TestItGloss(TestCase): + maxDiff = None + + def setUp(self) -> None: + self.wxr = WiktextractContext( + Wtp(lang_code="it"), + WiktionaryConfig( + dump_file_lang_code="it", capture_language_codes=None + ), + ) + + def test_common_lists(self): + self.wxr.wtp.add_page("Template:-it-", 10, "Italiano") + self.wxr.wtp.add_page("Template:ar", 10, "arabo") + data = parse_page( + self.wxr, + "cane", + """== {{-it-}} == +===Sostantivo=== +# [[animale]] +===Traduzione=== +{{Trad1|animale}} +:*{{ar}}: [[كَلْب]] (kalb) ''m'' +:*[[romagnolo]]: [[chèn]] ''m''""", + ) + self.assertEqual( + 
data[0]["translations"], + [ + { + "word": "كَلْب", + "lang_code": "ar", + "lang": "arabo", + "roman": "kalb", + "tags": ["masculine"], + "sense": "animale", + }, + { + "word": "chèn", + "lang_code": "rgn", + "lang": "romagnolo", + "tags": ["masculine"], + "sense": "animale", + }, + ], + ) From 76e755704ee600f347f850dee5e1b5987e126a1b Mon Sep 17 00:00:00 2001 From: xxyzz Date: Thu, 12 Dec 2024 16:55:50 +0800 Subject: [PATCH 3/3] [it] override "-trad1-" translation section template --- src/wiktextract/data/overrides/it.json | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 src/wiktextract/data/overrides/it.json diff --git a/src/wiktextract/data/overrides/it.json b/src/wiktextract/data/overrides/it.json new file mode 100644 index 00000000..a0076d36 --- /dev/null +++ b/src/wiktextract/data/overrides/it.json @@ -0,0 +1,7 @@ +{ + "Template:-trad1-": { + "body": "===Traduzione===\n", + "namespace_id": 10, + "need_pre_expand": true + } +}
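
Note on how patch 3 ties into patch 2 (a sketch, not part of the series): on it.wiktionary the translation heading is presumably produced by the {{-trad1-}} section template rather than written as a literal ===Traduzione=== line, so the override above pre-expands the template into that heading, and the extract_translation_section() added in patch 2 can then match the "Traduzione" section title in page.py. Below is a minimal test-style sketch of the combined behaviour, modelled on tests/test_it_translation.py; the need_pre_expand keyword on Wtp.add_page() and the class/test names are assumptions standing in for however src/wiktextract/data/overrides/it.json is actually loaded.

# Sketch only, not part of the patches above: drives the translation heading
# through the overridden "-trad1-" template instead of a literal
# "===Traduzione===" line. The need_pre_expand keyword argument is an
# assumption about how the override would be registered in a test.
from unittest import TestCase

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.it.page import parse_page
from wiktextract.wxr_context import WiktextractContext


class TestItTrad1Override(TestCase):
    def setUp(self) -> None:
        self.wxr = WiktextractContext(
            Wtp(lang_code="it"),
            WiktionaryConfig(
                dump_file_lang_code="it", capture_language_codes=None
            ),
        )

    def test_trad1_section_template(self):
        self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
        # Same body as the override in it.json; pre-expansion is what turns
        # the bare template call into a level-3 heading before sectioning.
        self.wxr.wtp.add_page(
            "Template:-trad1-", 10, "===Traduzione===\n", need_pre_expand=True
        )
        data = parse_page(
            self.wxr,
            "cane",
            """== {{-it-}} ==
===Sostantivo===
# [[animale]]
{{-trad1-}}
{{Trad1|animale}}
:*[[romagnolo]]: [[chèn]] ''m''""",
        )
        self.assertEqual(data[0]["translations"][0]["word"], "chèn")
        self.assertEqual(data[0]["translations"][0]["sense"], "animale")

If the override file is picked up automatically when the extractor environment is set up, the explicit add_page() call for Template:-trad1- would be unnecessary; the sketch only spells it out to show what the override body does.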