diff --git a/src/wiktextract/data/overrides/it.json b/src/wiktextract/data/overrides/it.json
new file mode 100644
index 00000000..a0076d36
--- /dev/null
+++ b/src/wiktextract/data/overrides/it.json
@@ -0,0 +1,7 @@
+{
+ "Template:-trad1-": {
+ "body": "===Traduzione===\n",
+ "namespace_id": 10,
+ "need_pre_expand": true
+ }
+}
diff --git a/src/wiktextract/extractor/it/example.py b/src/wiktextract/extractor/it/example.py
index 6117b854..d8ce4ad6 100644
--- a/src/wiktextract/extractor/it/example.py
+++ b/src/wiktextract/extractor/it/example.py
@@ -1,24 +1,98 @@
-from wikitextprocessor import NodeKind, WikiNode
+from wikitextprocessor import NodeKind, TemplateNode, WikiNode
from ...page import clean_node
from ...wxr_context import WiktextractContext
+from ..ruby import extract_ruby
from .models import Example, Sense
def extract_example_list_item(
- wxr: WiktextractContext, sense: Sense, list_item: WikiNode
+ wxr: WiktextractContext, sense: Sense, list_item: WikiNode, lang_code: str
) -> None:
- example = Example()
- for node in list_item.children:
- if isinstance(node, WikiNode):
+ examples = []
+ before_italic = True
+ text_nodes = []
+ roman = ""
+ translation = ""
+ for index, node in enumerate(list_item.children):
+ if (
+ isinstance(node, TemplateNode)
+ and node.template_name == "zh-tradsem"
+ ):
+ examples.extend(extract_zh_tradsem(wxr, node))
+ elif isinstance(node, WikiNode):
match node.kind:
case NodeKind.ITALIC:
- example.text = clean_node(wxr, sense, node)
+ if lang_code in ["zh", "ja"]:
+ if before_italic:
+ roman = clean_node(wxr, sense, node)
+ before_italic = False
+ else:
+ examples.append(
+ Example(text=clean_node(wxr, sense, node))
+ )
case NodeKind.LIST:
for tr_list_item in node.find_child(NodeKind.LIST_ITEM):
- example.translation = clean_node(
+ translation = clean_node(
wxr, sense, tr_list_item.children
)
+ case _ if lang_code in ["zh", "ja"]:
+ if before_italic:
+ text_nodes.append(node)
+ elif (
+ isinstance(node, str) and lang_code in ["zh", "ja"] and "-" in node
+ ):
+ translation = clean_node(
+ wxr,
+ sense,
+ wxr.wtp.node_to_wikitext(
+ [node[node.index("-") + 1 :]]
+ + list_item.children[index + 1 :]
+ ),
+ )
+ break
+ elif lang_code in ["zh", "ja"] and len(examples) == 0 and before_italic:
+ text_nodes.append(node)
+
+ if lang_code in ["zh", "ja"] and len(examples) == 0 and len(text_nodes) > 0:
+ expanded_nodes = wxr.wtp.parse(
+ wxr.wtp.node_to_wikitext(text_nodes), expand_all=True
+ )
+ example = Example()
+ example.ruby, node_without_ruby = extract_ruby(
+ wxr, expanded_nodes.children
+ )
+ example.text = (
+ clean_node(wxr, sense, node_without_ruby)
+ .replace(" ", "")
+ .strip("(")
+ )
+ examples.append(example)
+
+ for example in examples:
+ if roman != "":
+ example.roman = roman
+ if translation != "":
+ example.translation = translation
+ if example.text != "":
+ sense.examples.append(example)
+
+
+def extract_zh_tradsem(
+ wxr: WiktextractContext, t_node: TemplateNode
+) -> list[Example]:
+ # https://it.wiktionary.org/wiki/Template:zh-tradsem
+ examples = []
+ for arg_index in [1, 2]:
+ arg_value = clean_node(
+ wxr, None, t_node.template_parameters.get(arg_index, "")
+ ).replace(" ", "")
+ if arg_value != "":
+ example = Example(text=arg_value)
+ if arg_index == 1:
+ example.tags.append("Traditional Chinese")
+ elif arg_index == 2:
+ example.tags.append("Simplified Chinese")
+ examples.append(example)
- if example.text != "":
- sense.examples.append(example)
+ return examples
diff --git a/src/wiktextract/extractor/it/models.py b/src/wiktextract/extractor/it/models.py
index 113da01c..b3e99345 100644
--- a/src/wiktextract/extractor/it/models.py
+++ b/src/wiktextract/extractor/it/models.py
@@ -14,6 +14,12 @@ class Example(ItalianBaseModel):
text: str = ""
translation: str = ""
ref: str = ""
+ ruby: list[tuple[str, ...]] = Field(
+ default=[], description="Japanese Kanji and furigana"
+ )
+ roman: str = ""
+ tags: list[str] = []
+ raw_tags: list[str] = []
class Sense(ItalianBaseModel):
@@ -24,6 +30,19 @@ class Sense(ItalianBaseModel):
examples: list[Example] = []
+class Translation(ItalianBaseModel):
+ lang_code: str = Field(
+ default="",
+ description="Wiktionary language code of the translation term",
+ )
+ lang: str = Field(default="", description="Translation language name")
+ word: str = Field(default="", description="Translation term")
+ sense: str = Field(default="", description="Translation gloss")
+ tags: list[str] = []
+ raw_tags: list[str] = []
+ roman: str = ""
+
+
class WordEntry(ItalianBaseModel):
model_config = ConfigDict(title="Italian Wiktionary")
word: str = Field(description="Word string", min_length=1)
@@ -35,3 +54,4 @@ class WordEntry(ItalianBaseModel):
categories: list[str] = []
tags: list[str] = []
raw_tags: list[str] = []
+ translations: list[Translation] = []
diff --git a/src/wiktextract/extractor/it/page.py b/src/wiktextract/extractor/it/page.py
index 3be347cd..46b2b224 100644
--- a/src/wiktextract/extractor/it/page.py
+++ b/src/wiktextract/extractor/it/page.py
@@ -7,6 +7,7 @@
from .models import Sense, WordEntry
from .pos import extract_pos_section
from .section_titles import POS_DATA
+from .translation import extract_translation_section
def parse_section(
@@ -18,6 +19,8 @@ def parse_section(
title_text = clean_node(wxr, None, level_node.largs)
if title_text in POS_DATA:
extract_pos_section(wxr, page_data, base_data, level_node, title_text)
+ elif title_text == "Traduzione":
+ extract_translation_section(wxr, page_data, level_node)
for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
parse_section(wxr, page_data, base_data, next_level)
diff --git a/src/wiktextract/extractor/it/pos.py b/src/wiktextract/extractor/it/pos.py
index 590cbd56..91f8a9bc 100644
--- a/src/wiktextract/extractor/it/pos.py
+++ b/src/wiktextract/extractor/it/pos.py
@@ -47,7 +47,18 @@ def extract_gloss_list_item(
elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
if node.sarg.endswith("*"):
for example_list_item in node.find_child(NodeKind.LIST_ITEM):
- extract_example_list_item(wxr, sense, example_list_item)
+ extract_example_list_item(
+ wxr, sense, example_list_item, word_entry.lang_code
+ )
+ elif (
+ node.sarg.endswith(":")
+ and len(sense.examples) > 0
+ and sense.examples[-1].translation == ""
+ ):
+ for tr_list_item in node.find_child(NodeKind.LIST_ITEM):
+ sense.examples[-1].translation = clean_node(
+ wxr, sense, tr_list_item.children
+ )
else:
gloss_nodes.append(node)
gloss_str = clean_node(wxr, sense, gloss_nodes)
diff --git a/src/wiktextract/extractor/it/translation.py b/src/wiktextract/extractor/it/translation.py
new file mode 100644
index 00000000..8467177c
--- /dev/null
+++ b/src/wiktextract/extractor/it/translation.py
@@ -0,0 +1,85 @@
+import re
+
+from mediawiki_langcodes import name_to_code
+from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
+from .models import Translation, WordEntry
+
+
+def extract_translation_section(
+ wxr: WiktextractContext,
+ page_data: list[WordEntry],
+ level_node: LevelNode,
+) -> None:
+ sense = ""
+ translations = []
+ cats = {}
+ for node in level_node.children:
+ if isinstance(node, TemplateNode) and node.template_name == "Trad1":
+ sense = clean_node(wxr, cats, node.template_parameters.get(1, ""))
+ elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
+ for list_item in node.find_child(NodeKind.LIST_ITEM):
+ translations.extend(
+ extract_translation_list_item(wxr, list_item, sense)
+ )
+
+ for data in page_data:
+ if data.lang_code == page_data[-1].lang_code:
+ data.translations.extend(translations)
+ data.categories.extend(cats.get("categories", []))
+
+
+TR_GENDER_TAGS = {
+ "c": "common",
+ "f": "feminine",
+ "m": "masculine",
+ "n": "neuter",
+}
+
+
+def extract_translation_list_item(
+ wxr: WiktextractContext, list_item: WikiNode, sense: str
+) -> list[Translation]:
+ translations = []
+ lang_name = "unknown"
+ lang_code = "unknown"
+ before_colon = True
+ for index, node in enumerate(list_item.children):
+ if before_colon and isinstance(node, str) and ":" in node:
+ before_colon = False
+ lang_name = clean_node(wxr, None, list_item.children[:index])
+ for n in list_item.children[:index]:
+ if isinstance(n, TemplateNode):
+ lang_code = n.template_name
+ break
+ if lang_code == "unknown":
+ new_lang_code = name_to_code(lang_name, "it")
+ if new_lang_code != "":
+ lang_code = new_lang_code
+ elif not before_colon and isinstance(node, WikiNode):
+ match node.kind:
+ case NodeKind.LINK:
+ word = clean_node(wxr, None, node)
+ if word != "":
+ translations.append(
+ Translation(
+ word=word,
+ sense=sense,
+ lang=lang_name,
+ lang_code=lang_code,
+ )
+ )
+ case NodeKind.ITALIC:
+ raw_tag = clean_node(wxr, None, node)
+ if raw_tag in TR_GENDER_TAGS and len(translations) > 0:
+ translations[-1].tags.append(TR_GENDER_TAGS[raw_tag])
+ elif raw_tag != "" and len(translations) > 0:
+ translations[-1].raw_tags.append(raw_tag)
+ elif not before_colon and isinstance(node, str):
+ m = re.search(r"\((.+)\)", node)
+ if m is not None and len(translations) > 0:
+ translations[-1].roman = m.group(1)
+
+ return translations
diff --git a/tests/test_it_example.py b/tests/test_it_example.py
index ae66a81e..11b6747c 100644
--- a/tests/test_it_example.py
+++ b/tests/test_it_example.py
@@ -43,3 +43,99 @@ def test_list_example(self):
}
],
)
+
+ def test_all_in_one_line(self):
+ self.wxr.wtp.add_page("Template:-zh-", 10, "Cinese")
+ data = parse_page(
+ self.wxr,
+ "幼虫",
+ """== {{-zh-}} ==
+===Sostantivo===
+# larva
+#* [[苍蝇]] [[的]]'''幼虫''' ''cāngyíng de '''yòuchóng''''' - [[larva]] di [[mosca]], [[bigattino]]""",
+ )
+ self.assertEqual(
+ data[0]["senses"],
+ [
+ {
+ "glosses": ["larva"],
+ "examples": [
+ {
+ "text": "苍蝇的幼虫",
+ "roman": "cāngyíng de yòuchóng",
+ "translation": "larva di mosca, bigattino",
+ }
+ ],
+ }
+ ],
+ )
+
+ def test_ja_r(self):
+ self.wxr.wtp.add_page("Template:-ja-", 10, "Giapponese")
+ self.wxr.wtp.add_page(
+ "Template:ja-r",
+ 10,
+ """{{#switch:{{{1}}}
+| 今 = [[今#Giapponese|今]]
+| 行く = [[行く#Giapponese|行く]]
+| よ = [[よ#Giapponese|よ]]
+}}""",
+ )
+ data = parse_page(
+ self.wxr,
+ "行く",
+ """== {{-ja-}} ==
+===Verbo===
+# andare
+#* {{ja-r|今|いま|rom=-}}'''{{ja-r|行く|いく|rom=-}}'''{{ja-r|よ|rom=-}}! (''ima '''iku''' yo!'')
+#: ''sto '''andando'''!''""",
+ )
+ self.assertEqual(
+ data[0]["senses"],
+ [
+ {
+ "glosses": ["andare"],
+ "examples": [
+ {
+ "text": "今行くよ!",
+ "roman": "ima iku yo!",
+ "translation": "sto andando!",
+ "ruby": [("今", "いま"), ("行", "い")],
+ }
+ ],
+ }
+ ],
+ )
+
+ def test_zh_tradsem(self):
+ self.wxr.wtp.add_page("Template:-zh-", 10, "Cinese")
+ data = parse_page(
+ self.wxr,
+ "可能",
+ """== {{-zh-}} ==
+===Aggettivo===
+# probabile
+#* {{zh-tradsem|[[一]] [[個]]'''可能'''[[的]] [[事件]]|[[一]] [[个]]'''可能'''[[的]] [[事件]]}} ''yī ge '''kěnéng''' de shìjiàn'' - un [[evento]] [[possibile]]""",
+ )
+ self.assertEqual(
+ data[0]["senses"],
+ [
+ {
+ "glosses": ["probabile"],
+ "examples": [
+ {
+ "text": "一個可能的事件",
+ "roman": "yī ge kěnéng de shìjiàn",
+ "translation": "un evento possibile",
+ "tags": ["Traditional Chinese"],
+ },
+ {
+ "text": "一个可能的事件",
+ "roman": "yī ge kěnéng de shìjiàn",
+ "translation": "un evento possibile",
+ "tags": ["Simplified Chinese"],
+ },
+ ],
+ }
+ ],
+ )
diff --git a/tests/test_it_translation.py b/tests/test_it_translation.py
new file mode 100644
index 00000000..5566da77
--- /dev/null
+++ b/tests/test_it_translation.py
@@ -0,0 +1,54 @@
+from unittest import TestCase
+
+from wikitextprocessor import Wtp
+
+from wiktextract.config import WiktionaryConfig
+from wiktextract.extractor.it.page import parse_page
+from wiktextract.wxr_context import WiktextractContext
+
+
+class TestItTranslation(TestCase):
+ maxDiff = None
+
+ def setUp(self) -> None:
+ self.wxr = WiktextractContext(
+ Wtp(lang_code="it"),
+ WiktionaryConfig(
+ dump_file_lang_code="it", capture_language_codes=None
+ ),
+ )
+
+ def test_common_lists(self):
+ self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
+ self.wxr.wtp.add_page("Template:ar", 10, "arabo")
+ data = parse_page(
+ self.wxr,
+ "cane",
+ """== {{-it-}} ==
+===Sostantivo===
+# [[animale]]
+===Traduzione===
+{{Trad1|animale}}
+:*{{ar}}: [[كَلْب]] (kalb) ''m''
+:*[[romagnolo]]: [[chèn]] ''m''""",
+ )
+ self.assertEqual(
+ data[0]["translations"],
+ [
+ {
+ "word": "كَلْب",
+ "lang_code": "ar",
+ "lang": "arabo",
+ "roman": "kalb",
+ "tags": ["masculine"],
+ "sense": "animale",
+ },
+ {
+ "word": "chèn",
+ "lang_code": "rgn",
+ "lang": "romagnolo",
+ "tags": ["masculine"],
+ "sense": "animale",
+ },
+ ],
+ )