Merge pull request #945 from xxyzz/it
[it] extract zh and ja example lists, extract translation section
xxyzz authored Dec 12, 2024
2 parents ba9f46d + 76e7557 commit 0ddf970
Showing 8 changed files with 360 additions and 10 deletions.
7 changes: 7 additions & 0 deletions src/wiktextract/data/overrides/it.json
@@ -0,0 +1,7 @@
+{
+    "Template:-trad1-": {
+        "body": "===Traduzione===\n",
+        "namespace_id": 10,
+        "need_pre_expand": true
+    }
+}
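
The override above is what makes the new "Traduzione" handling in page.py reachable: on it.wiktionary the translation block is introduced by the {{-trad1-}} template rather than by a literal heading, and "need_pre_expand": true marks the template for expansion before the section tree is built, so the block ends up under a regular ===Traduzione=== level-3 heading. The sketch below is not part of this commit; the sample wikitext is hypothetical and the plain string replacement stands in for the real pre-expansion machinery, purely to illustrate the intended effect.

# Illustration only: the override body replaces the template call before
# sections are split, so parse_section() later sees a "Traduzione" title.
OVERRIDE_BODY = "===Traduzione===\n"

# Hypothetical snippet of an Italian Wiktionary entry.
raw_wikitext = (
    "== {{-it-}} ==\n"
    "===Sostantivo===\n"
    "# definizione\n"
    "\n"
    "{{-trad1-}}\n"
    "* {{en}}: [[dog]]\n"
)

pre_expanded = raw_wikitext.replace("{{-trad1-}}\n", OVERRIDE_BODY)
print(pre_expanded)  # now contains a literal "===Traduzione===" heading
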
92 changes: 83 additions & 9 deletions src/wiktextract/extractor/it/example.py
@@ -1,24 +1,98 @@
-from wikitextprocessor import NodeKind, WikiNode
+from wikitextprocessor import NodeKind, TemplateNode, WikiNode
 
 from ...page import clean_node
 from ...wxr_context import WiktextractContext
+from ..ruby import extract_ruby
 from .models import Example, Sense
 
 
 def extract_example_list_item(
-    wxr: WiktextractContext, sense: Sense, list_item: WikiNode
+    wxr: WiktextractContext, sense: Sense, list_item: WikiNode, lang_code: str
 ) -> None:
-    example = Example()
-    for node in list_item.children:
-        if isinstance(node, WikiNode):
+    examples = []
+    before_italic = True
+    text_nodes = []
+    roman = ""
+    translation = ""
+    for index, node in enumerate(list_item.children):
+        if (
+            isinstance(node, TemplateNode)
+            and node.template_name == "zh-tradsem"
+        ):
+            examples.extend(extract_zh_tradsem(wxr, node))
+        elif isinstance(node, WikiNode):
             match node.kind:
                 case NodeKind.ITALIC:
-                    example.text = clean_node(wxr, sense, node)
+                    if lang_code in ["zh", "ja"]:
+                        if before_italic:
+                            roman = clean_node(wxr, sense, node)
+                            before_italic = False
+                    else:
+                        examples.append(
+                            Example(text=clean_node(wxr, sense, node))
+                        )
                 case NodeKind.LIST:
                     for tr_list_item in node.find_child(NodeKind.LIST_ITEM):
-                        example.translation = clean_node(
+                        translation = clean_node(
                             wxr, sense, tr_list_item.children
                         )
+                case _ if lang_code in ["zh", "ja"]:
+                    if before_italic:
+                        text_nodes.append(node)
+        elif (
+            isinstance(node, str) and lang_code in ["zh", "ja"] and "-" in node
+        ):
+            translation = clean_node(
+                wxr,
+                sense,
+                wxr.wtp.node_to_wikitext(
+                    [node[node.index("-") + 1 :]]
+                    + list_item.children[index + 1 :]
+                ),
+            )
+            break
+        elif lang_code in ["zh", "ja"] and len(examples) == 0 and before_italic:
+            text_nodes.append(node)
+
+    if lang_code in ["zh", "ja"] and len(examples) == 0 and len(text_nodes) > 0:
+        expanded_nodes = wxr.wtp.parse(
+            wxr.wtp.node_to_wikitext(text_nodes), expand_all=True
+        )
+        example = Example()
+        example.ruby, node_without_ruby = extract_ruby(
+            wxr, expanded_nodes.children
+        )
+        example.text = (
+            clean_node(wxr, sense, node_without_ruby)
+            .replace(" ", "")
+            .strip("(")
+        )
+        examples.append(example)
+
+    for example in examples:
+        if roman != "":
+            example.roman = roman
+        if translation != "":
+            example.translation = translation
+        if example.text != "":
+            sense.examples.append(example)
+
+
+def extract_zh_tradsem(
+    wxr: WiktextractContext, t_node: TemplateNode
+) -> list[Example]:
+    # https://it.wiktionary.org/wiki/Template:zh-tradsem
+    examples = []
+    for arg_index in [1, 2]:
+        arg_value = clean_node(
+            wxr, None, t_node.template_parameters.get(arg_index, "")
+        ).replace(" ", "")
+        if arg_value != "":
+            example = Example(text=arg_value)
+            if arg_index == 1:
+                example.tags.append("Traditional Chinese")
+            elif arg_index == 2:
+                example.tags.append("Simplified Chinese")
+            examples.append(example)
 
-    if example.text != "":
-        sense.examples.append(example)
+    return examples
20 changes: 20 additions & 0 deletions src/wiktextract/extractor/it/models.py
@@ -14,6 +14,12 @@ class Example(ItalianBaseModel):
     text: str = ""
     translation: str = ""
     ref: str = ""
+    ruby: list[tuple[str, ...]] = Field(
+        default=[], description="Japanese Kanji and furigana"
+    )
+    roman: str = ""
+    tags: list[str] = []
+    raw_tags: list[str] = []
 
 
 class Sense(ItalianBaseModel):
@@ -24,6 +30,19 @@ class Sense(ItalianBaseModel):
     examples: list[Example] = []
 
 
+class Translation(ItalianBaseModel):
+    lang_code: str = Field(
+        default="",
+        description="Wiktionary language code of the translation term",
+    )
+    lang: str = Field(default="", description="Translation language name")
+    word: str = Field(default="", description="Translation term")
+    sense: str = Field(default="", description="Translation gloss")
+    tags: list[str] = []
+    raw_tags: list[str] = []
+    roman: str = ""
+
+
 class WordEntry(ItalianBaseModel):
     model_config = ConfigDict(title="Italian Wiktionary")
     word: str = Field(description="Word string", min_length=1)
@@ -35,3 +54,4 @@ class WordEntry(ItalianBaseModel):
     categories: list[str] = []
     tags: list[str] = []
     raw_tags: list[str] = []
+    translations: list[Translation] = []
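
Since these are pydantic models (the Field and ConfigDict usage above comes from pydantic v2), a populated Translation entry serializes like the rest of the package's models. A minimal sketch, not part of the commit, assuming model_dump with exclude_defaults:

from wiktextract.extractor.it.models import Translation

t = Translation(lang="inglese", lang_code="en", word="dog", tags=["masculine"])
# Fields left at their defaults (sense, raw_tags, roman) are dropped,
# leaving roughly:
# {'lang_code': 'en', 'lang': 'inglese', 'word': 'dog', 'tags': ['masculine']}
print(t.model_dump(exclude_defaults=True))
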
3 changes: 3 additions & 0 deletions src/wiktextract/extractor/it/page.py
@@ -7,6 +7,7 @@
 from .models import Sense, WordEntry
 from .pos import extract_pos_section
 from .section_titles import POS_DATA
+from .translation import extract_translation_section
 
 
 def parse_section(
@@ -18,6 +19,8 @@ def parse_section(
     title_text = clean_node(wxr, None, level_node.largs)
     if title_text in POS_DATA:
         extract_pos_section(wxr, page_data, base_data, level_node, title_text)
+    elif title_text == "Traduzione":
+        extract_translation_section(wxr, page_data, level_node)
 
     for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
         parse_section(wxr, page_data, base_data, next_level)
13 changes: 12 additions & 1 deletion src/wiktextract/extractor/it/pos.py
@@ -47,7 +47,18 @@ def extract_gloss_list_item(
         elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
             if node.sarg.endswith("*"):
                 for example_list_item in node.find_child(NodeKind.LIST_ITEM):
-                    extract_example_list_item(wxr, sense, example_list_item)
+                    extract_example_list_item(
+                        wxr, sense, example_list_item, word_entry.lang_code
+                    )
+            elif (
+                node.sarg.endswith(":")
+                and len(sense.examples) > 0
+                and sense.examples[-1].translation == ""
+            ):
+                for tr_list_item in node.find_child(NodeKind.LIST_ITEM):
+                    sense.examples[-1].translation = clean_node(
+                        wxr, sense, tr_list_item.children
+                    )
         else:
             gloss_nodes.append(node)
     gloss_str = clean_node(wxr, sense, gloss_nodes)
85 changes: 85 additions & 0 deletions src/wiktextract/extractor/it/translation.py
@@ -0,0 +1,85 @@
+import re
+
+from mediawiki_langcodes import name_to_code
+from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
+from .models import Translation, WordEntry
+
+
+def extract_translation_section(
+    wxr: WiktextractContext,
+    page_data: list[WordEntry],
+    level_node: LevelNode,
+) -> None:
+    sense = ""
+    translations = []
+    cats = {}
+    for node in level_node.children:
+        if isinstance(node, TemplateNode) and node.template_name == "Trad1":
+            sense = clean_node(wxr, cats, node.template_parameters.get(1, ""))
+        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
+            for list_item in node.find_child(NodeKind.LIST_ITEM):
+                translations.extend(
+                    extract_translation_list_item(wxr, list_item, sense)
+                )
+
+    for data in page_data:
+        if data.lang_code == page_data[-1].lang_code:
+            data.translations.extend(translations)
+            data.categories.extend(cats.get("categories", []))
+
+
+TR_GENDER_TAGS = {
+    "c": "common",
+    "f": "feminine",
+    "m": "masculine",
+    "n": "neuter",
+}
+
+
+def extract_translation_list_item(
+    wxr: WiktextractContext, list_item: WikiNode, sense: str
+) -> list[Translation]:
+    translations = []
+    lang_name = "unknown"
+    lang_code = "unknown"
+    before_colon = True
+    for index, node in enumerate(list_item.children):
+        if before_colon and isinstance(node, str) and ":" in node:
+            before_colon = False
+            lang_name = clean_node(wxr, None, list_item.children[:index])
+            for n in list_item.children[:index]:
+                if isinstance(n, TemplateNode):
+                    lang_code = n.template_name
+                    break
+            if lang_code == "unknown":
+                new_lang_code = name_to_code(lang_name, "it")
+                if new_lang_code != "":
+                    lang_code = new_lang_code
+        elif not before_colon and isinstance(node, WikiNode):
+            match node.kind:
+                case NodeKind.LINK:
+                    word = clean_node(wxr, None, node)
+                    if word != "":
+                        translations.append(
+                            Translation(
+                                word=word,
+                                sense=sense,
+                                lang=lang_name,
+                                lang_code=lang_code,
+                            )
+                        )
+                case NodeKind.ITALIC:
+                    raw_tag = clean_node(wxr, None, node)
+                    if raw_tag in TR_GENDER_TAGS and len(translations) > 0:
+                        translations[-1].tags.append(TR_GENDER_TAGS[raw_tag])
+                    elif raw_tag != "" and len(translations) > 0:
+                        translations[-1].raw_tags.append(raw_tag)
+        elif not before_colon and isinstance(node, str):
+            m = re.search(r"\((.+)\)", node)
+            if m is not None and len(translations) > 0:
+                translations[-1].roman = m.group(1)
+
+    return translations
96 changes: 96 additions & 0 deletions tests/test_it_example.py
@@ -43,3 +43,99 @@ def test_list_example(self):
                 }
             ],
         )
+
+    def test_all_in_one_line(self):
+        self.wxr.wtp.add_page("Template:-zh-", 10, "Cinese")
+        data = parse_page(
+            self.wxr,
+            "幼虫",
+            """== {{-zh-}} ==
+===Sostantivo===
+# larva
+#* [[苍蝇]] [[的]]'''幼虫''' ''cāngyíng de '''yòuchóng''''' - [[larva]] di [[mosca]], [[bigattino]]""",
+        )
+        self.assertEqual(
+            data[0]["senses"],
+            [
+                {
+                    "glosses": ["larva"],
+                    "examples": [
+                        {
+                            "text": "苍蝇的幼虫",
+                            "roman": "cāngyíng de yòuchóng",
+                            "translation": "larva di mosca, bigattino",
+                        }
+                    ],
+                }
+            ],
+        )
+
+    def test_ja_r(self):
+        self.wxr.wtp.add_page("Template:-ja-", 10, "Giapponese")
+        self.wxr.wtp.add_page(
+            "Template:ja-r",
+            10,
+            """{{#switch:{{{1}}}
+| 今 = <span class="Jpan" lang="ja">[[今#Giapponese|<span><ruby>今<rp>&nbsp;(</rp><rt>いま</rt><rp>)</rp></ruby></span>]]</span>
+| 行く = <span class="Jpan" lang="ja">[[行く#Giapponese|<span><ruby>行<rp>&nbsp;(</rp><rt>い</rt><rp>)</rp></ruby>く</span>]]</span>
+| よ = <span class="Jpan" lang="ja">[[よ#Giapponese|<span>よ</span>]]</span>
+}}""",
+        )
+        data = parse_page(
+            self.wxr,
+            "行く",
+            """== {{-ja-}} ==
+===Verbo===
+# andare
+#* {{ja-r|今|いま|rom=-}}'''{{ja-r|行く|いく|rom=-}}'''{{ja-r|よ|rom=-}}! (''ima '''iku''' yo!'')
+#: ''sto '''andando'''!''""",
+        )
+        self.assertEqual(
+            data[0]["senses"],
+            [
+                {
+                    "glosses": ["andare"],
+                    "examples": [
+                        {
+                            "text": "今行くよ!",
+                            "roman": "ima iku yo!",
+                            "translation": "sto andando!",
+                            "ruby": [("今", "いま"), ("行", "い")],
+                        }
+                    ],
+                }
+            ],
+        )
+
+    def test_zh_tradsem(self):
+        self.wxr.wtp.add_page("Template:-zh-", 10, "Cinese")
+        data = parse_page(
+            self.wxr,
+            "可能",
+            """== {{-zh-}} ==
+===Aggettivo===
+# probabile
+#* {{zh-tradsem|[[一]] [[個]]'''可能'''[[的]] [[事件]]|[[一]] [[个]]'''可能'''[[的]] [[事件]]}} ''yī ge '''kěnéng''' de shìjiàn'' - un [[evento]] [[possibile]]""",
+        )
+        self.assertEqual(
+            data[0]["senses"],
+            [
+                {
+                    "glosses": ["probabile"],
+                    "examples": [
+                        {
+                            "text": "一個可能的事件",
+                            "roman": "yī ge kěnéng de shìjiàn",
+                            "translation": "un evento possibile",
+                            "tags": ["Traditional Chinese"],
+                        },
+                        {
+                            "text": "一个可能的事件",
+                            "roman": "yī ge kěnéng de shìjiàn",
+                            "translation": "un evento possibile",
+                            "tags": ["Simplified Chinese"],
+                        },
+                    ],
+                }
+            ],
+        )
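
No test for the new translation code is rendered on this page (only seven of the eight changed files are shown). As a rough, hypothetical sketch (not the commit's actual test), a case in the same style as tests/test_it_example.py could exercise it end to end; the template pages added here ("Template:-it-", "Template:en") and the setUp pattern are assumptions borrowed from the existing Italian extractor tests.

from unittest import TestCase

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.it.page import parse_page
from wiktextract.wxr_context import WiktextractContext


class TestItTranslationSketch(TestCase):
    maxDiff = None

    def setUp(self) -> None:
        # Assumed setUp pattern, mirroring the existing Italian extractor tests.
        self.wxr = WiktextractContext(
            Wtp(lang_code="it"),
            WiktionaryConfig(
                dump_file_lang_code="it", capture_language_codes=None
            ),
        )

    def tearDown(self) -> None:
        self.wxr.wtp.close_db_conn()

    def test_translation_list(self):
        # Hypothetical wikitext: a POS section followed by a literal
        # "Traduzione" heading with one English translation line.
        self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
        self.wxr.wtp.add_page("Template:en", 10, "inglese")
        data = parse_page(
            self.wxr,
            "cane",
            """== {{-it-}} ==
===Sostantivo===
# animale domestico

===Traduzione===
* {{en}}: [[dog]] ''m''""",
        )
        tr = data[0]["translations"][0]
        self.assertEqual(tr["word"], "dog")
        self.assertEqual(tr["lang"], "inglese")
        self.assertEqual(tr["lang_code"], "en")
        self.assertEqual(tr["tags"], ["masculine"])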