Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[it] improve pos and proverb sections code #948

Merged
merged 5 commits into from
Dec 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/wiktextract/data/overrides/it.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,10 @@
"body": "===Note / Riferimenti===\n",
"namespace_id": 10,
"need_pre_expand": true
},
"Template:-verb-": {
"body": "{{Sezione voce|Immagine=Open_book_01.svg|Dimensione=30px|Sezione=verbo|Sezione al plurale=verbi|Genere=m|Lingua={{{1|}}}}}{{#invoke:Categorizzazione verbi italiani|main|{{{1|}}}}}",
"namespace_id": 10,
"need_pre_expand": true
}
}
44 changes: 40 additions & 4 deletions src/wiktextract/extractor/it/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,15 @@ def extract_example_list_item(
text_nodes = []
roman = ""
translation = ""
ref = ""
has_zh_tradsem = False
for index, node in enumerate(list_item.children):
if (
isinstance(node, TemplateNode)
and node.template_name == "zh-tradsem"
):
examples.extend(extract_zh_tradsem(wxr, node))
has_zh_tradsem = True
elif isinstance(node, WikiNode):
match node.kind:
case NodeKind.ITALIC:
Expand All @@ -39,17 +42,38 @@ def extract_example_list_item(
case _ if lang_code in ["zh", "ja"]:
if before_italic:
text_nodes.append(node)
elif (
isinstance(node, str) and lang_code in ["zh", "ja"] and "-" in node
):
elif isinstance(node, str) and "-" in node:
for t_node in list_item.find_child(NodeKind.TEMPLATE):
if t_node.template_name == "Term":
ref = clean_node(wxr, None, t_node).strip("()")
break
translation = clean_node(
wxr,
sense,
wxr.wtp.node_to_wikitext(
[node[node.index("-") + 1 :]]
+ list_item.children[index + 1 :]
+ [
n
for n in list_item.children[index + 1 :]
if not (
isinstance(n, TemplateNode)
and n.template_name == "Term"
)
]
),
)
if not has_zh_tradsem and len(examples) > 1:
examples.clear()
examples.append(
Example(
text=clean_node(
wxr,
None,
list_item.children[:index]
+ [node[: node.index("-")]],
)
)
)
break
elif lang_code in ["zh", "ja"] and len(examples) == 0 and before_italic:
text_nodes.append(node)
Expand All @@ -69,11 +93,23 @@ def extract_example_list_item(
)
examples.append(example)

if not has_zh_tradsem and len(examples) > 1:
examples.clear()
examples.append(
Example(
text=clean_node(
wxr, None, list(list_item.invert_find_child(NodeKind.LIST))
)
)
)

for example in examples:
if roman != "":
example.roman = roman
if translation != "":
example.translation = translation
if ref != "":
example.ref = ref
if example.text != "":
sense.examples.append(example)

Expand Down
27 changes: 25 additions & 2 deletions src/wiktextract/extractor/it/linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,11 @@ def extract_linkage_section(
linkages = []
for list_node in level_node.find_child(NodeKind.LIST):
for list_item in list_node.find_child(NodeKind.LIST_ITEM):
linkages.extend(extract_linkage_list_item(wxr, list_item))
linkages.extend(
extract_proverb_list_item(wxr, list_item)
if linkage_type == "proverbs"
else extract_linkage_list_item(wxr, list_item)
)

for data in page_data:
if data.lang_code == page_data[-1].lang_code:
Expand Down Expand Up @@ -43,8 +47,27 @@ def extract_linkage_list_item(
elif isinstance(node, str):
for word_str in node.split(","):
word_str = word_str.strip()
if word_str != "":
if word_str.startswith("(") and word_str.endswith(")"):
raw_tags.append(word_str.strip("()"))
elif word_str != "":
linkages.append(Linkage(word=word_str, raw_tags=raw_tags))
raw_tags.clear()

return linkages


def extract_proverb_list_item(
    wxr: WiktextractContext, list_item: WikiNode
) -> list[Linkage]:
    """Parse one proverb list item.

    The proverb text is expected in an italic node; an optional gloss
    follows after a ":" in a plain-text child, running to the end of the
    list item. Returns a single-element list, or an empty list when no
    italic proverb text was found.
    """
    linkage = Linkage(word="")
    children = list_item.children
    for idx, child in enumerate(children):
        if isinstance(child, WikiNode) and child.kind == NodeKind.ITALIC:
            linkage.word = clean_node(wxr, None, child)
        elif isinstance(child, str) and ":" in child:
            # Everything after the colon, plus all remaining siblings,
            # forms the sense text.
            after_colon = child[child.index(":") + 1 :]
            linkage.sense = clean_node(
                wxr, None, [after_colon] + children[idx + 1 :]
            )
            break
    if linkage.word == "":
        return []
    return [linkage]
1 change: 1 addition & 0 deletions src/wiktextract/extractor/it/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ class Linkage(ItalianBaseModel):
word: str
tags: list[str] = []
raw_tags: list[str] = []
sense: str = ""


class WordEntry(ItalianBaseModel):
Expand Down
49 changes: 42 additions & 7 deletions src/wiktextract/extractor/it/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,24 @@
from .section_titles import POS_DATA
from .tag_form_line import extract_tag_form_line_nodes

# https://it.wiktionary.org/wiki/Categoria:Template_per_i_verbi
# Templates that introduce a subsection inside a POS section (mostly verb
# form/voice headers on it.wiktionary). When one of these is found in a POS
# section, a new word entry is started (if senses were already collected)
# and the expanded template text is recorded as a raw tag.
POS_SUBSECTION_TEMPLATES = frozenset(
    [
        "-participio passato-",
        "-participio presente-",
        "Ausiliare",
        "Deponente",
        "Intransitivo",
        "Medio",
        "Passivo",
        "Reciproco",
        "Riflessivo",
        "Transitivo",
    ]
)

def extract_pos_section(

def add_new_pos_data(
wxr: WiktextractContext,
page_data: list[WordEntry],
base_data: WordEntry,
Expand All @@ -23,6 +39,15 @@ def extract_pos_section(
for link_node in level_node.find_child(NodeKind.LINK):
clean_node(wxr, page_data[-1], link_node)


def extract_pos_section(
wxr: WiktextractContext,
page_data: list[WordEntry],
base_data: WordEntry,
level_node: LevelNode,
pos_title: str,
) -> None:
add_new_pos_data(wxr, page_data, base_data, level_node, pos_title)
first_gloss_list_index = len(level_node.children)
for index, node in enumerate(level_node.children):
if (
Expand All @@ -35,6 +60,16 @@ def extract_pos_section(
extract_gloss_list_item(wxr, page_data[-1], list_item)
if index < first_gloss_list_index:
first_gloss_list_index = index
elif (
isinstance(node, TemplateNode)
and node.template_name in POS_SUBSECTION_TEMPLATES
):
if len(page_data[-1].senses) > 0:
add_new_pos_data(
wxr, page_data, base_data, level_node, pos_title
)
raw_tag = clean_node(wxr, page_data[-1], node).strip("= \n")
page_data[-1].raw_tags.append(raw_tag)

extract_tag_form_line_nodes(
wxr, page_data[-1], level_node.children[:first_gloss_list_index]
Expand All @@ -56,12 +91,7 @@ def extract_gloss_list_item(
else:
gloss_nodes.append(t_str)
elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
if node.sarg.endswith("*"):
for example_list_item in node.find_child(NodeKind.LIST_ITEM):
extract_example_list_item(
wxr, sense, example_list_item, word_entry.lang_code
)
elif (
if (
node.sarg.endswith(":")
and len(sense.examples) > 0
and sense.examples[-1].translation == ""
Expand All @@ -70,6 +100,11 @@ def extract_gloss_list_item(
sense.examples[-1].translation = clean_node(
wxr, sense, tr_list_item.children
)
elif node.sarg.endswith(("*", ":")):
for example_list_item in node.find_child(NodeKind.LIST_ITEM):
extract_example_list_item(
wxr, sense, example_list_item, word_entry.lang_code
)
else:
gloss_nodes.append(node)
gloss_str = clean_node(wxr, sense, gloss_nodes)
Expand Down
1 change: 1 addition & 0 deletions src/wiktextract/extractor/it/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ def extract_translation_section(
page_data: list[WordEntry],
level_node: LevelNode,
) -> None:
# https://it.wiktionary.org/wiki/Aiuto:Traduzioni
sense = ""
translations = []
cats = {}
Expand Down
2 changes: 1 addition & 1 deletion tests/test_it_etymology.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from wiktextract.wxr_context import WiktextractContext


class TestItGloss(TestCase):
class TestItEtymology(TestCase):
maxDiff = None

def setUp(self) -> None:
Expand Down
76 changes: 76 additions & 0 deletions tests/test_it_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,3 +139,79 @@ def test_zh_tradsem(self):
}
],
)

def test_double_italic_nodes_with_translation(self):
    """Example line with two italic nodes separated by " - ": the first
    italic is the example text, the second its Italian translation."""
    self.wxr.wtp.add_page("Template:-en-", 10, "Inglese")
    data = parse_page(
        self.wxr,
        "water",
        """== {{-en-}} ==
===Sostantivo===
# acqua
#: ''May I have a glass of '''water'''?'' - ''Posso avere un bicchiere d''''acqua'''''?""",
    )
    self.assertEqual(
        data[0]["senses"],
        [
            {
                "glosses": ["acqua"],
                "examples": [
                    {
                        "text": "May I have a glass of water?",
                        "translation": "Posso avere un bicchiere d'acqua?",
                    }
                ],
            }
        ],
    )

def test_double_italic_nodes_no_translation(self):
    """Two italic nodes joined by ";" with no translation: both italics
    are merged into a single example text."""
    self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
    data = parse_page(
        self.wxr,
        "essere",
        """== {{-it-}} ==
===Sostantivo===
#chi [[esiste]]
#* ''gli '''esseri''' viventi''; ''gli '''esseri''' animati''""",
    )
    self.assertEqual(
        data[0]["senses"],
        [
            {
                "glosses": ["chi esiste"],
                "examples": [
                    {"text": "gli esseri viventi; gli esseri animati"}
                ],
            }
        ],
    )

def test_term_ref_template(self):
    """A trailing "Term" template on the example line becomes the
    example's ``ref`` field (parentheses stripped), and is excluded
    from the translation text."""
    self.wxr.wtp.add_page("Template:-la-", 10, "Latino")
    self.wxr.wtp.add_page("Template:Term", 10, "({{{1}}})")
    data = parse_page(
        self.wxr,
        "libero",
        """== {{-la-}} ==
===Verbo===
# [[assolvere]], [[liberare]] dalle [[accuse]], [[giudicare]] [[innocente]]
#* ''et eum omni [[ignominia]] '''liberat''''' - e lo [[assolve]] da ogni [[ignominia]] {{Term|[[:w:Marco Tullio Cicerone|Cicerone]], [[:w:Pro Cluentio|Pro Cluentio]], [[:s:la:Pro_Aulo_Cluentio_Habito|XLVII, 132]]}}""",
    )
    self.assertEqual(
        data[0]["senses"],
        [
            {
                "glosses": [
                    "assolvere, liberare dalle accuse, giudicare innocente"
                ],
                "examples": [
                    {
                        "text": "et eum omni ignominia liberat",
                        "translation": "e lo assolve da ogni ignominia",
                        "ref": "Cicerone, Pro Cluentio, XLVII, 132",
                    }
                ],
            }
        ],
    )
Loading
Loading