Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[it] extract forms line, etymology, pronunciation sections #946

Merged
merged 5 commits into from
Dec 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/wiktextract/data/overrides/it.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,10 @@
"body": "===Traduzione===\n",
"namespace_id": 10,
"need_pre_expand": true
},
"Template:-ref-": {
"body": "===Note / Riferimenti===\n",
"namespace_id": 10,
"need_pre_expand": true
}
}
47 changes: 47 additions & 0 deletions src/wiktextract/extractor/it/etymology.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Example, WordEntry


def extract_etymology_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
) -> None:
    """Collect etymology text from a section and store it on the entries.

    Each non-empty list item directly under ``level_node`` becomes one
    etymology text.  When no list item yields text, everything in the
    section except nested sub-sections is cleaned as a single text instead.
    The result is appended to ``etymology_texts`` of every entry in
    ``page_data`` that has the same ``lang_code`` as the last entry.
    """
    texts = []
    list_items = (
        item
        for list_node in level_node.find_child(NodeKind.LIST)
        for item in list_node.find_child(NodeKind.LIST_ITEM)
    )
    for item in list_items:
        cleaned = clean_node(wxr, None, item.children)
        if cleaned:
            texts.append(cleaned)

    if not texts:
        # No usable list: fall back to the section body, leaving out nested
        # level (sub-section) nodes.
        body_nodes = list(level_node.invert_find_child(LEVEL_KIND_FLAGS))
        whole_text = clean_node(wxr, None, body_nodes)
        if whole_text:
            texts.append(whole_text)

    if not page_data:
        return
    current_lang = page_data[-1].lang_code
    for entry in page_data:
        if entry.lang_code == current_lang:
            entry.etymology_texts.extend(texts)


def extract_citation_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
) -> None:
    """Turn ``quote`` templates of a citation section into examples.

    For every direct child template named "quote" (case-insensitive), the
    first positional argument is used as the example text and the second
    as its reference.  Examples with empty text are dropped.  The examples
    are appended to ``etymology_examples`` of every entry in ``page_data``
    sharing the ``lang_code`` of the last entry.
    """
    quotes = []
    for template in level_node.find_child(NodeKind.TEMPLATE):
        if template.template_name.lower() != "quote":
            continue
        params = template.template_parameters
        quote = Example()
        quote.text = clean_node(wxr, None, params.get(1, ""))
        quote.ref = clean_node(wxr, None, params.get(2, ""))
        if quote.text != "":
            quotes.append(quote)

    if not page_data:
        return
    current_lang = page_data[-1].lang_code
    for entry in page_data:
        if entry.lang_code == current_lang:
            entry.etymology_examples.extend(quotes)
24 changes: 24 additions & 0 deletions src/wiktextract/extractor/it/inflection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from wikitextprocessor import TemplateNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Form, WordEntry


def extract_tabs_template(
    wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
) -> None:
    """Add the forms listed in a ``Tabs`` template to ``word_entry``.

    Positional arguments 1-4 are read in order and tagged as masculine
    singular, masculine plural, feminine singular and feminine plural.
    Arguments that are empty or equal to the page title are skipped.
    """
    # https://it.wiktionary.org/wiki/Template:Tabs
    slot_tags = [
        ["masculine", "singular"],
        ["masculine", "plural"],
        ["feminine", "singular"],
        ["feminine", "plural"],
    ]
    for position, form_tags in enumerate(slot_tags, start=1):
        form_text = clean_node(
            wxr, None, node.template_parameters.get(position, "")
        )
        if form_text == "" or form_text == wxr.wtp.title:
            continue
        word_entry.forms.append(Form(form=form_text, tags=form_tags))
24 changes: 24 additions & 0 deletions src/wiktextract/extractor/it/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,25 @@ class Translation(ItalianBaseModel):
roman: str = ""


class Form(ItalianBaseModel):
    # One additional word form attached to a WordEntry through its
    # ``forms`` list.
    # Text of the form itself.
    form: str = ""
    # Tags describing the form, e.g. ["masculine", "plural"] as produced
    # for the "Tabs" template, or ["plural"] for "linkp".
    tags: list[str] = []
    # NOTE(review): presumably holds tag strings that could not be mapped
    # to a known tag; nothing visible here fills it for forms — confirm.
    raw_tags: list[str] = []


class Sound(ItalianBaseModel):
    # One pronunciation record attached to a WordEntry through its
    # ``sounds`` list.
    ipa: str = Field(default="", description="International Phonetic Alphabet")
    audio: str = Field(default="", description="Audio file name")
    # NOTE(review): the *_url fields below are presumably filled by the
    # shared set_sound_file_url_fields() helper depending on the audio
    # file's format; that helper is not visible here — confirm.
    wav_url: str = ""
    oga_url: str = ""
    ogg_url: str = ""
    mp3_url: str = ""
    opus_url: str = ""
    flac_url: str = ""
    tags: list[str] = []
    raw_tags: list[str] = []


class WordEntry(ItalianBaseModel):
model_config = ConfigDict(title="Italian Wiktionary")
word: str = Field(description="Word string", min_length=1)
Expand All @@ -55,3 +74,8 @@ class WordEntry(ItalianBaseModel):
tags: list[str] = []
raw_tags: list[str] = []
translations: list[Translation] = []
forms: list[Form] = []
etymology_texts: list[str] = []
etymology_examples: list[Example] = []
hyphenation: str = ""
sounds: list[Sound] = []
12 changes: 12 additions & 0 deletions src/wiktextract/extractor/it/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .etymology import extract_citation_section, extract_etymology_section
from .models import Sense, WordEntry
from .pos import extract_pos_section
from .section_titles import POS_DATA
from .sound import extract_hyphenation_section, extract_pronunciation_section
from .translation import extract_translation_section


Expand All @@ -21,6 +23,14 @@ def parse_section(
extract_pos_section(wxr, page_data, base_data, level_node, title_text)
elif title_text == "Traduzione":
extract_translation_section(wxr, page_data, level_node)
elif title_text == "Etimologia / Derivazione":
extract_etymology_section(wxr, page_data, level_node)
elif title_text == "Citazione":
extract_citation_section(wxr, page_data, level_node)
elif title_text == "Sillabazione":
extract_hyphenation_section(wxr, page_data, level_node)
elif title_text == "Pronuncia":
extract_pronunciation_section(wxr, page_data, level_node)

for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
parse_section(wxr, page_data, base_data, next_level)
Expand All @@ -37,6 +47,8 @@ def parse_page(
for level2_node in tree.find_child(NodeKind.LEVEL2):
lang_cats = {}
lang_name = clean_node(wxr, lang_cats, level2_node.largs)
if lang_name in ["Altri progetti", "Note / Riferimenti"]:
continue
lang_code = "unknown"
for lang_template in level2_node.find_content(NodeKind.TEMPLATE):
lang_code = lang_template.template_name.strip("-")
Expand Down
19 changes: 16 additions & 3 deletions src/wiktextract/extractor/it/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from .example import extract_example_list_item
from .models import Sense, WordEntry
from .section_titles import POS_DATA
from .tag_form_line import extract_tag_form_line_nodes


def extract_pos_section(
Expand All @@ -22,10 +23,22 @@ def extract_pos_section(
for link_node in level_node.find_child(NodeKind.LINK):
clean_node(wxr, page_data[-1], link_node)

for list_node in level_node.find_child(NodeKind.LIST):
if list_node.sarg.startswith("#") and list_node.sarg.endswith("#"):
for list_item in list_node.find_child(NodeKind.LIST_ITEM):
first_gloss_list_index = len(level_node.children)
for index, node in enumerate(level_node.children):
if (
isinstance(node, WikiNode)
and node.kind == NodeKind.LIST
and node.sarg.startswith("#")
and node.sarg.endswith("#")
):
for list_item in node.find_child(NodeKind.LIST_ITEM):
extract_gloss_list_item(wxr, page_data[-1], list_item)
if index < first_gloss_list_index:
first_gloss_list_index = index

extract_tag_form_line_nodes(
wxr, page_data[-1], level_node.children[:first_gloss_list_index]
)


def extract_gloss_list_item(
Expand Down
47 changes: 47 additions & 0 deletions src/wiktextract/extractor/it/sound.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from wikitextprocessor import LevelNode, NodeKind

from ...page import clean_node
from ...wxr_context import WiktextractContext
from ..share import set_sound_file_url_fields
from .models import Sound, WordEntry


def extract_hyphenation_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
) -> None:
    """Store the hyphenation text of a section on the matching entries.

    The cleaned text of the last list item found directly under
    ``level_node`` is used (an empty string when there is none) and is
    assigned to ``hyphenation`` of every entry in ``page_data`` sharing
    the ``lang_code`` of the last entry.
    """
    last_text = ""
    list_items = (
        item
        for list_node in level_node.find_child(NodeKind.LIST)
        for item in list_node.find_child(NodeKind.LIST_ITEM)
    )
    for item in list_items:
        # Later items replace earlier ones; only the final value is kept.
        last_text = clean_node(wxr, None, item.children)

    if not page_data:
        return
    current_lang = page_data[-1].lang_code
    for entry in page_data:
        if entry.lang_code == current_lang:
            entry.hyphenation = last_text


def extract_pronunciation_section(
wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
) -> None:
sounds = []
for t_node in level_node.find_child(NodeKind.TEMPLATE):
match t_node.template_name.lower():
case "ipa":
ipa = clean_node(
wxr, None, t_node.template_parameters.get(1, "")
)
if ipa != "":
sounds.append(Sound(ipa=ipa))
case "audio":
sound_file = clean_node(
wxr, None, t_node.template_parameters.get(1, "")
)
if sound_file != "":
if len(sounds) > 0:
set_sound_file_url_fields(wxr, sound_file, sounds[-1])
else:
sound = Sound()
set_sound_file_url_fields(wxr, sound_file, sound)
sounds.append(sound)

for data in page_data:
if data.lang_code == page_data[-1].lang_code:
data.sounds.extend(sounds)
52 changes: 52 additions & 0 deletions src/wiktextract/extractor/it/tag_form_line.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
from wikitextprocessor import NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .inflection import extract_tabs_template
from .models import Form, WordEntry


def extract_tag_form_line_nodes(
    wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode | str]
) -> None:
    """Extract tags and forms from the nodes of a form line.

    Italic nodes are handed to ``extract_italic_tag_node``.  Among
    templates, "tabs" is handed to ``extract_tabs_template`` and "linkp"
    contributes its first argument as a plural form (template names are
    matched case-insensitively).  Every other node is ignored.
    """
    # https://it.wiktionary.org/wiki/Wikizionario:Manuale_di_stile#Genere_e_numero,_declinazione_o_paradigma
    for node in nodes:
        if isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
            extract_italic_tag_node(wxr, word_entry, node)
            continue
        if not isinstance(node, TemplateNode):
            continue

        template_name = node.template_name.lower()
        if template_name == "tabs":
            extract_tabs_template(wxr, word_entry, node)
        elif template_name == "linkp":
            plural = clean_node(
                wxr, None, node.template_parameters.get(1, "")
            )
            if plural != "":
                word_entry.forms.append(Form(form=plural, tags=["plural"]))


# Maps the abbreviations that may appear as whitespace-separated words in
# the italic text of a form line to normalized tag names.  Words missing
# from this table are kept as raw tags by extract_italic_tag_node().
# NOTE(review): both "c" and "prom" map to "common" — confirm that this is
# the intended tag for "prom".
ITALIC_TAGS = {
    "c": "common",
    "coll": "collective",
    "f": "feminine",
    "m": "masculine",
    "n": "neuter",
    "pl": "plural",
    "sing": "singular",
    "prom": "common",
    "inv": "invariable",
}


def extract_italic_tag_node(
    wxr: WiktextractContext, word_entry: WordEntry, node: WikiNode
) -> None:
    """Convert the words of an italic node into tags on ``word_entry``.

    The cleaned text is split on whitespace.  Words found in
    ``ITALIC_TAGS`` are appended to ``word_entry.tags`` in their mapped
    form; all other words are appended unchanged to
    ``word_entry.raw_tags``.
    """
    # https://it.wiktionary.org/wiki/Wikizionario:Genere
    text = clean_node(wxr, None, node)
    for word in text.split():
        mapped = ITALIC_TAGS.get(word)
        if mapped is None:
            word_entry.raw_tags.append(word)
        else:
            word_entry.tags.append(mapped)
62 changes: 62 additions & 0 deletions tests/test_it_etymology.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
from unittest import TestCase

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.it.page import parse_page
from wiktextract.wxr_context import WiktextractContext


class TestItGloss(TestCase):
    # Tests for etymology and citation extraction of the Italian extractor.
    # NOTE(review): the class is named TestItGloss although both tests check
    # etymology output — possibly a copy-paste leftover; confirm.

    # Do not truncate diffs in assertion failure messages.
    maxDiff = None

    def setUp(self) -> None:
        # Create an extraction context configured for the Italian edition.
        self.wxr = WiktextractContext(
            Wtp(lang_code="it"),
            WiktionaryConfig(
                dump_file_lang_code="it", capture_language_codes=None
            ),
        )

    def test_quote_template(self):
        # Plain-text etymology followed by a nested "Citazione" sub-section
        # holding a {{Quote}} template: the text goes to "etymology_texts"
        # and the quote (text + reference) to "etymology_examples".
        self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
        data = parse_page(
            self.wxr,
            "cane",
            """== {{-it-}} ==
===Sostantivo===
# {{Term|mammalogia|it}} [[animale]]
===Etimologia / Derivazione===
dal latino canis
====Citazione====
{{Quote
|Cane affamato non teme bastone
|[[q:Giovanni Verga|Giovanni Verga]]}}""",
        )
        self.assertEqual(data[0]["etymology_texts"], ["dal latino canis"])
        self.assertEqual(
            data[0]["etymology_examples"],
            [
                {
                    "text": "Cane affamato non teme bastone",
                    "ref": "Giovanni Verga",
                }
            ],
        )

    def test_list(self):
        # Etymology written as a bullet list: every list item becomes its
        # own entry in "etymology_texts", with link markup reduced to text.
        # NOTE(review): this registers Template:-la- while the page text
        # below uses {{-it-}} — confirm which template was intended.
        self.wxr.wtp.add_page("Template:-la-", 10, "Latino")
        data = parse_page(
            self.wxr,
            "cane",
            """== {{-it-}} ==
===Sostantivo, forma flessa===
# {{Term|mammalogia|it}} [[animale]]
===Etimologia / Derivazione===
* (sostantivo) vedi [[canis#Latino|canis]]
* (voce verbale) vedi [[cano#Latino|canō]]""",
        )
        self.assertEqual(
            data[0]["etymology_texts"],
            ["(sostantivo) vedi canis", "(voce verbale) vedi canō"],
        )
Loading
Loading