Merge pull request #946 from xxyzz/it
[it] extract forms line, etymology, pronunciation sections
xxyzz authored Dec 13, 2024
2 parents 0ddf970 + e175500 commit 8a39820
Showing 11 changed files with 394 additions and 3 deletions.
5 changes: 5 additions & 0 deletions src/wiktextract/data/overrides/it.json
@@ -3,5 +3,10 @@
"body": "===Traduzione===\n",
"namespace_id": 10,
"need_pre_expand": true
},
"Template:-ref-": {
"body": "===Note / Riferimenti===\n",
"namespace_id": 10,
"need_pre_expand": true
}
}
47 changes: 47 additions & 0 deletions src/wiktextract/extractor/it/etymology.py
@@ -0,0 +1,47 @@
from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Example, WordEntry


def extract_etymology_section(
wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
) -> None:
etymology_texts = []
for list_node in level_node.find_child(NodeKind.LIST):
for list_item in list_node.find_child(NodeKind.LIST_ITEM):
e_str = clean_node(wxr, None, list_item.children)
if e_str != "":
etymology_texts.append(e_str)

if len(etymology_texts) == 0:
e_str = clean_node(
wxr, None, list(level_node.invert_find_child(LEVEL_KIND_FLAGS))
)
if e_str != "":
etymology_texts.append(e_str)

for data in page_data:
if data.lang_code == page_data[-1].lang_code:
data.etymology_texts.extend(etymology_texts)


def extract_citation_section(
wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
) -> None:
examples = []
for t_node in level_node.find_child(NodeKind.TEMPLATE):
if t_node.template_name.lower() == "quote":
example = Example()
example.text = clean_node(
wxr, None, t_node.template_parameters.get(1, "")
)
example.ref = clean_node(
wxr, None, t_node.template_parameters.get(2, "")
)
if example.text != "":
examples.append(example)
for data in page_data:
if data.lang_code == page_data[-1].lang_code:
data.etymology_examples.extend(examples)
24 changes: 24 additions & 0 deletions src/wiktextract/extractor/it/inflection.py
@@ -0,0 +1,24 @@
from wikitextprocessor import TemplateNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Form, WordEntry


def extract_tabs_template(
wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
) -> None:
# https://it.wiktionary.org/wiki/Template:Tabs
tags = [
["masculine", "singular"],
["masculine", "plural"],
["feminine", "singular"],
["feminine", "plural"],
]
for arg_name in range(1, 5):
arg_value = clean_node(
wxr, None, node.template_parameters.get(arg_name, "")
)
if arg_value not in ["", wxr.wtp.title]:
form = Form(form=arg_value, tags=tags[arg_name - 1])
word_entry.forms.append(form)
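
Not part of the commit: a hedged, self-contained sketch of how the new Tabs handling could be exercised end to end, modeled on the setup in tests/test_it_etymology.py further down. The page title "gatto", the wikitext and the expected forms are the editor's assumptions and may need adjustment.

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.it.page import parse_page
from wiktextract.wxr_context import WiktextractContext

# Hypothetical usage sketch, not from this commit.  Mirrors the setup of
# tests/test_it_etymology.py.
wxr = WiktextractContext(
    Wtp(lang_code="it"),
    WiktionaryConfig(dump_file_lang_code="it", capture_language_codes=None),
)
wxr.wtp.add_page("Template:-it-", 10, "Italiano")
data = parse_page(
    wxr,
    "gatto",
    """== {{-it-}} ==
===Sostantivo===
{{Tabs|gatto|gatti|gatta|gatte}}
# [[felino]] domestico""",
)
# "gatto" equals wxr.wtp.title and is skipped by extract_tabs_template();
# the remaining arguments should come out roughly as:
#   {"form": "gatti", "tags": ["masculine", "plural"]}
#   {"form": "gatta", "tags": ["feminine", "singular"]}
#   {"form": "gatte", "tags": ["feminine", "plural"]}
print(data[0]["forms"])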
24 changes: 24 additions & 0 deletions src/wiktextract/extractor/it/models.py
@@ -43,6 +43,25 @@ class Translation(ItalianBaseModel):
roman: str = ""


class Form(ItalianBaseModel):
form: str = ""
tags: list[str] = []
raw_tags: list[str] = []


class Sound(ItalianBaseModel):
ipa: str = Field(default="", description="International Phonetic Alphabet")
audio: str = Field(default="", description="Audio file name")
wav_url: str = ""
oga_url: str = ""
ogg_url: str = ""
mp3_url: str = ""
opus_url: str = ""
flac_url: str = ""
tags: list[str] = []
raw_tags: list[str] = []


class WordEntry(ItalianBaseModel):
model_config = ConfigDict(title="Italian Wiktionary")
word: str = Field(description="Word string", min_length=1)
@@ -55,3 +74,8 @@ class WordEntry(ItalianBaseModel):
tags: list[str] = []
raw_tags: list[str] = []
translations: list[Translation] = []
forms: list[Form] = []
etymology_texts: list[str] = []
etymology_examples: list[Example] = []
hyphenation: str = ""
sounds: list[Sound] = []
12 changes: 12 additions & 0 deletions src/wiktextract/extractor/it/page.py
@@ -4,9 +4,11 @@

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .etymology import extract_citation_section, extract_etymology_section
from .models import Sense, WordEntry
from .pos import extract_pos_section
from .section_titles import POS_DATA
from .sound import extract_hyphenation_section, extract_pronunciation_section
from .translation import extract_translation_section


@@ -21,6 +23,14 @@ def parse_section(
extract_pos_section(wxr, page_data, base_data, level_node, title_text)
elif title_text == "Traduzione":
extract_translation_section(wxr, page_data, level_node)
elif title_text == "Etimologia / Derivazione":
extract_etymology_section(wxr, page_data, level_node)
elif title_text == "Citazione":
extract_citation_section(wxr, page_data, level_node)
elif title_text == "Sillabazione":
extract_hyphenation_section(wxr, page_data, level_node)
elif title_text == "Pronuncia":
extract_pronunciation_section(wxr, page_data, level_node)

for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
parse_section(wxr, page_data, base_data, next_level)
@@ -37,6 +47,8 @@ def parse_page(
for level2_node in tree.find_child(NodeKind.LEVEL2):
lang_cats = {}
lang_name = clean_node(wxr, lang_cats, level2_node.largs)
if lang_name in ["Altri progetti", "Note / Riferimenti"]:
continue
lang_code = "unknown"
for lang_template in level2_node.find_content(NodeKind.TEMPLATE):
lang_code = lang_template.template_name.strip("-")
19 changes: 16 additions & 3 deletions src/wiktextract/extractor/it/pos.py
@@ -5,6 +5,7 @@
from .example import extract_example_list_item
from .models import Sense, WordEntry
from .section_titles import POS_DATA
from .tag_form_line import extract_tag_form_line_nodes


def extract_pos_section(
@@ -22,10 +23,22 @@ def extract_pos_section(
for link_node in level_node.find_child(NodeKind.LINK):
clean_node(wxr, page_data[-1], link_node)

for list_node in level_node.find_child(NodeKind.LIST):
if list_node.sarg.startswith("#") and list_node.sarg.endswith("#"):
for list_item in list_node.find_child(NodeKind.LIST_ITEM):
first_gloss_list_index = len(level_node.children)
for index, node in enumerate(level_node.children):
if (
isinstance(node, WikiNode)
and node.kind == NodeKind.LIST
and node.sarg.startswith("#")
and node.sarg.endswith("#")
):
for list_item in node.find_child(NodeKind.LIST_ITEM):
extract_gloss_list_item(wxr, page_data[-1], list_item)
if index < first_gloss_list_index:
first_gloss_list_index = index

extract_tag_form_line_nodes(
wxr, page_data[-1], level_node.children[:first_gloss_list_index]
)


def extract_gloss_list_item(
47 changes: 47 additions & 0 deletions src/wiktextract/extractor/it/sound.py
@@ -0,0 +1,47 @@
from wikitextprocessor import LevelNode, NodeKind

from ...page import clean_node
from ...wxr_context import WiktextractContext
from ..share import set_sound_file_url_fields
from .models import Sound, WordEntry


def extract_hyphenation_section(
wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
) -> None:
hyphenation = ""
for list_node in level_node.find_child(NodeKind.LIST):
for list_item in list_node.find_child(NodeKind.LIST_ITEM):
hyphenation = clean_node(wxr, None, list_item.children)
for data in page_data:
if data.lang_code == page_data[-1].lang_code:
data.hyphenation = hyphenation


def extract_pronunciation_section(
wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
) -> None:
sounds = []
for t_node in level_node.find_child(NodeKind.TEMPLATE):
match t_node.template_name.lower():
case "ipa":
ipa = clean_node(
wxr, None, t_node.template_parameters.get(1, "")
)
if ipa != "":
sounds.append(Sound(ipa=ipa))
case "audio":
sound_file = clean_node(
wxr, None, t_node.template_parameters.get(1, "")
)
if sound_file != "":
if len(sounds) > 0:
set_sound_file_url_fields(wxr, sound_file, sounds[-1])
else:
sound = Sound()
set_sound_file_url_fields(wxr, sound_file, sound)
sounds.append(sound)

for data in page_data:
if data.lang_code == page_data[-1].lang_code:
data.sounds.extend(sounds)
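
A similar hedged sketch for the new hyphenation and pronunciation extractors, reusing the wxr context and imports from the sketch above; the wikitext, audio file name and expected values are the editor's assumptions, not part of the commit.

# Hypothetical sketch, not from this commit; reuses "wxr" from the sketch
# above.  Template and file names are illustrative only.
data = parse_page(
    wxr,
    "cane",
    """== {{-it-}} ==
===Sostantivo===
# [[animale]] domestico
===Sillabazione===
; cà | ne
===Pronuncia===
{{IPA|/ˈkane/}}
{{audio|it-cane.ogg}}""",
)
# Expected, roughly: hyphenation "cà | ne", and one Sound with ipa
# "/ˈkane/" to which the audio template's file URLs are then attached.
print(data[0]["hyphenation"], data[0]["sounds"])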
52 changes: 52 additions & 0 deletions src/wiktextract/extractor/it/tag_form_line.py
@@ -0,0 +1,52 @@
from wikitextprocessor import NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .inflection import extract_tabs_template
from .models import Form, WordEntry


def extract_tag_form_line_nodes(
wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode | str]
) -> None:
# https://it.wiktionary.org/wiki/Wikizionario:Manuale_di_stile#Genere_e_numero,_declinazione_o_paradigma
for node in nodes:
if isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
extract_italic_tag_node(wxr, word_entry, node)
elif isinstance(node, TemplateNode):
match node.template_name.lower():
case "tabs":
extract_tabs_template(wxr, word_entry, node)
case "linkp":
form = clean_node(
wxr, None, node.template_parameters.get(1, "")
)
if form != "":
word_entry.forms.append(
Form(form=form, tags=["plural"])
)


ITALIC_TAGS = {
"c": "common",
"coll": "collective",
"f": "feminine",
"m": "masculine",
"n": "neuter",
"pl": "plural",
"sing": "singular",
"prom": "common",
"inv": "invariable",
}


def extract_italic_tag_node(
wxr: WiktextractContext, word_entry: WordEntry, node: WikiNode
) -> None:
# https://it.wiktionary.org/wiki/Wikizionario:Genere
italic_str = clean_node(wxr, None, node)
for raw_tag in italic_str.split():
if raw_tag in ITALIC_TAGS:
word_entry.tags.append(ITALIC_TAGS[raw_tag])
else:
word_entry.raw_tags.append(raw_tag)
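
One more hedged illustration, this time for the italic tag line, again reusing the same setup; the page "euro" and the expected tags are the editor's example, not from the commit.

# Hypothetical sketch, not from this commit; reuses "wxr" from above.
data = parse_page(
    wxr,
    "euro",
    """== {{-it-}} ==
===Sostantivo===
{{Pn}} ''m inv''
# [[moneta]] unica europea""",
)
# "m" and "inv" are both listed in ITALIC_TAGS, so the entry's tags should
# be roughly ["masculine", "invariable"]; unknown tokens would instead be
# kept verbatim in raw_tags.
print(data[0]["tags"])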
62 changes: 62 additions & 0 deletions tests/test_it_etymology.py
@@ -0,0 +1,62 @@
from unittest import TestCase

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.it.page import parse_page
from wiktextract.wxr_context import WiktextractContext


class TestItGloss(TestCase):
maxDiff = None

def setUp(self) -> None:
self.wxr = WiktextractContext(
Wtp(lang_code="it"),
WiktionaryConfig(
dump_file_lang_code="it", capture_language_codes=None
),
)

def test_quote_template(self):
self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
data = parse_page(
self.wxr,
"cane",
"""== {{-it-}} ==
===Sostantivo===
# {{Term|mammalogia|it}} [[animale]]
===Etimologia / Derivazione===
dal latino canis
====Citazione====
{{Quote
|Cane affamato non teme bastone
|[[q:Giovanni Verga|Giovanni Verga]]}}""",
)
self.assertEqual(data[0]["etymology_texts"], ["dal latino canis"])
self.assertEqual(
data[0]["etymology_examples"],
[
{
"text": "Cane affamato non teme bastone",
"ref": "Giovanni Verga",
}
],
)

def test_list(self):
self.wxr.wtp.add_page("Template:-la-", 10, "Latino")
data = parse_page(
self.wxr,
"cane",
"""== {{-it-}} ==
===Sostantivo, forma flessa===
# {{Term|mammalogia|it}} [[animale]]
===Etimologia / Derivazione===
* (sostantivo) vedi [[canis#Latino|canis]]
* (voce verbale) vedi [[cano#Latino|canō]]""",
)
self.assertEqual(
data[0]["etymology_texts"],
["(sostantivo) vedi canis", "(voce verbale) vedi canō"],
)
