diff --git a/src/wiktextract/data/overrides/it.json b/src/wiktextract/data/overrides/it.json
index a0076d36..e02978bd 100644
--- a/src/wiktextract/data/overrides/it.json
+++ b/src/wiktextract/data/overrides/it.json
@@ -3,5 +3,10 @@
     "body": "===Traduzione===\n",
     "namespace_id": 10,
     "need_pre_expand": true
+  },
+  "Template:-ref-": {
+    "body": "===Note / Riferimenti===\n",
+    "namespace_id": 10,
+    "need_pre_expand": true
   }
 }
diff --git a/src/wiktextract/extractor/it/etymology.py b/src/wiktextract/extractor/it/etymology.py
new file mode 100644
index 00000000..8092e7af
--- /dev/null
+++ b/src/wiktextract/extractor/it/etymology.py
@@ -0,0 +1,57 @@
+from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
+from .models import Example, WordEntry
+
+
+def extract_etymology_section(
+    wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
+) -> None:
+    """Add the etymology texts of a section to entries of the same language.
+
+    Each list item is one text; a section without list items is cleaned as
+    a whole, leaving out its child sections.
+    """
+    etymology_texts = []
+    for list_node in level_node.find_child(NodeKind.LIST):
+        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+            e_str = clean_node(wxr, None, list_item.children)
+            if e_str != "":
+                etymology_texts.append(e_str)
+
+    if len(etymology_texts) == 0:
+        e_str = clean_node(
+            wxr, None, list(level_node.invert_find_child(LEVEL_KIND_FLAGS))
+        )
+        if e_str != "":
+            etymology_texts.append(e_str)
+
+    for data in page_data:
+        if data.lang_code == page_data[-1].lang_code:
+            data.etymology_texts.extend(etymology_texts)
+
+
+def extract_citation_section(
+    wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
+) -> None:
+    """Store the `Quote` templates of a "Citazione" section as examples.
+
+    Parameter 1 is the quoted text and parameter 2 its reference; a quote
+    without text is dropped.
+    """
+    examples = []
+    for t_node in level_node.find_child(NodeKind.TEMPLATE):
+        if t_node.template_name.lower() == "quote":
+            example = Example()
+            example.text = clean_node(
+                wxr, None, t_node.template_parameters.get(1, "")
+            )
+            example.ref = clean_node(
+                wxr, None, t_node.template_parameters.get(2, "")
+            )
+            if example.text != "":
+                examples.append(example)
+    for data in page_data:
+        if data.lang_code == page_data[-1].lang_code:
+            data.etymology_examples.extend(examples)
diff --git a/src/wiktextract/extractor/it/inflection.py b/src/wiktextract/extractor/it/inflection.py
new file mode 100644
index 00000000..1f9a1eed
--- /dev/null
+++ b/src/wiktextract/extractor/it/inflection.py
@@ -0,0 +1,30 @@
+from wikitextprocessor import TemplateNode
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
+from .models import Form, WordEntry
+
+
+def extract_tabs_template(
+    wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
+) -> None:
+    """Add the gender/number forms listed by a `Tabs` template.
+
+    Parameters 1-4 are masculine singular, masculine plural, feminine
+    singular and feminine plural; empty values and the page title itself
+    are skipped.
+    """
+    # https://it.wiktionary.org/wiki/Template:Tabs
+    tags = [
+        ["masculine", "singular"],
+        ["masculine", "plural"],
+        ["feminine", "singular"],
+        ["feminine", "plural"],
+    ]
+    for arg_name in range(1, 5):
+        arg_value = clean_node(
+            wxr, None, node.template_parameters.get(arg_name, "")
+        )
+        if arg_value not in ["", wxr.wtp.title]:
+            form = Form(form=arg_value, tags=tags[arg_name - 1])
+            word_entry.forms.append(form)
diff --git a/src/wiktextract/extractor/it/models.py b/src/wiktextract/extractor/it/models.py
index b3e99345..1e42a0f5 100644
--- a/src/wiktextract/extractor/it/models.py
+++ b/src/wiktextract/extractor/it/models.py
@@ -43,6 +43,25 @@ class Translation(ItalianBaseModel):
     roman: str = ""
 
 
+class Form(ItalianBaseModel):
+    form: str = ""
+    tags: list[str] = []
+    raw_tags: list[str] = []
+
+
+class Sound(ItalianBaseModel):
+    ipa: str = Field(default="", description="International Phonetic Alphabet")
+    audio: str = Field(default="", description="Audio file name")
+    wav_url: str = ""
+    oga_url: str = ""
+    ogg_url: str = ""
+    mp3_url: str = ""
+    opus_url: str = ""
+    flac_url: str = ""
+    tags: list[str] = []
+    raw_tags: list[str] = []
+
+
 class WordEntry(ItalianBaseModel):
     model_config = ConfigDict(title="Italian Wiktionary")
     word: str = Field(description="Word string", min_length=1)
@@ -55,3 +74,8 @@ class WordEntry(ItalianBaseModel):
     tags: list[str] = []
     raw_tags: list[str] = []
     translations: list[Translation] = []
+    forms: list[Form] = []
+    etymology_texts: list[str] = []
+    etymology_examples: list[Example] = []
+    hyphenation: str = ""
+    sounds: list[Sound] = []
diff --git a/src/wiktextract/extractor/it/page.py b/src/wiktextract/extractor/it/page.py
index 46b2b224..7817a40b 100644
--- a/src/wiktextract/extractor/it/page.py
+++ b/src/wiktextract/extractor/it/page.py
@@ -4,9 +4,11 @@
 
 from ...page import clean_node
 from ...wxr_context import WiktextractContext
+from .etymology import extract_citation_section, extract_etymology_section
 from .models import Sense, WordEntry
 from .pos import extract_pos_section
 from .section_titles import POS_DATA
+from .sound import extract_hyphenation_section, extract_pronunciation_section
 from .translation import extract_translation_section
 
 
@@ -21,6 +23,14 @@ def parse_section(
         extract_pos_section(wxr, page_data, base_data, level_node, title_text)
     elif title_text == "Traduzione":
         extract_translation_section(wxr, page_data, level_node)
+    elif title_text == "Etimologia / Derivazione":
+        extract_etymology_section(wxr, page_data, level_node)
+    elif title_text == "Citazione":
+        extract_citation_section(wxr, page_data, level_node)
+    elif title_text == "Sillabazione":
+        extract_hyphenation_section(wxr, page_data, level_node)
+    elif title_text == "Pronuncia":
+        extract_pronunciation_section(wxr, page_data, level_node)
 
     for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
         parse_section(wxr, page_data, base_data, next_level)
@@ -37,6 +47,8 @@ def parse_page(
     for level2_node in tree.find_child(NodeKind.LEVEL2):
         lang_cats = {}
         lang_name = clean_node(wxr, lang_cats, level2_node.largs)
+        if lang_name in ["Altri progetti", "Note / Riferimenti"]:
+            continue
         lang_code = "unknown"
         for lang_template in level2_node.find_content(NodeKind.TEMPLATE):
             lang_code = lang_template.template_name.strip("-")
diff --git a/src/wiktextract/extractor/it/pos.py b/src/wiktextract/extractor/it/pos.py
index 91f8a9bc..f50a3605 100644
--- a/src/wiktextract/extractor/it/pos.py
+++ b/src/wiktextract/extractor/it/pos.py
@@ -5,6 +5,7 @@
 from .example import extract_example_list_item
 from .models import Sense, WordEntry
 from .section_titles import POS_DATA
+from .tag_form_line import extract_tag_form_line_nodes
 
 
 def extract_pos_section(
@@ -22,10 +23,22 @@ def extract_pos_section(
     for link_node in level_node.find_child(NodeKind.LINK):
         clean_node(wxr, page_data[-1], link_node)
 
-    for list_node in level_node.find_child(NodeKind.LIST):
-        if list_node.sarg.startswith("#") and list_node.sarg.endswith("#"):
-            for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+    first_gloss_list_index = len(level_node.children)
+    for index, node in enumerate(level_node.children):
+        if (
+            isinstance(node, WikiNode)
+            and node.kind == NodeKind.LIST
+            and node.sarg.startswith("#")
+            and node.sarg.endswith("#")
+        ):
+            for list_item in node.find_child(NodeKind.LIST_ITEM):
                 extract_gloss_list_item(wxr, page_data[-1], list_item)
+            if index < first_gloss_list_index:
+                first_gloss_list_index = index
+
+    extract_tag_form_line_nodes(
+        wxr, page_data[-1], level_node.children[:first_gloss_list_index]
+    )
 
 
 def extract_gloss_list_item(
diff --git a/src/wiktextract/extractor/it/sound.py b/src/wiktextract/extractor/it/sound.py
new file mode 100644
index 00000000..08b9074d
--- /dev/null
+++ b/src/wiktextract/extractor/it/sound.py
@@ -0,0 +1,64 @@
+from wikitextprocessor import LevelNode, NodeKind
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
+from ..share import set_sound_file_url_fields
+from .models import Sound, WordEntry
+
+
+def extract_hyphenation_section(
+    wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
+) -> None:
+    """Store the "Sillabazione" text on every entry of the current language.
+
+    The last non-empty list item wins; entries are left untouched when the
+    section has no usable list item.
+    """
+    hyphenation = ""
+    for list_node in level_node.find_child(NodeKind.LIST):
+        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+            item_text = clean_node(wxr, None, list_item.children)
+            if item_text != "":
+                hyphenation = item_text
+    if hyphenation == "":
+        # Nothing extracted: don't clobber a value stored earlier.
+        return
+    for data in page_data:
+        if data.lang_code == page_data[-1].lang_code:
+            data.hyphenation = hyphenation
+
+
+def extract_pronunciation_section(
+    wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
+) -> None:
+    """Collect the `IPA` and `Audio` templates of a "Pronuncia" section.
+
+    An audio file joins the preceding sound while that sound has no audio
+    yet, otherwise it gets a `Sound` of its own.
+    """
+    sounds = []
+    for t_node in level_node.find_child(NodeKind.TEMPLATE):
+        match t_node.template_name.lower():
+            case "ipa":
+                ipa = clean_node(
+                    wxr, None, t_node.template_parameters.get(1, "")
+                )
+                if ipa != "":
+                    sounds.append(Sound(ipa=ipa))
+            case "audio":
+                sound_file = clean_node(
+                    wxr, None, t_node.template_parameters.get(1, "")
+                )
+                if sound_file != "":
+                    # Reuse the previous sound only while it has no audio,
+                    # so a second file can't overwrite the first one.
+                    if len(sounds) > 0 and sounds[-1].audio == "":
+                        sound = sounds[-1]
+                    else:
+                        sound = Sound()
+                        sounds.append(sound)
+                    set_sound_file_url_fields(wxr, sound_file, sound)
+
+    for data in page_data:
+        if data.lang_code == page_data[-1].lang_code:
+            data.sounds.extend(sounds)
diff --git a/src/wiktextract/extractor/it/tag_form_line.py b/src/wiktextract/extractor/it/tag_form_line.py
new file mode 100644
index 00000000..97a42682
--- /dev/null
+++ b/src/wiktextract/extractor/it/tag_form_line.py
@@ -0,0 +1,61 @@
+from wikitextprocessor import NodeKind, TemplateNode, WikiNode
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
+from .inflection import extract_tabs_template
+from .models import Form, WordEntry
+
+
+def extract_tag_form_line_nodes(
+    wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode | str]
+) -> None:
+    """Read tags and forms from the nodes placed before the gloss list.
+
+    Italic nodes hold gender/number abbreviations, `Tabs` and `Linkp`
+    templates hold inflected forms.
+    """
+    # https://it.wiktionary.org/wiki/Wikizionario:Manuale_di_stile#Genere_e_numero,_declinazione_o_paradigma
+    for node in nodes:
+        if isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
+            extract_italic_tag_node(wxr, word_entry, node)
+        elif isinstance(node, TemplateNode):
+            match node.template_name.lower():
+                case "tabs":
+                    extract_tabs_template(wxr, word_entry, node)
+                case "linkp":
+                    form = clean_node(
+                        wxr, None, node.template_parameters.get(1, "")
+                    )
+                    if form != "":
+                        word_entry.forms.append(
+                            Form(form=form, tags=["plural"])
+                        )
+
+
+ITALIC_TAGS = {
+    "c": "common",
+    "coll": "collective",
+    "f": "feminine",
+    "m": "masculine",
+    "n": "neuter",
+    "pl": "plural",
+    "sing": "singular",
+    "prom": "common",
+    "inv": "invariable",
+}
+
+
+def extract_italic_tag_node(
+    wxr: WiktextractContext, word_entry: WordEntry, node: WikiNode
+) -> None:
+    """Split italic text on whitespace and map each word through ITALIC_TAGS.
+
+    Unknown abbreviations are kept in `raw_tags`.
+    """
+    # https://it.wiktionary.org/wiki/Wikizionario:Genere
+    italic_str = clean_node(wxr, None, node)
+    for raw_tag in italic_str.split():
+        if raw_tag in ITALIC_TAGS:
+            word_entry.tags.append(ITALIC_TAGS[raw_tag])
+        else:
+            word_entry.raw_tags.append(raw_tag)
diff --git a/tests/test_it_etymology.py b/tests/test_it_etymology.py
new file mode 100644
index 00000000..e228a363
--- /dev/null
+++ b/tests/test_it_etymology.py
@@ -0,0 +1,62 @@
+from unittest import TestCase
+
+from wikitextprocessor import Wtp
+
+from wiktextract.config import WiktionaryConfig
+from wiktextract.extractor.it.page import parse_page
+from wiktextract.wxr_context import WiktextractContext
+
+
+class TestItEtymology(TestCase):
+    maxDiff = None
+
+    def setUp(self) -> None:
+        self.wxr = WiktextractContext(
+            Wtp(lang_code="it"),
+            WiktionaryConfig(
+                dump_file_lang_code="it", capture_language_codes=None
+            ),
+        )
+
+    def test_quote_template(self):
+        self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
+        data = parse_page(
+            self.wxr,
+            "cane",
+            """== {{-it-}} ==
+===Sostantivo===
+# {{Term|mammalogia|it}} [[animale]]
+===Etimologia / Derivazione===
+dal latino canis
+====Citazione====
+{{Quote
+|Cane affamato non teme bastone
+|[[q:Giovanni Verga|Giovanni Verga]]}}""",
+        )
+        self.assertEqual(data[0]["etymology_texts"], ["dal latino canis"])
+        self.assertEqual(
+            data[0]["etymology_examples"],
+            [
+                {
+                    "text": "Cane affamato non teme bastone",
+                    "ref": "Giovanni Verga",
+                }
+            ],
+        )
+
+    def test_list(self):
+        self.wxr.wtp.add_page("Template:-la-", 10, "Latino")
+        data = parse_page(
+            self.wxr,
+            "cane",
+            """== {{-la-}} ==
+===Sostantivo, forma flessa===
+# {{Term|mammalogia|it}} [[animale]]
+===Etimologia / Derivazione===
+* (sostantivo) vedi [[canis#Latino|canis]]
+* (voce verbale) vedi [[cano#Latino|canō]]""",
+        )
+        self.assertEqual(
+            data[0]["etymology_texts"],
+            ["(sostantivo) vedi canis", "(voce verbale) vedi canō"],
+        )
diff --git a/tests/test_it_forms.py b/tests/test_it_forms.py
new file mode 100644
index 00000000..64c3320a
--- /dev/null
+++ b/tests/test_it_forms.py
@@ -0,0 +1,57 @@
+from unittest import TestCase
+
+from wikitextprocessor import Wtp
+
+from wiktextract.config import WiktionaryConfig
+from wiktextract.extractor.it.page import parse_page
+from wiktextract.wxr_context import WiktextractContext
+
+
+class TestItForms(TestCase):
+    maxDiff = None
+
+    def setUp(self) -> None:
+        self.wxr = WiktextractContext(
+            Wtp(lang_code="it"),
+            WiktionaryConfig(
+                dump_file_lang_code="it", capture_language_codes=None
+            ),
+        )
+
+    def test_tabs_template(self):
+        self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
+        data = parse_page(
+            self.wxr,
+            "cane",
+            """== {{-it-}} ==
+===Sostantivo===
+{{Pn|w}} ''m sing''
+{{Tabs|cane|cani|cagna|cagne}}
+
+# {{Term|mammalogia|it}} [[animale]]""",
+        )
+        self.assertEqual(
+            data[0]["forms"],
+            [
+                {"form": "cani", "tags": ["masculine", "plural"]},
+                {"form": "cagna", "tags": ["feminine", "singular"]},
+                {"form": "cagne", "tags": ["feminine", "plural"]},
+            ],
+        )
+        self.assertEqual(data[0]["tags"], ["masculine", "singular"])
+
+    def test_linkp_template(self):
+        self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
+        data = parse_page(
+            self.wxr,
+            "cagna",
+            """== {{-it-}} ==
+===Sostantivo===
+{{Pn}} ''f sing'' {{Linkp|cagne}}
+# {{Term|zoologia|it|mammalogia}} femmina del [[cane]]""",
+        )
+        self.assertEqual(
+            data[0]["forms"],
+            [{"form": "cagne", "tags": ["plural"]}],
+        )
+        self.assertEqual(data[0]["tags"], ["feminine", "singular"])
diff --git a/tests/test_it_sound.py b/tests/test_it_sound.py
new file mode 100644
index 00000000..62c695b2
--- /dev/null
+++ b/tests/test_it_sound.py
@@ -0,0 +1,66 @@
+from unittest import TestCase
+
+from wikitextprocessor import Wtp
+
+from wiktextract.config import WiktionaryConfig
+from wiktextract.extractor.it.page import parse_page
+from wiktextract.wxr_context import WiktextractContext
+
+
+class TestItSound(TestCase):
+    maxDiff = None
+
+    def setUp(self) -> None:
+        self.wxr = WiktextractContext(
+            Wtp(lang_code="it"),
+            WiktionaryConfig(
+                dump_file_lang_code="it", capture_language_codes=None
+            ),
+        )
+
+    def test_hyphenation(self):
+        self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
+        data = parse_page(
+            self.wxr,
+            "cane",
+            """== {{-it-}} ==
+===Sostantivo===
+# {{Term|mammalogia|it}} [[animale]]
+===Sillabazione===
+; cà | ne""",
+        )
+        self.assertEqual(data[0]["hyphenation"], "cà | ne")
+
+    def test_ipa_audio_templates(self):
+        self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
+        data = parse_page(
+            self.wxr,
+            "cane",
+            """== {{-it-}} ==
+===Sostantivo===
+# {{Term|mammalogia|it}} [[animale]]
+===Pronuncia===
+{{IPA|/ˈkaːne/}}
+{{Audio|it-cane.ogg}}""",
+        )
+        sound = data[0]["sounds"][0]
+        self.assertEqual(sound["ipa"], "/ˈkaːne/")
+        self.assertEqual(sound["audio"], "it-cane.ogg")
+
+    def test_two_audio_templates(self):
+        self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
+        data = parse_page(
+            self.wxr,
+            "cane",
+            """== {{-it-}} ==
+===Sostantivo===
+# {{Term|mammalogia|it}} [[animale]]
+===Pronuncia===
+{{IPA|/ˈkaːne/}}
+{{Audio|it-cane.ogg}}
+{{Audio|it-cane2.ogg}}""",
+        )
+        self.assertEqual(
+            [sound["audio"] for sound in data[0]["sounds"]],
+            ["it-cane.ogg", "it-cane2.ogg"],
+        )