From 60eb97f3c2f80354e13dc893671e53edd7c75a42 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Fri, 13 Dec 2024 13:59:30 +0800 Subject: [PATCH 1/5] [it] extract tag and form data under POS section title --- src/wiktextract/extractor/it/inflection.py | 24 ++++++++ src/wiktextract/extractor/it/models.py | 7 +++ src/wiktextract/extractor/it/pos.py | 19 ++++++- src/wiktextract/extractor/it/tag_form_line.py | 52 +++++++++++++++++ tests/test_it_forms.py | 57 +++++++++++++++++++ 5 files changed, 156 insertions(+), 3 deletions(-) create mode 100644 src/wiktextract/extractor/it/inflection.py create mode 100644 src/wiktextract/extractor/it/tag_form_line.py create mode 100644 tests/test_it_forms.py diff --git a/src/wiktextract/extractor/it/inflection.py b/src/wiktextract/extractor/it/inflection.py new file mode 100644 index 00000000..1f9a1eed --- /dev/null +++ b/src/wiktextract/extractor/it/inflection.py @@ -0,0 +1,24 @@ +from wikitextprocessor import TemplateNode + +from ...page import clean_node +from ...wxr_context import WiktextractContext +from .models import Form, WordEntry + + +def extract_tabs_template( + wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode +) -> None: + # https://it.wiktionary.org/wiki/Template:Tabs + tags = [ + ["masculine", "singular"], + ["masculine", "plural"], + ["feminine", "singular"], + ["feminine", "plural"], + ] + for arg_name in range(1, 5): + arg_value = clean_node( + wxr, None, node.template_parameters.get(arg_name, "") + ) + if arg_value not in ["", wxr.wtp.title]: + form = Form(form=arg_value, tags=tags[arg_name - 1]) + word_entry.forms.append(form) diff --git a/src/wiktextract/extractor/it/models.py b/src/wiktextract/extractor/it/models.py index b3e99345..d6d4c4f0 100644 --- a/src/wiktextract/extractor/it/models.py +++ b/src/wiktextract/extractor/it/models.py @@ -43,6 +43,12 @@ class Translation(ItalianBaseModel): roman: str = "" +class Form(ItalianBaseModel): + form: str = "" + tags: list[str] = [] + raw_tags: list[str] = [] + + class WordEntry(ItalianBaseModel): model_config = ConfigDict(title="Italian Wiktionary") word: str = Field(description="Word string", min_length=1) @@ -55,3 +61,4 @@ class WordEntry(ItalianBaseModel): tags: list[str] = [] raw_tags: list[str] = [] translations: list[Translation] = [] + forms: list[Form] = [] diff --git a/src/wiktextract/extractor/it/pos.py b/src/wiktextract/extractor/it/pos.py index 91f8a9bc..f50a3605 100644 --- a/src/wiktextract/extractor/it/pos.py +++ b/src/wiktextract/extractor/it/pos.py @@ -5,6 +5,7 @@ from .example import extract_example_list_item from .models import Sense, WordEntry from .section_titles import POS_DATA +from .tag_form_line import extract_tag_form_line_nodes def extract_pos_section( @@ -22,10 +23,22 @@ def extract_pos_section( for link_node in level_node.find_child(NodeKind.LINK): clean_node(wxr, page_data[-1], link_node) - for list_node in level_node.find_child(NodeKind.LIST): - if list_node.sarg.startswith("#") and list_node.sarg.endswith("#"): - for list_item in list_node.find_child(NodeKind.LIST_ITEM): + first_gloss_list_index = len(level_node.children) + for index, node in enumerate(level_node.children): + if ( + isinstance(node, WikiNode) + and node.kind == NodeKind.LIST + and node.sarg.startswith("#") + and node.sarg.endswith("#") + ): + for list_item in node.find_child(NodeKind.LIST_ITEM): extract_gloss_list_item(wxr, page_data[-1], list_item) + if index < first_gloss_list_index: + first_gloss_list_index = index + + extract_tag_form_line_nodes( + wxr, page_data[-1], level_node.children[:first_gloss_list_index] + ) def extract_gloss_list_item( diff --git a/src/wiktextract/extractor/it/tag_form_line.py b/src/wiktextract/extractor/it/tag_form_line.py new file mode 100644 index 00000000..97a42682 --- /dev/null +++ b/src/wiktextract/extractor/it/tag_form_line.py @@ -0,0 +1,52 @@ +from wikitextprocessor import NodeKind, TemplateNode, WikiNode + +from ...page import clean_node +from ...wxr_context import WiktextractContext +from .inflection import extract_tabs_template +from .models import Form, WordEntry + + +def extract_tag_form_line_nodes( + wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode | str] +) -> None: + # https://it.wiktionary.org/wiki/Wikizionario:Manuale_di_stile#Genere_e_numero,_declinazione_o_paradigma + for node in nodes: + if isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC: + extract_italic_tag_node(wxr, word_entry, node) + elif isinstance(node, TemplateNode): + match node.template_name.lower(): + case "tabs": + extract_tabs_template(wxr, word_entry, node) + case "linkp": + form = clean_node( + wxr, None, node.template_parameters.get(1, "") + ) + if form != "": + word_entry.forms.append( + Form(form=form, tags=["plural"]) + ) + + +ITALIC_TAGS = { + "c": "common", + "coll": "collective", + "f": "feminine", + "m": "masculine", + "n": "neuter", + "pl": "plural", + "sing": "singular", + "prom": "common", + "inv": "invariable", +} + + +def extract_italic_tag_node( + wxr: WiktextractContext, word_entry: WordEntry, node: WikiNode +) -> None: + # https://it.wiktionary.org/wiki/Wikizionario:Genere + italic_str = clean_node(wxr, None, node) + for raw_tag in italic_str.split(): + if raw_tag in ITALIC_TAGS: + word_entry.tags.append(ITALIC_TAGS[raw_tag]) + else: + word_entry.raw_tags.append(raw_tag) diff --git a/tests/test_it_forms.py b/tests/test_it_forms.py new file mode 100644 index 00000000..64c3320a --- /dev/null +++ b/tests/test_it_forms.py @@ -0,0 +1,57 @@ +from unittest import TestCase + +from wikitextprocessor import Wtp + +from wiktextract.config import WiktionaryConfig +from wiktextract.extractor.it.page import parse_page +from wiktextract.wxr_context import WiktextractContext + + +class TestItForms(TestCase): + maxDiff = None + + def setUp(self) -> None: + self.wxr = WiktextractContext( + Wtp(lang_code="it"), + WiktionaryConfig( + dump_file_lang_code="it", capture_language_codes=None + ), + ) + + def test_tabs_template(self): + self.wxr.wtp.add_page("Template:-it-", 10, "Italiano") + data = parse_page( + self.wxr, + "cane", + """== {{-it-}} == +===Sostantivo=== +{{Pn|w}} ''m sing'' +{{Tabs|cane|cani|cagna|cagne}} + +# {{Term|mammalogia|it}} [[animale]]""", + ) + self.assertEqual( + data[0]["forms"], + [ + {"form": "cani", "tags": ["masculine", "plural"]}, + {"form": "cagna", "tags": ["feminine", "singular"]}, + {"form": "cagne", "tags": ["feminine", "plural"]}, + ], + ) + self.assertEqual(data[0]["tags"], ["masculine", "singular"]) + + def test_linkp_template(self): + self.wxr.wtp.add_page("Template:-it-", 10, "Italiano") + data = parse_page( + self.wxr, + "cagna", + """== {{-it-}} == +===Sostantivo=== +{{Pn}} ''f sing'' {{Linkp|cagne}} +# {{Term|zoologia|it|mammalogia}} femmina del [[cane]]]""", + ) + self.assertEqual( + data[0]["forms"], + [{"form": "cagne", "tags": ["plural"]}], + ) + self.assertEqual(data[0]["tags"], ["feminine", "singular"]) From 536f1d6bb8ce810aee9ac2ab3da27dec5810e7ff Mon Sep 17 00:00:00 2001 From: xxyzz Date: Fri, 13 Dec 2024 15:35:50 +0800 Subject: [PATCH 2/5] [it] override "-ref-" template remove `div` tag around section title wikitext --- src/wiktextract/data/overrides/it.json | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/wiktextract/data/overrides/it.json b/src/wiktextract/data/overrides/it.json index a0076d36..e02978bd 100644 --- a/src/wiktextract/data/overrides/it.json +++ b/src/wiktextract/data/overrides/it.json @@ -3,5 +3,10 @@ "body": "===Traduzione===\n", "namespace_id": 10, "need_pre_expand": true + }, + "Template:-ref-": { + "body": "===Note / Riferimenti===\n", + "namespace_id": 10, + "need_pre_expand": true } } From a118006f1f57e1d27b1c4b89c930fee6536e0daa Mon Sep 17 00:00:00 2001 From: xxyzz Date: Fri, 13 Dec 2024 17:01:27 +0800 Subject: [PATCH 3/5] [it] extract etymology and citation sections --- src/wiktextract/extractor/it/etymology.py | 47 +++++++++++++++++ src/wiktextract/extractor/it/models.py | 2 + src/wiktextract/extractor/it/page.py | 7 +++ tests/test_it_etymology.py | 62 +++++++++++++++++++++++ 4 files changed, 118 insertions(+) create mode 100644 src/wiktextract/extractor/it/etymology.py create mode 100644 tests/test_it_etymology.py diff --git a/src/wiktextract/extractor/it/etymology.py b/src/wiktextract/extractor/it/etymology.py new file mode 100644 index 00000000..8092e7af --- /dev/null +++ b/src/wiktextract/extractor/it/etymology.py @@ -0,0 +1,47 @@ +from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind + +from ...page import clean_node +from ...wxr_context import WiktextractContext +from .models import Example, WordEntry + + +def extract_etymology_section( + wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode +) -> None: + etymology_texts = [] + for list_node in level_node.find_child(NodeKind.LIST): + for list_item in list_node.find_child(NodeKind.LIST_ITEM): + e_str = clean_node(wxr, None, list_item.children) + if e_str != "": + etymology_texts.append(e_str) + + if len(etymology_texts) == 0: + e_str = clean_node( + wxr, None, list(level_node.invert_find_child(LEVEL_KIND_FLAGS)) + ) + if e_str != "": + etymology_texts.append(e_str) + + for data in page_data: + if data.lang_code == page_data[-1].lang_code: + data.etymology_texts.extend(etymology_texts) + + +def extract_citation_section( + wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode +) -> None: + examples = [] + for t_node in level_node.find_child(NodeKind.TEMPLATE): + if t_node.template_name.lower() == "quote": + example = Example() + example.text = clean_node( + wxr, None, t_node.template_parameters.get(1, "") + ) + example.ref = clean_node( + wxr, None, t_node.template_parameters.get(2, "") + ) + if example.text != "": + examples.append(example) + for data in page_data: + if data.lang_code == page_data[-1].lang_code: + data.etymology_examples.extend(examples) diff --git a/src/wiktextract/extractor/it/models.py b/src/wiktextract/extractor/it/models.py index d6d4c4f0..1bd3bd82 100644 --- a/src/wiktextract/extractor/it/models.py +++ b/src/wiktextract/extractor/it/models.py @@ -62,3 +62,5 @@ class WordEntry(ItalianBaseModel): raw_tags: list[str] = [] translations: list[Translation] = [] forms: list[Form] = [] + etymology_texts: list[str] = [] + etymology_examples: list[Example] = [] diff --git a/src/wiktextract/extractor/it/page.py b/src/wiktextract/extractor/it/page.py index 46b2b224..b1a7dcb1 100644 --- a/src/wiktextract/extractor/it/page.py +++ b/src/wiktextract/extractor/it/page.py @@ -4,6 +4,7 @@ from ...page import clean_node from ...wxr_context import WiktextractContext +from .etymology import extract_citation_section, extract_etymology_section from .models import Sense, WordEntry from .pos import extract_pos_section from .section_titles import POS_DATA @@ -21,6 +22,10 @@ def parse_section( extract_pos_section(wxr, page_data, base_data, level_node, title_text) elif title_text == "Traduzione": extract_translation_section(wxr, page_data, level_node) + elif title_text == "Etimologia / Derivazione": + extract_etymology_section(wxr, page_data, level_node) + elif title_text == "Citazione": + extract_citation_section(wxr, page_data, level_node) for next_level in level_node.find_child(LEVEL_KIND_FLAGS): parse_section(wxr, page_data, base_data, next_level) @@ -37,6 +42,8 @@ def parse_page( for level2_node in tree.find_child(NodeKind.LEVEL2): lang_cats = {} lang_name = clean_node(wxr, lang_cats, level2_node.largs) + if lang_name in ["Altri progetti", "Note / Riferimenti"]: + continue lang_code = "unknown" for lang_template in level2_node.find_content(NodeKind.TEMPLATE): lang_code = lang_template.template_name.strip("-") diff --git a/tests/test_it_etymology.py b/tests/test_it_etymology.py new file mode 100644 index 00000000..e228a363 --- /dev/null +++ b/tests/test_it_etymology.py @@ -0,0 +1,62 @@ +from unittest import TestCase + +from wikitextprocessor import Wtp + +from wiktextract.config import WiktionaryConfig +from wiktextract.extractor.it.page import parse_page +from wiktextract.wxr_context import WiktextractContext + + +class TestItGloss(TestCase): + maxDiff = None + + def setUp(self) -> None: + self.wxr = WiktextractContext( + Wtp(lang_code="it"), + WiktionaryConfig( + dump_file_lang_code="it", capture_language_codes=None + ), + ) + + def test_quote_template(self): + self.wxr.wtp.add_page("Template:-it-", 10, "Italiano") + data = parse_page( + self.wxr, + "cane", + """== {{-it-}} == +===Sostantivo=== +# {{Term|mammalogia|it}} [[animale]] +===Etimologia / Derivazione=== +dal latino canis +====Citazione==== +{{Quote +|Cane affamato non teme bastone +|[[q:Giovanni Verga|Giovanni Verga]]}}""", + ) + self.assertEqual(data[0]["etymology_texts"], ["dal latino canis"]) + self.assertEqual( + data[0]["etymology_examples"], + [ + { + "text": "Cane affamato non teme bastone", + "ref": "Giovanni Verga", + } + ], + ) + + def test_list(self): + self.wxr.wtp.add_page("Template:-la-", 10, "Latino") + data = parse_page( + self.wxr, + "cane", + """== {{-it-}} == +===Sostantivo, forma flessa=== +# {{Term|mammalogia|it}} [[animale]] +===Etimologia / Derivazione=== +* (sostantivo) vedi [[canis#Latino|canis]] +* (voce verbale) vedi [[cano#Latino|canō]]""", + ) + self.assertEqual( + data[0]["etymology_texts"], + ["(sostantivo) vedi canis", "(voce verbale) vedi canō"], + ) From 611bf725671513de685c124c923058fd9202ad5c Mon Sep 17 00:00:00 2001 From: xxyzz Date: Fri, 13 Dec 2024 17:28:18 +0800 Subject: [PATCH 4/5] [it] extract hyphenation section --- src/wiktextract/extractor/it/models.py | 1 + src/wiktextract/extractor/it/page.py | 3 +++ src/wiktextract/extractor/it/sound.py | 17 ++++++++++++++ tests/test_it_sound.py | 32 ++++++++++++++++++++++++++ 4 files changed, 53 insertions(+) create mode 100644 src/wiktextract/extractor/it/sound.py create mode 100644 tests/test_it_sound.py diff --git a/src/wiktextract/extractor/it/models.py b/src/wiktextract/extractor/it/models.py index 1bd3bd82..d5f50924 100644 --- a/src/wiktextract/extractor/it/models.py +++ b/src/wiktextract/extractor/it/models.py @@ -64,3 +64,4 @@ class WordEntry(ItalianBaseModel): forms: list[Form] = [] etymology_texts: list[str] = [] etymology_examples: list[Example] = [] + hyphenation: str = "" diff --git a/src/wiktextract/extractor/it/page.py b/src/wiktextract/extractor/it/page.py index b1a7dcb1..dc9c17de 100644 --- a/src/wiktextract/extractor/it/page.py +++ b/src/wiktextract/extractor/it/page.py @@ -8,6 +8,7 @@ from .models import Sense, WordEntry from .pos import extract_pos_section from .section_titles import POS_DATA +from .sound import extract_hyphenation_section from .translation import extract_translation_section @@ -26,6 +27,8 @@ def parse_section( extract_etymology_section(wxr, page_data, level_node) elif title_text == "Citazione": extract_citation_section(wxr, page_data, level_node) + elif title_text == "Sillabazione": + extract_hyphenation_section(wxr, page_data, level_node) for next_level in level_node.find_child(LEVEL_KIND_FLAGS): parse_section(wxr, page_data, base_data, next_level) diff --git a/src/wiktextract/extractor/it/sound.py b/src/wiktextract/extractor/it/sound.py new file mode 100644 index 00000000..6988762b --- /dev/null +++ b/src/wiktextract/extractor/it/sound.py @@ -0,0 +1,17 @@ +from wikitextprocessor import LevelNode, NodeKind + +from ...page import clean_node +from ...wxr_context import WiktextractContext +from .models import WordEntry + + +def extract_hyphenation_section( + wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode +) -> None: + hyphenation = "" + for list_node in level_node.find_child(NodeKind.LIST): + for list_item in list_node.find_child(NodeKind.LIST_ITEM): + hyphenation = clean_node(wxr, None, list_item.children) + for data in page_data: + if data.lang_code == page_data[-1].lang_code: + data.hyphenation = hyphenation diff --git a/tests/test_it_sound.py b/tests/test_it_sound.py new file mode 100644 index 00000000..a5a5a820 --- /dev/null +++ b/tests/test_it_sound.py @@ -0,0 +1,32 @@ +from unittest import TestCase + +from wikitextprocessor import Wtp + +from wiktextract.config import WiktionaryConfig +from wiktextract.extractor.it.page import parse_page +from wiktextract.wxr_context import WiktextractContext + + +class TestItSound(TestCase): + maxDiff = None + + def setUp(self) -> None: + self.wxr = WiktextractContext( + Wtp(lang_code="it"), + WiktionaryConfig( + dump_file_lang_code="it", capture_language_codes=None + ), + ) + + def test_hyphenation(self): + self.wxr.wtp.add_page("Template:-it-", 10, "Italiano") + data = parse_page( + self.wxr, + "cane", + """== {{-it-}} == +===Sostantivo=== +# {{Term|mammalogia|it}} [[animale]] +===Sillabazione=== +; cà | ne""", + ) + self.assertEqual(data[0]["hyphenation"], "cà | ne") From e17550032c890d2079fe20079a0ddef960d28ac9 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Fri, 13 Dec 2024 17:44:25 +0800 Subject: [PATCH 5/5] [it] extract pronunciation section --- src/wiktextract/extractor/it/models.py | 14 +++++++++++ src/wiktextract/extractor/it/page.py | 4 +++- src/wiktextract/extractor/it/sound.py | 32 +++++++++++++++++++++++++- tests/test_it_sound.py | 16 +++++++++++++ 4 files changed, 64 insertions(+), 2 deletions(-) diff --git a/src/wiktextract/extractor/it/models.py b/src/wiktextract/extractor/it/models.py index d5f50924..1e42a0f5 100644 --- a/src/wiktextract/extractor/it/models.py +++ b/src/wiktextract/extractor/it/models.py @@ -49,6 +49,19 @@ class Form(ItalianBaseModel): raw_tags: list[str] = [] +class Sound(ItalianBaseModel): + ipa: str = Field(default="", description="International Phonetic Alphabet") + audio: str = Field(default="", description="Audio file name") + wav_url: str = "" + oga_url: str = "" + ogg_url: str = "" + mp3_url: str = "" + opus_url: str = "" + flac_url: str = "" + tags: list[str] = [] + raw_tags: list[str] = [] + + class WordEntry(ItalianBaseModel): model_config = ConfigDict(title="Italian Wiktionary") word: str = Field(description="Word string", min_length=1) @@ -65,3 +78,4 @@ class WordEntry(ItalianBaseModel): etymology_texts: list[str] = [] etymology_examples: list[Example] = [] hyphenation: str = "" + sounds: list[Sound] = [] diff --git a/src/wiktextract/extractor/it/page.py b/src/wiktextract/extractor/it/page.py index dc9c17de..7817a40b 100644 --- a/src/wiktextract/extractor/it/page.py +++ b/src/wiktextract/extractor/it/page.py @@ -8,7 +8,7 @@ from .models import Sense, WordEntry from .pos import extract_pos_section from .section_titles import POS_DATA -from .sound import extract_hyphenation_section +from .sound import extract_hyphenation_section, extract_pronunciation_section from .translation import extract_translation_section @@ -29,6 +29,8 @@ def parse_section( extract_citation_section(wxr, page_data, level_node) elif title_text == "Sillabazione": extract_hyphenation_section(wxr, page_data, level_node) + elif title_text == "Pronuncia": + extract_pronunciation_section(wxr, page_data, level_node) for next_level in level_node.find_child(LEVEL_KIND_FLAGS): parse_section(wxr, page_data, base_data, next_level) diff --git a/src/wiktextract/extractor/it/sound.py b/src/wiktextract/extractor/it/sound.py index 6988762b..08b9074d 100644 --- a/src/wiktextract/extractor/it/sound.py +++ b/src/wiktextract/extractor/it/sound.py @@ -2,7 +2,8 @@ from ...page import clean_node from ...wxr_context import WiktextractContext -from .models import WordEntry +from ..share import set_sound_file_url_fields +from .models import Sound, WordEntry def extract_hyphenation_section( @@ -15,3 +16,32 @@ def extract_hyphenation_section( for data in page_data: if data.lang_code == page_data[-1].lang_code: data.hyphenation = hyphenation + + +def extract_pronunciation_section( + wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode +) -> None: + sounds = [] + for t_node in level_node.find_child(NodeKind.TEMPLATE): + match t_node.template_name.lower(): + case "ipa": + ipa = clean_node( + wxr, None, t_node.template_parameters.get(1, "") + ) + if ipa != "": + sounds.append(Sound(ipa=ipa)) + case "audio": + sound_file = clean_node( + wxr, None, t_node.template_parameters.get(1, "") + ) + if sound_file != "": + if len(sounds) > 0: + set_sound_file_url_fields(wxr, sound_file, sounds[-1]) + else: + sound = Sound() + set_sound_file_url_fields(wxr, sound_file, sound) + sounds.append(sound) + + for data in page_data: + if data.lang_code == page_data[-1].lang_code: + data.sounds.extend(sounds) diff --git a/tests/test_it_sound.py b/tests/test_it_sound.py index a5a5a820..62c695b2 100644 --- a/tests/test_it_sound.py +++ b/tests/test_it_sound.py @@ -30,3 +30,19 @@ def test_hyphenation(self): ; cà | ne""", ) self.assertEqual(data[0]["hyphenation"], "cà | ne") + + def test_ipa_audio_templates(self): + self.wxr.wtp.add_page("Template:-it-", 10, "Italiano") + data = parse_page( + self.wxr, + "cane", + """== {{-it-}} == +===Sostantivo=== +# {{Term|mammalogia|it}} [[animale]] +===Pronuncia=== +{{IPA|/ˈkaːne/}} +{{Audio|it-cane.ogg}}""", + ) + sound = data[0]["sounds"][0] + self.assertEqual(sound["ipa"], "/ˈkaːne/") + self.assertEqual(sound["audio"], "it-cane.ogg")