diff --git a/src/wiktextract/extractor/it/analyze_template.py b/src/wiktextract/extractor/it/analyze_template.py index d76e34af..1a482a50 100644 --- a/src/wiktextract/extractor/it/analyze_template.py +++ b/src/wiktextract/extractor/it/analyze_template.py @@ -55,6 +55,10 @@ # https://it.wiktionary.org/wiki/Categoria:Template_per_gli_aggettivi "Template:-agg form-", "Template:-agg num form-", + # POS + # https://it.wiktionary.org/wiki/Categoria:Template_altre_voci + "Template:-conf-", + "Template:-kanji-", # other sections # https://it.wiktionary.org/wiki/Categoria:Template_sezione "Template:-esempio-", diff --git a/src/wiktextract/extractor/it/inflection.py b/src/wiktextract/extractor/it/inflection.py index 1f9a1eed..768a5c86 100644 --- a/src/wiktextract/extractor/it/inflection.py +++ b/src/wiktextract/extractor/it/inflection.py @@ -1,8 +1,9 @@ -from wikitextprocessor import TemplateNode +from wikitextprocessor import NodeKind, TemplateNode from ...page import clean_node from ...wxr_context import WiktextractContext from .models import Form, WordEntry +from .tags import translate_raw_tags def extract_tabs_template( @@ -22,3 +23,56 @@ def extract_tabs_template( if arg_value not in ["", wxr.wtp.title]: form = Form(form=arg_value, tags=tags[arg_name - 1]) word_entry.forms.append(form) + + +def extract_it_decl_agg_template( + wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode +) -> None: + # https://it.wiktionary.org/wiki/Template:It-decl-agg4 + # https://it.wiktionary.org/wiki/Template:It-decl-agg2 + expanded_node = wxr.wtp.parse( + wxr.wtp.node_to_wikitext(t_node), expand_all=True + ) + for table in expanded_node.find_child(NodeKind.TABLE): + raw_tag = "" + col_tags = [] + for row in table.find_child(NodeKind.TABLE_ROW): + row_tag = "" + col_index = 0 + for cell in row.find_child( + NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL + ): + match cell.kind: + case NodeKind.TABLE_HEADER_CELL: + col_span = cell.attrs.get("colspan", "") + if col_span != 
"": + raw_tag = clean_node(wxr, None, cell) + elif ( + len( + [ + n + for n in row.find_child( + NodeKind.TABLE_HEADER_CELL + ) + ] + ) + == 1 + ): + row_tag = clean_node(wxr, None, cell) + else: + col_header = clean_node(wxr, None, cell) + if col_header != "": + col_tags.append(col_header) + case NodeKind.TABLE_CELL: + word = clean_node(wxr, None, cell) + if word not in ["", wxr.wtp.title]: + form = Form(form=word) + if raw_tag != "": + form.raw_tags.append(raw_tag) + if row_tag != "": + form.raw_tags.append(row_tag) + if col_index < len(col_tags): + form.raw_tags.append(col_tags[col_index]) + translate_raw_tags(form) + word_entry.forms.append(form) + col_index += 1 diff --git a/src/wiktextract/extractor/it/page.py b/src/wiktextract/extractor/it/page.py index 68f51ce2..ee0bb60d 100644 --- a/src/wiktextract/extractor/it/page.py +++ b/src/wiktextract/extractor/it/page.py @@ -21,18 +21,25 @@ def parse_section( ) -> None: title_text = clean_node(wxr, None, level_node.largs) if title_text in POS_DATA: + wxr.wtp.start_subsection(title_text) extract_pos_section(wxr, page_data, base_data, level_node, title_text) elif title_text == "Traduzione": + wxr.wtp.start_subsection(title_text) extract_translation_section(wxr, page_data, level_node) elif title_text == "Etimologia / Derivazione": + wxr.wtp.start_subsection(title_text) extract_etymology_section(wxr, page_data, level_node) elif title_text == "Citazione": + wxr.wtp.start_subsection(title_text) extract_citation_section(wxr, page_data, level_node) elif title_text == "Sillabazione": + wxr.wtp.start_subsection(title_text) extract_hyphenation_section(wxr, page_data, level_node) elif title_text == "Pronuncia": + wxr.wtp.start_subsection(title_text) extract_pronunciation_section(wxr, page_data, level_node) elif title_text in LINKAGE_SECTIONS: + wxr.wtp.start_subsection(title_text) extract_linkage_section( wxr, page_data, level_node, LINKAGE_SECTIONS[title_text] ) @@ -46,6 +53,7 @@ def parse_page( ) -> list[dict[str, Any]]: # 
page layout # https://it.wiktionary.org/wiki/Wikizionario:Manuale_di_stile + # https://it.wiktionary.org/wiki/Aiuto:Come_iniziare_una_pagina wxr.wtp.start_page(page_title) tree = wxr.wtp.parse(page_text, pre_expand=True) page_data: list[WordEntry] = [] diff --git a/src/wiktextract/extractor/it/pos.py b/src/wiktextract/extractor/it/pos.py index bded7d5c..12f3fee9 100644 --- a/src/wiktextract/extractor/it/pos.py +++ b/src/wiktextract/extractor/it/pos.py @@ -7,9 +7,9 @@ from .section_titles import POS_DATA from .tag_form_line import extract_tag_form_line_nodes -# https://it.wiktionary.org/wiki/Categoria:Template_per_i_verbi POS_SUBSECTION_TEMPLATES = frozenset( [ + # https://it.wiktionary.org/wiki/Categoria:Template_per_i_verbi "-participio passato-", "-participio presente-", "Ausiliare", @@ -19,7 +19,14 @@ "Passivo", "Reciproco", "Riflessivo", + "riflessivo", "Transitivo", + # https://it.wiktionary.org/wiki/Categoria:Template_vocabolo + "Attivo", + "attivo", + "Inpr", + "inpr", + "Riflpr", ] ) diff --git a/src/wiktextract/extractor/it/section_titles.py b/src/wiktextract/extractor/it/section_titles.py index b5360d75..c3631c46 100644 --- a/src/wiktextract/extractor/it/section_titles.py +++ b/src/wiktextract/extractor/it/section_titles.py @@ -1,6 +1,8 @@ +# https://it.wiktionary.org/wiki/Wikizionario:Parti_del_discorso # https://it.wiktionary.org/wiki/Categoria:Template_parti_del_discorso # https://it.wiktionary.org/wiki/Categoria:Template_aggiornati # https://it.wiktionary.org/wiki/Categoria:Template_per_gli_aggettivi +# https://it.wiktionary.org/wiki/Categoria:Template_altre_voci POS_DATA = { "Acronimo / Abbreviazione": {"pos": "abbrev", "tags": ["abbreviation"]}, "Articolo": {"pos": "article"}, @@ -61,6 +63,8 @@ }, "Codice / Simbolo": {"pos": "symbol"}, "Carattere hiragana": {"pos": "character", "tags": ["hiragana"]}, + "Confisso": {"pos": "affix"}, + "Kanji": {"pos": "character", "tags": ["kanji"]}, } diff --git a/src/wiktextract/extractor/it/tag_form_line.py 
b/src/wiktextract/extractor/it/tag_form_line.py index 97a42682..b7e1da16 100644 --- a/src/wiktextract/extractor/it/tag_form_line.py +++ b/src/wiktextract/extractor/it/tag_form_line.py @@ -2,7 +2,7 @@ from ...page import clean_node from ...wxr_context import WiktextractContext -from .inflection import extract_tabs_template +from .inflection import extract_it_decl_agg_template, extract_tabs_template from .models import Form, WordEntry @@ -14,17 +14,12 @@ def extract_tag_form_line_nodes( if isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC: extract_italic_tag_node(wxr, word_entry, node) elif isinstance(node, TemplateNode): - match node.template_name.lower(): - case "tabs": - extract_tabs_template(wxr, word_entry, node) - case "linkp": - form = clean_node( - wxr, None, node.template_parameters.get(1, "") - ) - if form != "": - word_entry.forms.append( - Form(form=form, tags=["plural"]) - ) + if node.template_name.lower() == "tabs": + extract_tabs_template(wxr, word_entry, node) + elif node.template_name.lower() in FORM_LINK_TEMPLATES: + extract_form_link_template(wxr, word_entry, node) + elif node.template_name.lower().startswith("it-decl-agg"): + extract_it_decl_agg_template(wxr, word_entry, node) ITALIC_TAGS = { @@ -50,3 +45,32 @@ def extract_italic_tag_node( word_entry.tags.append(ITALIC_TAGS[raw_tag]) else: word_entry.raw_tags.append(raw_tag) + + +FORM_LINK_TEMPLATES = { + "linkf": ["feminine"], + "linkfp": ["feminine", "plural"], + "linkg": ["genitive"], + "linkm": ["masculine"], + "linkn": ["neuter"], + "linkmai": ["uppercase"], + "linkp": ["plural"], + "links": ["singular"], +} + + +def extract_form_link_template( + wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode +) -> None: + # Each positional argument of a "link*" template is one form; its tags + # come from FORM_LINK_TEMPLATES, keyed by the lowercased template name. + tags = FORM_LINK_TEMPLATES.get(t_node.template_name.lower(), []) + arg_name = 1 + while arg_name in t_node.template_parameters: + form = clean_node( + wxr, None, t_node.template_parameters.get(arg_name, "") + ) + if form != "": + # copy so Form objects never share the table's list + word_entry.forms.append(Form(form=form, tags=list(tags))) + arg_name += 1 diff --git 
a/src/wiktextract/extractor/it/tags.py b/src/wiktextract/extractor/it/tags.py new file mode 100644 index 00000000..036ae1f9 --- /dev/null +++ b/src/wiktextract/extractor/it/tags.py @@ -0,0 +1,30 @@ +from .models import WordEntry + +TABLE_TAGS = { + # https://it.wiktionary.org/wiki/Template:It-decl-agg4 + "singolare": "singular", + "plurale": "plural", + "positivo": "positive", + "superlativo assoluto": ["absolute", "superlative"], + "maschile": "masculine", + "femminile": "feminine", + # https://it.wiktionary.org/wiki/Template:It-decl-agg2 + "m e f": ["masculine", "feminine"], +} + + +TAGS = {**TABLE_TAGS} + + +def translate_raw_tags(data: WordEntry) -> None: + raw_tags = [] + for raw_tag in data.raw_tags: + if raw_tag in TAGS: + tr_tag = TAGS[raw_tag] + if isinstance(tr_tag, str): + data.tags.append(tr_tag) + elif isinstance(tr_tag, list): + data.tags.extend(tr_tag) + else: + raw_tags.append(raw_tag) + data.raw_tags = raw_tags diff --git a/src/wiktextract/extractor/it/translation.py b/src/wiktextract/extractor/it/translation.py index e1cde868..b0fcba24 100644 --- a/src/wiktextract/extractor/it/translation.py +++ b/src/wiktextract/extractor/it/translation.py @@ -50,7 +50,11 @@ def extract_translation_list_item( for index, node in enumerate(list_item.children): if before_colon and isinstance(node, str) and ":" in node: before_colon = False - lang_name = clean_node(wxr, None, list_item.children[:index]) + lang_name = clean_node( + wxr, + None, + list_item.children[:index] + [node[: node.index(":")]], + ) for n in list_item.children[:index]: if isinstance(n, TemplateNode): lang_code = n.template_name diff --git a/src/wiktextract/extractor/pt/page.py b/src/wiktextract/extractor/pt/page.py index 55c70f03..eca169ec 100644 --- a/src/wiktextract/extractor/pt/page.py +++ b/src/wiktextract/extractor/pt/page.py @@ -98,9 +98,13 @@ def parse_page( for level1_node in tree.find_child(NodeKind.LEVEL1): lang_cats = {} lang_name = clean_node(wxr, lang_cats, level1_node.largs) + if 
lang_name == "": + lang_name = "unknown" lang_code = "unknown" for lang_template in level1_node.find_content(NodeKind.TEMPLATE): lang_code = lang_template.template_name.strip("-") + if lang_code == "": # template "--" + lang_code = "unknown" break if ( wxr.config.capture_language_codes is not None diff --git a/tests/test_it_forms.py b/tests/test_it_forms.py index 64c3320a..8ebeddf9 100644 --- a/tests/test_it_forms.py +++ b/tests/test_it_forms.py @@ -55,3 +55,31 @@ def test_linkp_template(self): [{"form": "cagne", "tags": ["plural"]}], ) self.assertEqual(data[0]["tags"], ["feminine", "singular"]) + + def test_it_decl_agg(self): + self.wxr.wtp.add_page("Template:-it-", 10, "Italiano") + self.wxr.wtp.add_page("Template:It-decl-agg4", 10, """{| +|- align="center" +|   +!bgcolor="#FFFFE0" color="#000"| ''[[singolare]]''  +!bgcolor="#FFFFE0" color="#000"| ''[[plurale]]''  +|- align="center" +!bgcolor="#FFFFE0" color="#000" colspan="3"| ''[[positivo]]''  +|- align="center" +!bgcolor="#FFFFE0" color="#000"| ''[[maschile]]''  +|  [[libero]]   +|  [[liberi]]   +|}""") + data = parse_page( + self.wxr, + "libero", + """== {{-it-}} == +===Aggettivo=== +{{It-decl-agg4|liber}} +{{Pn|w}} ''m sing'' +# non [[imprigionato]] o in [[schiavitù]]""", + ) + self.assertEqual( + data[0]["forms"], + [{"form": "liberi", "tags": ["positive", "masculine", "plural"]}], + ) diff --git a/tests/test_it_translation.py b/tests/test_it_translation.py index c91ee5e6..1ab8dfb7 100644 --- a/tests/test_it_translation.py +++ b/tests/test_it_translation.py @@ -52,3 +52,19 @@ def test_common_lists(self): }, ], ) + + def test_no_lang_name_template(self): + self.wxr.wtp.add_page("Template:-it-", 10, "Italiano") + data = parse_page( + self.wxr, + "Italia", + """== {{-it-}} == +===Nome proprio=== +# stato +===Traduzione=== +:* võro: [[Itaalia]]""", + ) + self.assertEqual( + data[0]["translations"], + [{"word": "Itaalia", "lang_code": "vro", "lang": "võro"}], + )