Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[it, pt] fix check JSON and pydantic errors, extract some forms templates #949

Merged
merged 5 commits into from
Dec 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/wiktextract/extractor/it/analyze_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,10 @@
# https://it.wiktionary.org/wiki/Categoria:Template_per_gli_aggettivi
"Template:-agg form-",
"Template:-agg num form-",
# POS
# https://it.wiktionary.org/wiki/Categoria:Template_altre_voci
"Template:-conf-",
"Template:-kanji-",
# other sections
# https://it.wiktionary.org/wiki/Categoria:Template_sezione
"Template:-esempio-",
Expand Down
56 changes: 55 additions & 1 deletion src/wiktextract/extractor/it/inflection.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from wikitextprocessor import TemplateNode
from wikitextprocessor import NodeKind, TemplateNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Form, WordEntry
from .tags import translate_raw_tags


def extract_tabs_template(
Expand All @@ -22,3 +23,56 @@ def extract_tabs_template(
if arg_value not in ["", wxr.wtp.title]:
form = Form(form=arg_value, tags=tags[arg_name - 1])
word_entry.forms.append(form)


def extract_it_decl_agg_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Collect adjective forms from an expanded ``It-decl-agg*`` table.

    The template is expanded and re-parsed, then every table in the result
    is walked row by row. Header cells supply raw tags and data cells supply
    the forms:

    * a header cell with a ``colspan`` attribute sets a section tag that
      stays in effect for the following rows of the same table;
    * a header cell that is the only header cell of its row labels that row;
    * any other non-empty header cell is recorded as a column label.

    Every data cell whose text is neither empty nor the page title is
    appended to ``word_entry.forms`` with the applicable section, row and
    column raw tags, which are then run through ``translate_raw_tags``.
    """
    # https://it.wiktionary.org/wiki/Template:It-decl-agg4
    # https://it.wiktionary.org/wiki/Template:It-decl-agg2
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_node.find_child(NodeKind.TABLE):
        raw_tag = ""  # section tag taken from the last colspan header
        col_tags = []  # column labels, accumulated for the whole table
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_tag = ""
            col_index = 0
            # The number of header cells is the same for every cell of the
            # row, so count it once instead of once per header cell.
            header_count = sum(
                1 for _ in row.find_child(NodeKind.TABLE_HEADER_CELL)
            )
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    if cell.attrs.get("colspan", "") != "":
                        raw_tag = clean_node(wxr, None, cell)
                    elif header_count == 1:
                        row_tag = clean_node(wxr, None, cell)
                    else:
                        col_header = clean_node(wxr, None, cell)
                        if col_header != "":
                            col_tags.append(col_header)
                elif cell.kind == NodeKind.TABLE_CELL:
                    word = clean_node(wxr, None, cell)
                    if word not in ["", wxr.wtp.title]:
                        form = Form(form=word)
                        if raw_tag != "":
                            form.raw_tags.append(raw_tag)
                        if row_tag != "":
                            form.raw_tags.append(row_tag)
                        if col_index < len(col_tags):
                            form.raw_tags.append(col_tags[col_index])
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                    # Skipped cells still occupy a column, so always advance.
                    col_index += 1
8 changes: 8 additions & 0 deletions src/wiktextract/extractor/it/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,25 @@ def parse_section(
) -> None:
title_text = clean_node(wxr, None, level_node.largs)
if title_text in POS_DATA:
wxr.wtp.start_subsection(title_text)
extract_pos_section(wxr, page_data, base_data, level_node, title_text)
elif title_text == "Traduzione":
wxr.wtp.start_subsection(title_text)
extract_translation_section(wxr, page_data, level_node)
elif title_text == "Etimologia / Derivazione":
wxr.wtp.start_subsection(title_text)
extract_etymology_section(wxr, page_data, level_node)
elif title_text == "Citazione":
wxr.wtp.start_subsection(title_text)
extract_citation_section(wxr, page_data, level_node)
elif title_text == "Sillabazione":
wxr.wtp.start_subsection(title_text)
extract_hyphenation_section(wxr, page_data, level_node)
elif title_text == "Pronuncia":
wxr.wtp.start_subsection(title_text)
extract_pronunciation_section(wxr, page_data, level_node)
elif title_text in LINKAGE_SECTIONS:
wxr.wtp.start_subsection(title_text)
extract_linkage_section(
wxr, page_data, level_node, LINKAGE_SECTIONS[title_text]
)
Expand All @@ -46,6 +53,7 @@ def parse_page(
) -> list[dict[str, Any]]:
# page layout
# https://it.wiktionary.org/wiki/Wikizionario:Manuale_di_stile
# https://it.wiktionary.org/wiki/Aiuto:Come_iniziare_una_pagina
wxr.wtp.start_page(page_title)
tree = wxr.wtp.parse(page_text, pre_expand=True)
page_data: list[WordEntry] = []
Expand Down
9 changes: 8 additions & 1 deletion src/wiktextract/extractor/it/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@
from .section_titles import POS_DATA
from .tag_form_line import extract_tag_form_line_nodes

# https://it.wiktionary.org/wiki/Categoria:Template_per_i_verbi
POS_SUBSECTION_TEMPLATES = frozenset(
[
# https://it.wiktionary.org/wiki/Categoria:Template_per_i_verbi
"-participio passato-",
"-participio presente-",
"Ausiliare",
Expand All @@ -19,7 +19,14 @@
"Passivo",
"Reciproco",
"Riflessivo",
"riflessivo",
"Transitivo",
# https://it.wiktionary.org/wiki/Categoria:Template_vocabolo
"Attivo",
"attivo",
"Inpr",
"inpr",
"Riflpr",
]
)

Expand Down
4 changes: 4 additions & 0 deletions src/wiktextract/extractor/it/section_titles.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# https://it.wiktionary.org/wiki/Wikizionario:Parti_del_discorso
# https://it.wiktionary.org/wiki/Categoria:Template_parti_del_discorso
# https://it.wiktionary.org/wiki/Categoria:Template_aggiornati
# https://it.wiktionary.org/wiki/Categoria:Template_per_gli_aggettivi
# https://it.wiktionary.org/wiki/Categoria:Template_altre_voci
POS_DATA = {
"Acronimo / Abbreviazione": {"pos": "abbrev", "tags": ["abbreviation"]},
"Articolo": {"pos": "article"},
Expand Down Expand Up @@ -61,6 +63,8 @@
},
"Codice / Simbolo": {"pos": "symbol"},
"Carattere hiragana": {"pos": "character", "tags": ["hiragana"]},
"Confisso": {"pos": "affix"},
"Kanji": {"pos": "character", "tags": ["kanji"]},
}


Expand Down
44 changes: 32 additions & 12 deletions src/wiktextract/extractor/it/tag_form_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .inflection import extract_tabs_template
from .inflection import extract_it_decl_agg_template, extract_tabs_template
from .models import Form, WordEntry


Expand All @@ -14,17 +14,12 @@ def extract_tag_form_line_nodes(
if isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
extract_italic_tag_node(wxr, word_entry, node)
elif isinstance(node, TemplateNode):
match node.template_name.lower():
case "tabs":
extract_tabs_template(wxr, word_entry, node)
case "linkp":
form = clean_node(
wxr, None, node.template_parameters.get(1, "")
)
if form != "":
word_entry.forms.append(
Form(form=form, tags=["plural"])
)
if node.template_name.lower() == "tabs":
extract_tabs_template(wxr, word_entry, node)
elif node.template_name.lower() in FORM_LINK_TEMPLATES.keys():
extract_form_link_template(wxr, word_entry, node)
elif node.template_name.lower().startswith("it-decl-agg"):
extract_it_decl_agg_template(wxr, word_entry, node)


ITALIC_TAGS = {
Expand All @@ -50,3 +45,28 @@ def extract_italic_tag_node(
word_entry.tags.append(ITALIC_TAGS[raw_tag])
else:
word_entry.raw_tags.append(raw_tag)


# Lower-cased names of the form-link templates, each mapped to the tags for
# the forms it links to. `extract_tag_form_line_nodes` compares
# `node.template_name.lower()` against these keys.
FORM_LINK_TEMPLATES = {
    "linkf": ["feminine"],
    "linkfp": ["feminine", "plural"],
    "linkg": ["genitive"],
    "linkm": ["masculine"],
    "linkn": ["neuter"],
    "linkmai": ["uppercase"],
    "linkp": ["plural"],
    "links": ["singular"],
}


def extract_form_link_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract forms from a form-link template such as ``Linkp``.

    Positional arguments ``1``, ``2``, ... are read until the first missing
    index. Each non-empty cleaned argument is appended to
    ``word_entry.forms`` with the tags registered for the template in
    ``FORM_LINK_TEMPLATES``.
    """
    # Look the tags up by the lower-cased template name instead of always
    # using "plural", so e.g. "linkf" yields feminine forms. Unknown names
    # fall back to no tags rather than raising.
    tags = FORM_LINK_TEMPLATES.get(t_node.template_name.lower(), [])
    arg_name = 1
    while arg_name in t_node.template_parameters:
        form = clean_node(wxr, None, t_node.template_parameters[arg_name])
        if form != "":
            # Copy so no form shares the list stored in the lookup table.
            word_entry.forms.append(Form(form=form, tags=list(tags)))
        arg_name += 1
30 changes: 30 additions & 0 deletions src/wiktextract/extractor/it/tags.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from .models import WordEntry

# Italian labels found in the templates linked below, mapped to English tags.
# A value is either a single tag (str) or several tags (list of str).
TABLE_TAGS = {
    # https://it.wiktionary.org/wiki/Template:It-decl-agg4
    "singolare": "singular",
    "plurale": "plural",
    "positivo": "positive",
    "superlativo assoluto": ["absolute", "superlative"],
    "maschile": "masculine",
    "femminile": "feminine",
    # https://it.wiktionary.org/wiki/Template:It-decl-agg2
    "m e f": ["masculine", "feminine"],
}


# Combined lookup table consulted by `translate_raw_tags`; at present it only
# contains the entries of TABLE_TAGS.
TAGS = {**TABLE_TAGS}


def translate_raw_tags(data: WordEntry) -> None:
    """Move every raw tag of ``data`` that has an entry in ``TAGS`` to tags.

    A ``TAGS`` value that is a string is appended to ``data.tags``; a list
    value is added element by element. Raw tags without an entry are kept,
    in their original order, as the new ``data.raw_tags``.
    """
    untranslated = []
    for label in data.raw_tags:
        if label not in TAGS:
            untranslated.append(label)
            continue
        mapped = TAGS[label]
        if isinstance(mapped, list):
            data.tags.extend(mapped)
        elif isinstance(mapped, str):
            data.tags.append(mapped)
    data.raw_tags = untranslated
6 changes: 5 additions & 1 deletion src/wiktextract/extractor/it/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,11 @@ def extract_translation_list_item(
for index, node in enumerate(list_item.children):
if before_colon and isinstance(node, str) and ":" in node:
before_colon = False
lang_name = clean_node(wxr, None, list_item.children[:index])
lang_name = clean_node(
wxr,
None,
list_item.children[:index] + [node[: node.index(":")]],
)
for n in list_item.children[:index]:
if isinstance(n, TemplateNode):
lang_code = n.template_name
Expand Down
4 changes: 4 additions & 0 deletions src/wiktextract/extractor/pt/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,9 +98,13 @@ def parse_page(
for level1_node in tree.find_child(NodeKind.LEVEL1):
lang_cats = {}
lang_name = clean_node(wxr, lang_cats, level1_node.largs)
if lang_name == "":
lang_name = "unknown"
lang_code = "unknown"
for lang_template in level1_node.find_content(NodeKind.TEMPLATE):
lang_code = lang_template.template_name.strip("-")
if lang_code == "": # template "--"
lang_code = "unknown"
break
if (
wxr.config.capture_language_codes is not None
Expand Down
28 changes: 28 additions & 0 deletions tests/test_it_forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,3 +55,31 @@ def test_linkp_template(self):
[{"form": "cagne", "tags": ["plural"]}],
)
self.assertEqual(data[0]["tags"], ["feminine", "singular"])

def test_it_decl_agg(self):
    # The mocked It-decl-agg4 table has two column headers, one colspan
    # header and one row header. The "libero" cell equals the page title, so
    # only "liberi" is expected, tagged from the colspan header, the row
    # header and its column header.
    self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
    self.wxr.wtp.add_page("Template:It-decl-agg4", 10, """{|
|- align="center"
| &nbsp;
!bgcolor="#FFFFE0" color="#000"|&nbsp;''[[singolare]]''&nbsp;
!bgcolor="#FFFFE0" color="#000"|&nbsp;''[[plurale]]''&nbsp;
|- align="center"
!bgcolor="#FFFFE0" color="#000" colspan="3"|&nbsp;''[[positivo]]''&nbsp;
|- align="center"
!bgcolor="#FFFFE0" color="#000"|&nbsp;''[[maschile]]''&nbsp;
|&nbsp; [[libero]] &nbsp;
|&nbsp; [[liberi]] &nbsp;
|}""")
    data = parse_page(
        self.wxr,
        "libero",
        """== {{-it-}} ==
===Aggettivo===
{{It-decl-agg4|liber}}
{{Pn|w}} ''m sing''
# non [[imprigionato]] o in [[schiavitù]]""",
    )
    self.assertEqual(
        data[0]["forms"],
        [{"form": "liberi", "tags": ["positive", "masculine", "plural"]}],
    )
16 changes: 16 additions & 0 deletions tests/test_it_translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,19 @@ def test_common_lists(self):
},
],
)

def test_no_lang_name_template(self):
    # The translation item gives the language name as plain text before the
    # colon ("võro") instead of through a template; the expected entry still
    # carries that name as "lang" together with the "vro" language code.
    self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
    data = parse_page(
        self.wxr,
        "Italia",
        """== {{-it-}} ==
===Nome proprio===
# stato
===Traduzione===
:* võro: [[Itaalia]]""",
    )
    self.assertEqual(
        data[0]["translations"],
        [{"word": "Itaalia", "lang_code": "vro", "lang": "võro"}],
    )
Loading