Skip to content

Commit

Permalink
Merge pull request #950 from xxyzz/it
Browse files Browse the repository at this point in the history
[it] extract "a cmp" and "it-conj" form templates
  • Loading branch information
xxyzz authored Dec 19, 2024
2 parents 58f2b7d + f54aa3c commit 85a9718
Show file tree
Hide file tree
Showing 5 changed files with 320 additions and 6 deletions.
147 changes: 146 additions & 1 deletion src/wiktextract/extractor/it/inflection.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
from wikitextprocessor import NodeKind, TemplateNode
import re
from dataclasses import dataclass

from wikitextprocessor import NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
Expand Down Expand Up @@ -76,3 +79,145 @@ def extract_it_decl_agg_template(
translate_raw_tags(form)
word_entry.forms.append(form)
col_index += 1


def extract_appendix_conjugation_page(
    wxr: WiktextractContext, word_entry: WordEntry, page_title: str
) -> None:
    """Collect conjugation forms from the appendix page ``page_title``.

    The page body is requested with ``100`` as the second argument
    (presumably the namespace id of "Appendice:" pages - confirm). When no
    body is returned nothing is extracted. Otherwise each top-level
    template whose lower-cased name is "it-conj" is passed to
    ``extract_it_conj_template``.
    """
    # https://it.wiktionary.org/wiki/Appendice:Coniugazioni
    body = wxr.wtp.get_page_body(page_title, 100)
    if body is not None:
        parsed_page = wxr.wtp.parse(body)
        # Lazy filter: templates are still handled one by one in page order.
        conj_templates = (
            node
            for node in parsed_page.find_child(NodeKind.TEMPLATE)
            if node.template_name.lower() == "it-conj"
        )
        for conj_template in conj_templates:
            extract_it_conj_template(
                wxr, word_entry, conj_template, page_title
            )


@dataclass
class TableHeader:
    """Header cell of a conjugation table together with its position."""

    # Cleaned header text; added to a form's raw tags when the header
    # covers the form's column.
    text: str
    # Index of the first column covered by this header.
    col_index: int
    # Number of columns covered, counted from ``col_index``.
    colspan: int
    # The only constructor call visible in this module passes 0 for both
    # of the following fields.
    row_index: int
    rowspan: int
rowspan: int


def extract_it_conj_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    page_title: str,
) -> None:
    """Extract verb forms from the tables of an expanded "it-conj" template.

    The template is expanded and every top-level table of the result is
    walked row by row, cell by cell:

    * header cells "persona" and "indicativo" are ignored;
    * "condizionale" and "congiuntivo" drop the collected column headers;
    * "imperativo" drops them too and becomes the current row header;
    * any other header cell becomes the row header when its row also
      contains data cells, otherwise it is recorded as a column header
      spanning ``colspan`` columns;
    * a data cell contributes forms either through the tables nested in
      it or, when it has none, one form per line of its cleaned text.

    Forms are added through ``add_it_conj_form`` with ``page_title`` as
    their source.
    """
    # https://it.wiktionary.org/wiki/Template:It-conj
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(template_wikitext, expand_all=True)
    for table in expanded.find_child(NodeKind.TABLE):
        col_headers: list[TableHeader] = []
        row_header = ""
        for row in table.find_child(NodeKind.TABLE_ROW):
            col_index = 0
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    header_str = clean_node(wxr, None, cell)
                    if header_str in ("persona", "indicativo"):
                        continue
                    if header_str in ("condizionale", "congiuntivo"):
                        # A new mood section starts: old columns are stale.
                        col_headers.clear()
                        continue
                    if header_str == "imperativo":
                        col_headers.clear()
                        row_header = "imperativo"
                        continue
                    if row.contain_node(NodeKind.TABLE_CELL):
                        # Header cell of a data row labels the whole row.
                        row_header = header_str
                        continue
                    span_str = cell.attrs.get("colspan", "1")
                    span = 1
                    if re.fullmatch(r"\d+", span_str):
                        span = int(span_str)
                    col_headers.append(
                        TableHeader(header_str, col_index, span, 0, 0)
                    )
                    col_index += span
                elif cell.kind == NodeKind.TABLE_CELL:
                    found_nested_table = False
                    for nested_table in cell.find_child_recursively(
                        NodeKind.TABLE
                    ):
                        extract_it_conj_cell_table(
                            wxr,
                            word_entry,
                            nested_table,
                            row_header,
                            col_headers,
                            page_title,
                        )
                        found_nested_table = True
                    if not found_nested_table:
                        cell_text = clean_node(wxr, None, cell)
                        for text_line in cell_text.splitlines():
                            form_str = text_line.strip(", ")
                            if form_str.startswith("verbo di "):
                                continue  # first row
                            if form_str in ("", wxr.wtp.title):
                                continue
                            add_it_conj_form(
                                word_entry,
                                form_str,
                                page_title,
                                row_header,
                                col_index,
                                col_headers,
                            )
                    col_index += 1


def extract_it_conj_cell_table(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    table_node: WikiNode,
    row_header: str,
    col_headers: list[TableHeader],
    page_title: str,
) -> None:
    """Extract forms from a table nested inside a conjugation table cell.

    Each data cell of each row is cleaned and split into lines. Every line
    that is neither empty nor equal to the current page title is added as
    a form via ``add_it_conj_form``. The column index handed over is the
    cell's position among the data cells of its own row in the nested
    table.
    """
    for row in table_node.find_child(NodeKind.TABLE_ROW):
        position = 0
        for cell in row.find_child(NodeKind.TABLE_CELL):
            cell_lines = clean_node(wxr, None, cell).splitlines()
            for cell_str in cell_lines:
                if cell_str == "" or cell_str == wxr.wtp.title:
                    continue
                add_it_conj_form(
                    word_entry,
                    cell_str,
                    page_title,
                    row_header,
                    position,
                    col_headers,
                )
            position += 1


def add_it_conj_form(
    word_entry: WordEntry,
    form_str: str,
    page_title: str,
    row_header: str,
    col_index: int,
    col_headers: list[TableHeader],
) -> None:
    """Append one conjugation form to ``word_entry.forms``.

    The form's raw tags are the row header (when not empty) followed by
    the text of every column header whose span
    ``[col_index, col_index + colspan)`` contains ``col_index``, in the
    order the headers were collected. ``translate_raw_tags`` is applied
    before the form is appended. ``page_title`` is stored as the form's
    ``source``.
    """
    collected_tags = []
    if row_header != "":
        collected_tags.append(row_header)
    collected_tags.extend(
        header.text
        for header in col_headers
        if header.col_index <= col_index < header.col_index + header.colspan
    )
    form = Form(form=form_str, source=page_title)
    form.raw_tags.extend(collected_tags)
    translate_raw_tags(form)
    word_entry.forms.append(form)
1 change: 1 addition & 0 deletions src/wiktextract/extractor/it/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ class Form(ItalianBaseModel):
form: str = ""
tags: list[str] = []
raw_tags: list[str] = []
source: str = ""


class Sound(ItalianBaseModel):
Expand Down
61 changes: 59 additions & 2 deletions src/wiktextract/extractor/it/tag_form_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,13 @@

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .inflection import extract_it_decl_agg_template, extract_tabs_template
from .inflection import (
extract_appendix_conjugation_page,
extract_it_decl_agg_template,
extract_tabs_template,
)
from .models import Form, WordEntry
from .tags import translate_raw_tags


def extract_tag_form_line_nodes(
Expand All @@ -16,10 +21,14 @@ def extract_tag_form_line_nodes(
elif isinstance(node, TemplateNode):
if node.template_name.lower() == "tabs":
extract_tabs_template(wxr, word_entry, node)
elif node.template_name.lower() in FORM_LINK_TEMPLATES.keys():
elif node.template_name.lower() in FORM_LINK_TEMPLATES:
extract_form_link_template(wxr, word_entry, node)
elif node.template_name.lower().startswith("it-decl-agg"):
extract_it_decl_agg_template(wxr, word_entry, node)
elif node.template_name.lower() == "a cmp":
extract_a_cmp_template(wxr, word_entry, node)
elif node.template_name.lower() == "pn":
extract_pn_template(wxr, word_entry, node)


ITALIC_TAGS = {
Expand Down Expand Up @@ -70,3 +79,51 @@ def extract_form_link_template(
if form != "":
word_entry.forms.append(Form(form=form, tags=["plural"]))
arg_name += 1


def extract_a_cmp_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract comparison forms from an expanded "a cmp" template.

    The top-level italic and bold nodes of the expansion are read in
    order. An italic node sets the raw tag that applies to the bold nodes
    after it. A bold node supplies a form, unless its text is empty or
    equals the current page title.
    """
    # https://it.wiktionary.org/wiki/Template:A_cmp
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(template_wikitext, expand_all=True)
    current_raw_tag = ""
    for node in expanded.find_child(NodeKind.ITALIC | NodeKind.BOLD):
        if node.kind == NodeKind.ITALIC:
            current_raw_tag = clean_node(wxr, None, node)
        elif node.kind == NodeKind.BOLD:
            form_str = clean_node(wxr, None, node)
            if form_str in ("", wxr.wtp.title):
                continue
            form = Form(form=form_str)
            if current_raw_tag != "":
                form.raw_tags.append(current_raw_tag)
                translate_raw_tags(form)
            word_entry.forms.append(form)


def extract_pn_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Follow the conjugation appendix link produced by a "pn" template.

    Nothing is done unless the template has a parameter named "c" or a
    parameter whose cleaned value is "c". Otherwise the template is
    expanded, and every link found directly inside a ``<small>`` element
    whose target starts with "Appendice:Coniugazioni/" is passed to
    ``extract_appendix_conjugation_page``.
    """
    # https://it.wiktionary.org/wiki/Template:Pn
    # any() short-circuits, so later parameters are not cleaned once a
    # matching key or value has been found.
    wants_conjugation = any(
        key == "c" or clean_node(wxr, None, value) == "c"
        for key, value in t_node.template_parameters.items()
    )
    if not wants_conjugation:
        return
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(template_wikitext, expand_all=True)
    for small_tag in expanded.find_html("small"):
        for link_node in small_tag.find_child(NodeKind.LINK):
            if not link_node.largs:
                continue
            link_target = clean_node(wxr, None, link_node.largs[0])
            if not link_target.startswith("Appendice:Coniugazioni/"):
                continue
            extract_appendix_conjugation_page(wxr, word_entry, link_target)
8 changes: 7 additions & 1 deletion src/wiktextract/extractor/it/tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,14 @@
"m e f": ["masculine", "feminine"],
}

# Italian labels used by form-line templates, mapped to their English tag.
# This mapping is merged into TAGS alongside TABLE_TAGS.
FORM_LINE_TEMPLATE_TAGS = {
# https://it.wiktionary.org/wiki/Template:A_cmp
"comparativo": "comparative",
"superlativo": "superlative",
}


TAGS = {**TABLE_TAGS}
TAGS = {**TABLE_TAGS, **FORM_LINE_TEMPLATE_TAGS}


def translate_raw_tags(data: WordEntry) -> None:
Expand Down
109 changes: 107 additions & 2 deletions tests/test_it_forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,10 @@ def test_linkp_template(self):

def test_it_decl_agg(self):
self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
self.wxr.wtp.add_page("Template:It-decl-agg4", 10, """{|
self.wxr.wtp.add_page(
"Template:It-decl-agg4",
10,
"""{|
|- align="center"
| &nbsp;
!bgcolor="#FFFFE0" color="#000"|&nbsp;''[[singolare]]''&nbsp;
Expand All @@ -69,7 +72,8 @@ def test_it_decl_agg(self):
!bgcolor="#FFFFE0" color="#000"|&nbsp;''[[maschile]]''&nbsp;
|&nbsp; [[libero]] &nbsp;
|&nbsp; [[liberi]] &nbsp;
|}""")
|}""",
)
data = parse_page(
self.wxr,
"libero",
Expand All @@ -83,3 +87,104 @@ def test_it_decl_agg(self):
data[0]["forms"],
[{"form": "liberi", "tags": ["positive", "masculine", "plural"]}],
)

def test_a_cmp(self):
    """Bold forms of "A cmp" take the tag of the preceding italic label."""
    self.wxr.wtp.add_page("Template:-en-", 10, "Inglese")
    a_cmp_expansion = "(''comparativo'' '''[[direr]]''', '''more dire''', ''superlativo'' '''[[direst]]''', '''most dire''')"
    self.wxr.wtp.add_page("Template:A cmp", 10, a_cmp_expansion)
    page_wikitext = """== {{-en-}} ==
===Aggettivo===
{{Pn}} {{A cmp|direr|c2=more dire|direst|s2=most dire}}
# [[sinistro]]"""
    entries = parse_page(self.wxr, "dire", page_wikitext)
    expected_forms = [
        {"form": form, "tags": [tag]}
        for form, tag in (
            ("direr", "comparative"),
            ("more dire", "comparative"),
            ("direst", "superlative"),
            ("most dire", "superlative"),
        )
    ]
    self.assertEqual(entries[0]["forms"], expected_forms)

def test_pn_template(self):
    """"Pn" with a "c" argument pulls forms from the linked appendix page.

    The fixture covers a form cell holding a nested table and a plain
    imperative row whose cell text is split on line breaks.
    """
    appendix_title = "Appendice:Coniugazioni/Italiano/dire"
    pn_expansion = "'''dire'''<small>&nbsp;([[Appendice:Coniugazioni/Italiano/dire|vai alla coniugazione]])</small>"
    it_conj_expansion = """{|
|-
|-
! colspan="1" rowspan="2" | persona
! colspan="3" | singolare
! colspan="3" | plurale
|-
! prima
|-
! indicativo
! io
|-
! passato prossimo
| <div>
{|
|-
| [[ho]] [[detto#Italiano|detto]]</br>[[sono]] [[detto#Italiano|detto]]
|}</div>
|-
! colspan="1" rowspan="2" | imperativo
! -
! tu
|-
|
|[[di’#Italiano|di’]],</br> non [[dire#Italiano|dire]]
|}"""
    page_wikitext = """== {{-it-}} ==
===Verbo===
{{Pn|c}} 3° coniugazione
# [[esternare]] ciò che si pensa parlando"""

    self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
    self.wxr.wtp.add_page("Template:Pn", 10, pn_expansion)
    self.wxr.wtp.add_page(
        "Appendice:Coniugazioni/Italiano/dire", 100, "{{It-conj}}"
    )
    self.wxr.wtp.add_page("Template:It-conj", 10, it_conj_expansion)

    entries = parse_page(self.wxr, "dire", page_wikitext)

    expected_forms = [
        {
            "form": "ho detto",
            "raw_tags": ["passato prossimo", "prima", "io"],
            "tags": ["singular"],
            "source": appendix_title,
        },
        {
            "form": "sono detto",
            "raw_tags": ["passato prossimo", "prima", "io"],
            "tags": ["singular"],
            "source": appendix_title,
        },
        {
            "form": "di’",
            "raw_tags": ["imperativo", "tu"],
            "source": appendix_title,
        },
        {
            "form": "non dire",
            "raw_tags": ["imperativo", "tu"],
            "source": appendix_title,
        },
    ]
    self.assertEqual(entries[0]["forms"], expected_forms)

0 comments on commit 85a9718

Please sign in to comment.