Skip to content

Commit

Permalink
[it] extract "it-conj" table template in appendix conjugation pages
Browse files Browse the repository at this point in the history
  • Loading branch information
xxyzz committed Dec 19, 2024
1 parent 6ad657c commit f54aa3c
Show file tree
Hide file tree
Showing 4 changed files with 256 additions and 2 deletions.
147 changes: 146 additions & 1 deletion src/wiktextract/extractor/it/inflection.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
from wikitextprocessor import NodeKind, TemplateNode
import re
from dataclasses import dataclass

from wikitextprocessor import NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
Expand Down Expand Up @@ -76,3 +79,145 @@ def extract_it_decl_agg_template(
translate_raw_tags(form)
word_entry.forms.append(form)
col_index += 1


def extract_appendix_conjugation_page(
    wxr: WiktextractContext, word_entry: WordEntry, page_title: str
) -> None:
    """Collect conjugated forms from an appendix conjugation page.

    The body of ``page_title`` is fetched with ``get_page_body(page_title,
    100)`` (100 is presumably the appendix namespace id), parsed, and every
    template that is a direct child of the parsed root and whose name is
    "it-conj" (compared case-insensitively) is passed on to
    ``extract_it_conj_template``.  Nothing is done when no page body is
    returned.

    See https://it.wiktionary.org/wiki/Appendice:Coniugazioni
    """
    body = wxr.wtp.get_page_body(page_title, 100)
    if body is not None:
        parsed_page = wxr.wtp.parse(body)
        for template in parsed_page.find_child(NodeKind.TEMPLATE):
            if template.template_name.lower() != "it-conj":
                continue
            extract_it_conj_template(wxr, word_entry, template, page_title)


@dataclass
class TableHeader:
    """A column header collected while walking a conjugation table.

    ``col_index`` and ``colspan`` describe the half-open column range
    ``[col_index, col_index + colspan)`` that the header text applies to.
    """

    text: str
    col_index: int
    colspan: int
    # The "it-conj" extraction passes 0 for both of the following fields.
    row_index: int
    rowspan: int


def extract_it_conj_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    page_title: str,
) -> None:
    """Extract forms from the table(s) produced by an "it-conj" template.

    The template node is turned back into wikitext and re-parsed with
    ``expand_all=True``; every table directly under the expanded root is
    then walked row by row.  Header cells update the running row/column
    header state, data cells are converted to forms through
    ``add_it_conj_form`` (or ``extract_it_conj_cell_table`` when the cell
    contains a nested table).

    ``page_title`` is forwarded so that it ends up as the ``source`` of
    each created form.
    """
    # https://it.wiktionary.org/wiki/Template:It-conj
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_node.find_child(NodeKind.TABLE):
        # Both pieces of header state survive from one row to the next
        # within the same table.
        col_headers = []
        row_header = ""
        for row in table.find_child(NodeKind.TABLE_ROW):
            # Column position restarts at 0 on every row.
            col_index = 0
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                match cell.kind:
                    case NodeKind.TABLE_HEADER_CELL:
                        header_str = clean_node(wxr, None, cell)
                        # These headers are ignored entirely: they are not
                        # recorded and do not advance col_index.
                        if header_str in ["persona", "indicativo"]:
                            continue
                        # Section headers: drop the column headers that
                        # were collected for the previous section.
                        elif header_str in ["condizionale", "congiuntivo"]:
                            col_headers.clear()
                            continue
                        # Same reset, but the section name is additionally
                        # kept as the row header for the following cells.
                        elif header_str == "imperativo":
                            col_headers.clear()
                            row_header = "imperativo"
                            continue

                        # A header cell sharing its row with data cells
                        # labels that row; otherwise it labels columns.
                        if row.contain_node(NodeKind.TABLE_CELL):
                            row_header = header_str
                        else:
                            # Fall back to a span of 1 when the attribute
                            # is missing or is not a plain number.
                            colspan = 1
                            colspan_str = cell.attrs.get("colspan", "1")
                            if re.fullmatch(r"\d+", colspan_str):
                                colspan = int(colspan_str)
                            # row_index and rowspan are not tracked here,
                            # so 0 is stored for both.
                            col_headers.append(
                                TableHeader(
                                    header_str, col_index, colspan, 0, 0
                                )
                            )
                            col_index += colspan
                    case NodeKind.TABLE_CELL:
                        # Cells that hold nested tables are delegated;
                        # the cell's own text is then not read.
                        cell_has_table = False
                        for cell_table in cell.find_child_recursively(
                            NodeKind.TABLE
                        ):
                            extract_it_conj_cell_table(
                                wxr,
                                word_entry,
                                cell_table,
                                row_header,
                                col_headers,
                                page_title,
                            )
                            cell_has_table = True
                        if not cell_has_table:
                            # One form per text line of the cell.
                            for form_str in clean_node(
                                wxr, None, cell
                            ).splitlines():
                                # Remove separator commas and spaces from
                                # both ends of the line.
                                form_str = form_str.strip(", ")
                                if form_str.startswith("verbo di "):
                                    continue  # first row
                                # Skip blank lines and the page's own title.
                                if form_str not in ["", wxr.wtp.title]:
                                    add_it_conj_form(
                                        word_entry,
                                        form_str,
                                        page_title,
                                        row_header,
                                        col_index,
                                        col_headers,
                                    )
                        # Every data cell occupies exactly one column.
                        col_index += 1


def extract_it_conj_cell_table(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    table_node: WikiNode,
    row_header: str,
    col_headers: list[TableHeader],
    page_title: str,
) -> None:
    """Extract forms from a table nested inside an "it-conj" table cell.

    Every data cell of every row in ``table_node`` is cleaned to text and
    split into lines; each non-empty line that differs from the current
    page title is added as a form via ``add_it_conj_form``.  The column
    index passed on is the cell's position within its own nested row, and
    ``row_header``/``col_headers`` are the header state of the enclosing
    table at the point where the nested table was found.
    """
    for row in table_node.find_child(NodeKind.TABLE_ROW):
        for col_index, cell in enumerate(row.find_child(NodeKind.TABLE_CELL)):
            for cell_line in clean_node(wxr, None, cell).splitlines():
                # Apply the same clean-up that extract_it_conj_template
                # uses for plain cells, so that separator commas and stray
                # spaces around a line never end up inside a form and
                # whitespace-only lines are dropped.
                form_str = cell_line.strip(", ")
                if form_str in ("", wxr.wtp.title):
                    continue
                add_it_conj_form(
                    word_entry,
                    form_str,
                    page_title,
                    row_header,
                    col_index,
                    col_headers,
                )


def add_it_conj_form(
    word_entry: WordEntry,
    form_str: str,
    page_title: str,
    row_header: str,
    col_index: int,
    col_headers: list[TableHeader],
) -> None:
    """Append one conjugated form, tagged with its table headers.

    The raw tags are the row header (when it is not empty) followed by the
    text of every column header whose column range contains ``col_index``,
    in the order the headers appear in ``col_headers``.  The raw tags are
    then run through ``translate_raw_tags`` and the form, with
    ``page_title`` as its ``source``, is appended to ``word_entry.forms``.
    """
    collected_tags = [row_header] if row_header != "" else []
    for header in col_headers:
        first_col = header.col_index
        if first_col <= col_index < first_col + header.colspan:
            collected_tags.append(header.text)

    new_form = Form(form=form_str, source=page_title)
    new_form.raw_tags.extend(collected_tags)
    translate_raw_tags(new_form)
    word_entry.forms.append(new_form)
1 change: 1 addition & 0 deletions src/wiktextract/extractor/it/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ class Form(ItalianBaseModel):
form: str = ""
tags: list[str] = []
raw_tags: list[str] = []
source: str = ""


class Sound(ItalianBaseModel):
Expand Down
34 changes: 33 additions & 1 deletion src/wiktextract/extractor/it/tag_form_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .inflection import extract_it_decl_agg_template, extract_tabs_template
from .inflection import (
extract_appendix_conjugation_page,
extract_it_decl_agg_template,
extract_tabs_template,
)
from .models import Form, WordEntry
from .tags import translate_raw_tags

Expand All @@ -23,6 +27,8 @@ def extract_tag_form_line_nodes(
extract_it_decl_agg_template(wxr, word_entry, node)
elif node.template_name.lower() == "a cmp":
extract_a_cmp_template(wxr, word_entry, node)
elif node.template_name.lower() == "pn":
extract_pn_template(wxr, word_entry, node)


ITALIC_TAGS = {
Expand Down Expand Up @@ -95,3 +101,29 @@ def extract_a_cmp_template(
form.raw_tags.append(raw_tag)
translate_raw_tags(form)
word_entry.forms.append(form)


def extract_pn_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Follow the conjugation link emitted by a "Pn" template.

    Nothing is done unless the template has an argument whose key is "c"
    or whose cleaned value is "c".  Otherwise the template is expanded,
    and for every link found directly inside a ``<small>`` element whose
    target starts with "Appendice:Coniugazioni/", the linked appendix page
    is processed with ``extract_appendix_conjugation_page``.

    See https://it.wiktionary.org/wiki/Template:Pn
    """
    wants_conjugation = any(
        key == "c" or clean_node(wxr, None, value) == "c"
        for key, value in t_node.template_parameters.items()
    )
    if not wants_conjugation:
        return

    template_text = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(template_text, expand_all=True)
    for small in expanded.find_html("small"):
        for link in small.find_child(NodeKind.LINK):
            if len(link.largs) == 0:
                continue
            target = clean_node(wxr, None, link.largs[0])
            if target.startswith("Appendice:Coniugazioni/"):
                extract_appendix_conjugation_page(wxr, word_entry, target)
76 changes: 76 additions & 0 deletions tests/test_it_forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,3 +112,79 @@ def test_a_cmp(self):
{"form": "most dire", "tags": ["superlative"]},
],
)

def test_pn_template(self):
    """Forms are pulled from the appendix page linked by ``{{Pn|c}}``.

    Covers a nested table inside a data cell (two lines, two forms) and
    an "imperativo" section whose cell lines carry a trailing comma.
    """
    appendix_title = "Appendice:Coniugazioni/Italiano/dire"
    wtp = self.wxr.wtp

    # Fixture pages: language heading, the Pn template, the appendix page
    # and the conjugation table template it transcludes.
    wtp.add_page("Template:-it-", 10, "Italiano")
    wtp.add_page(
        "Template:Pn",
        10,
        "'''dire'''<small>&nbsp;([[Appendice:Coniugazioni/Italiano/dire|vai alla coniugazione]])</small>",
    )
    wtp.add_page(appendix_title, 100, "{{It-conj}}")
    wtp.add_page(
        "Template:It-conj",
        10,
        """{|
|-
|-
! colspan="1" rowspan="2" | persona
! colspan="3" | singolare
! colspan="3" | plurale
|-
! prima
|-
! indicativo
! io
|-
! passato prossimo
| <div>
{|
|-
| [[ho]] [[detto#Italiano|detto]]</br>[[sono]] [[detto#Italiano|detto]]
|}</div>
|-
! colspan="1" rowspan="2" | imperativo
! -
! tu
|-
|
|[[di’#Italiano|di’]],</br> non [[dire#Italiano|dire]]
|}""",
    )

    page_data = parse_page(
        self.wxr,
        "dire",
        """== {{-it-}} ==
===Verbo===
{{Pn|c}} 3° coniugazione
# [[esternare]] ciò che si pensa parlando""",
    )

    compound_raw_tags = ["passato prossimo", "prima", "io"]
    imperative_raw_tags = ["imperativo", "tu"]
    expected_forms = [
        {
            "form": "ho detto",
            "raw_tags": compound_raw_tags,
            "tags": ["singular"],
            "source": appendix_title,
        },
        {
            "form": "sono detto",
            "raw_tags": compound_raw_tags,
            "tags": ["singular"],
            "source": appendix_title,
        },
        {
            "form": "di’",
            "raw_tags": imperative_raw_tags,
            "source": appendix_title,
        },
        {
            "form": "non dire",
            "raw_tags": imperative_raw_tags,
            "source": appendix_title,
        },
    ]
    self.assertEqual(page_data[0]["forms"], expected_forms)

0 comments on commit f54aa3c

Please sign in to comment.