Skip to content

Commit

Permalink
[it] extract "It-decl-agg*" table templates
Browse files Browse the repository at this point in the history
  • Loading branch information
xxyzz committed Dec 18, 2024
1 parent 2c2ee8f commit febee0a
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 2 deletions.
56 changes: 55 additions & 1 deletion src/wiktextract/extractor/it/inflection.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from wikitextprocessor import TemplateNode
from wikitextprocessor import NodeKind, TemplateNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Form, WordEntry
from .tags import translate_raw_tags


def extract_tabs_template(
Expand All @@ -22,3 +23,56 @@ def extract_tabs_template(
if arg_value not in ["", wxr.wtp.title]:
form = Form(form=arg_value, tags=tags[arg_name - 1])
word_entry.forms.append(form)


def extract_it_decl_agg_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Append forms found in an expanded "It-decl-agg*" template table.

    The template node is converted back to wikitext, re-parsed with full
    expansion, and every table in the result is walked row by row.

    Header cells are classified as follows:

    * a header with a non-empty ``colspan`` attribute sets a tag that
      stays in effect for the rest of that table;
    * a header that is the only header cell in its row sets a tag for that
      row only;
    * any other header with non-empty text is recorded, in order, as a
      column tag for the table.

    Each data cell whose cleaned text is neither empty nor equal to
    ``wxr.wtp.title`` becomes a ``Form`` carrying the applicable tags as
    raw tags; the form is passed to ``translate_raw_tags`` and appended to
    ``word_entry.forms``.

    Args:
        wxr: Context providing the wikitext parser and the page title.
        word_entry: Entry whose ``forms`` list is extended in place.
        t_node: The "It-decl-agg*" template node to expand.
    """
    # https://it.wiktionary.org/wiki/Template:It-decl-agg4
    # https://it.wiktionary.org/wiki/Template:It-decl-agg2
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(template_wikitext, expand_all=True)
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL

    for table_node in root.find_child(NodeKind.TABLE):
        section_tag = ""  # text of the latest header carrying "colspan"
        column_tags = []  # column header texts, in encounter order
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            row_label = ""
            data_col = 0  # counts data cells seen so far in this row
            for cell_node in row_node.find_child(cell_kinds):
                if cell_node.kind == NodeKind.TABLE_HEADER_CELL:
                    if cell_node.attrs.get("colspan", "") != "":
                        section_tag = clean_node(wxr, None, cell_node)
                    else:
                        header_total = sum(
                            1
                            for _ in row_node.find_child(
                                NodeKind.TABLE_HEADER_CELL
                            )
                        )
                        header_text = clean_node(wxr, None, cell_node)
                        if header_total == 1:
                            # Lone header in the row: it labels the row.
                            row_label = header_text
                        elif header_text != "":
                            column_tags.append(header_text)
                elif cell_node.kind == NodeKind.TABLE_CELL:
                    # Skipped cells still occupy a column, so the index
                    # advances for every data cell.
                    position = data_col
                    data_col += 1
                    cell_text = clean_node(wxr, None, cell_node)
                    if cell_text in ("", wxr.wtp.title):
                        continue
                    form = Form(form=cell_text)
                    labels = [section_tag, row_label]
                    if position < len(column_tags):
                        labels.append(column_tags[position])
                    for label in labels:
                        if label != "":
                            form.raw_tags.append(label)
                    translate_raw_tags(form)
                    word_entry.forms.append(form)
1 change: 1 addition & 0 deletions src/wiktextract/extractor/it/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ def parse_page(
) -> list[dict[str, Any]]:
# page layout
# https://it.wiktionary.org/wiki/Wikizionario:Manuale_di_stile
# https://it.wiktionary.org/wiki/Aiuto:Come_iniziare_una_pagina
wxr.wtp.start_page(page_title)
tree = wxr.wtp.parse(page_text, pre_expand=True)
page_data: list[WordEntry] = []
Expand Down
4 changes: 3 additions & 1 deletion src/wiktextract/extractor/it/tag_form_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .inflection import extract_tabs_template
from .inflection import extract_it_decl_agg_template, extract_tabs_template
from .models import Form, WordEntry


Expand All @@ -18,6 +18,8 @@ def extract_tag_form_line_nodes(
extract_tabs_template(wxr, word_entry, node)
elif node.template_name.lower() in FORM_LINK_TEMPLATES.keys():
extract_form_link_template(wxr, word_entry, node)
elif node.template_name.lower().startswith("it-decl-agg"):
extract_it_decl_agg_template(wxr, word_entry, node)


ITALIC_TAGS = {
Expand Down
28 changes: 28 additions & 0 deletions tests/test_it_forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,3 +55,31 @@ def test_linkp_template(self):
[{"form": "cagne", "tags": ["plural"]}],
)
self.assertEqual(data[0]["tags"], ["feminine", "singular"])

def test_it_decl_agg(self):
    # Check the forms extracted from an "It-decl-agg4" declension table.
    # Stub pages for the two templates the test page uses (namespace id 10).
    self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
    decl_table = """{|
|- align="center"
| &nbsp;
!bgcolor="#FFFFE0" color="#000"|&nbsp;''[[singolare]]''&nbsp;
!bgcolor="#FFFFE0" color="#000"|&nbsp;''[[plurale]]''&nbsp;
|- align="center"
!bgcolor="#FFFFE0" color="#000" colspan="3"|&nbsp;''[[positivo]]''&nbsp;
|- align="center"
!bgcolor="#FFFFE0" color="#000"|&nbsp;''[[maschile]]''&nbsp;
|&nbsp; [[libero]] &nbsp;
|&nbsp; [[liberi]] &nbsp;
|}"""
    self.wxr.wtp.add_page("Template:It-decl-agg4", 10, decl_table)

    page_text = """== {{-it-}} ==
===Aggettivo===
{{It-decl-agg4|liber}}
{{Pn|w}} ''m sing''
# non [[imprigionato]] o in [[schiavitù]]"""
    data = parse_page(self.wxr, "libero", page_text)

    # Only "liberi" is expected; "libero" is the page title itself.
    expected_forms = [
        {"form": "liberi", "tags": ["positive", "masculine", "plural"]}
    ]
    self.assertEqual(data[0]["forms"], expected_forms)

0 comments on commit febee0a

Please sign in to comment.