[it] extract "it-conj" table template in appendix conjugation pages

tatuylonen · Dec 19, 2024 · f54aa3c · f54aa3c
1 parent 6ad657c
commit f54aa3c
Show file tree

Hide file tree

Showing 4 changed files with 256 additions and 2 deletions.
diff --git a/src/wiktextract/extractor/it/inflection.py b/src/wiktextract/extractor/it/inflection.py
@@ -1,4 +1,7 @@
-from wikitextprocessor import NodeKind, TemplateNode
+import re
+from dataclasses import dataclass
+
+from wikitextprocessor import NodeKind, TemplateNode, WikiNode
 
 from ...page import clean_node
 from ...wxr_context import WiktextractContext
@@ -76,3 +79,145 @@ def extract_it_decl_agg_template(
                             translate_raw_tags(form)
                             word_entry.forms.append(form)
                         col_index += 1
+
+
+def extract_appendix_conjugation_page(
+    wxr: WiktextractContext, word_entry: WordEntry, page_title: str
+) -> None:
+    """Collect conjugation forms from the page named ``page_title``.
+
+    The page body is fetched with 100 as the second argument of
+    ``get_page_body`` (presumably the appendix namespace id - confirm
+    against the wikitextprocessor API), parsed, and every "it-conj"
+    template among the root's children (name compared case-insensitively)
+    is passed to ``extract_it_conj_template``.  Nothing happens when the
+    page body is ``None``.
+    """
+    # https://it.wiktionary.org/wiki/Appendice:Coniugazioni
+    body = wxr.wtp.get_page_body(page_title, 100)
+    if body is not None:
+        parsed_page = wxr.wtp.parse(body)
+        for template in parsed_page.find_child(NodeKind.TEMPLATE):
+            if template.template_name.lower() != "it-conj":
+                continue
+            extract_it_conj_template(wxr, word_entry, template, page_title)
+
+
+@dataclass
+class TableHeader:
+    """Column header collected while walking a conjugation table.
+
+    ``add_it_conj_form`` adds ``text`` as a raw tag to every form whose
+    column index lies in ``[col_index, col_index + colspan)``.
+    """
+
+    text: str  # cleaned header text, later used as a raw tag
+    col_index: int  # first column covered by this header
+    colspan: int  # number of columns covered, starting at col_index
+    # NOTE(review): the only constructor call visible in this diff passes 0
+    # for both fields below, and no visible code reads them.
+    row_index: int
+    rowspan: int
+
+
+def extract_it_conj_template(
+    wxr: WiktextractContext,
+    word_entry: WordEntry,
+    t_node: TemplateNode,
+    page_title: str,
+) -> None:
+    """Extract forms from the tables of an expanded "it-conj" template.
+
+    The template is turned back into wikitext and re-parsed with
+    ``expand_all=True``.  For every table found, the rows are walked while
+    two pieces of state are kept: the list of column headers and the
+    current row header.
+
+    Header cells:
+      * "persona" and "indicativo" are ignored;
+      * "condizionale" and "congiuntivo" discard the column headers;
+      * "imperativo" discards the column headers and becomes the row
+        header;
+      * any other header in a row that also contains data cells becomes
+        the row header;
+      * any other header is stored as a column header with its colspan
+        (1 when the attribute is missing or not made of digits only).
+
+    Data cells: tables nested in the cell are delegated to
+    ``extract_it_conj_cell_table``; otherwise each line of the cleaned cell
+    text, stripped of ", " characters, is added as a form unless it starts
+    with "verbo di ", is empty, or equals ``wxr.wtp.title``.
+    """
+    # https://it.wiktionary.org/wiki/Template:It-conj
+    wikitext = wxr.wtp.node_to_wikitext(t_node)
+    expanded_node = wxr.wtp.parse(wikitext, expand_all=True)
+    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
+    for table in expanded_node.find_child(NodeKind.TABLE):
+        col_headers = []
+        row_header = ""
+        for row in table.find_child(NodeKind.TABLE_ROW):
+            # Column counting restarts on every row.
+            col_index = 0
+            for cell in row.find_child(cell_kinds):
+                if cell.kind == NodeKind.TABLE_HEADER_CELL:
+                    header_str = clean_node(wxr, None, cell)
+                    if header_str in ("persona", "indicativo"):
+                        continue
+                    if header_str in ("condizionale", "congiuntivo"):
+                        col_headers.clear()
+                        continue
+                    if header_str == "imperativo":
+                        col_headers.clear()
+                        row_header = "imperativo"
+                        continue
+                    if row.contain_node(NodeKind.TABLE_CELL):
+                        # Header sitting next to data cells labels the row.
+                        row_header = header_str
+                        continue
+                    colspan_str = cell.attrs.get("colspan", "1")
+                    colspan = (
+                        int(colspan_str)
+                        if re.fullmatch(r"\d+", colspan_str)
+                        else 1
+                    )
+                    col_headers.append(
+                        TableHeader(header_str, col_index, colspan, 0, 0)
+                    )
+                    col_index += colspan
+                elif cell.kind == NodeKind.TABLE_CELL:
+                    nested_tables = 0
+                    for cell_table in cell.find_child_recursively(
+                        NodeKind.TABLE
+                    ):
+                        extract_it_conj_cell_table(
+                            wxr,
+                            word_entry,
+                            cell_table,
+                            row_header,
+                            col_headers,
+                            page_title,
+                        )
+                        nested_tables += 1
+                    if nested_tables == 0:
+                        cell_text = clean_node(wxr, None, cell)
+                        for line in cell_text.splitlines():
+                            form_str = line.strip(", ")
+                            if form_str.startswith("verbo di "):
+                                continue  # first row
+                            if form_str in ("", wxr.wtp.title):
+                                continue
+                            add_it_conj_form(
+                                word_entry,
+                                form_str,
+                                page_title,
+                                row_header,
+                                col_index,
+                                col_headers,
+                            )
+                    col_index += 1
+
+
+def extract_it_conj_cell_table(
+    wxr: WiktextractContext,
+    word_entry: WordEntry,
+    table_node: WikiNode,
+    row_header: str,
+    col_headers: list[TableHeader],
+    page_title: str,
+) -> None:
+    """Add the forms found in a table nested inside a conjugation cell.
+
+    Every data cell of every row is cleaned and split into lines; each
+    line that is neither empty nor equal to ``wxr.wtp.title`` is passed to
+    ``add_it_conj_form``.  The column index handed over is the position of
+    the cell among the data cells of its own row in ``table_node``.
+    """
+    for row in table_node.find_child(NodeKind.TABLE_ROW):
+        data_cells = row.find_child(NodeKind.TABLE_CELL)
+        for col_index, cell in enumerate(data_cells):
+            cell_text = clean_node(wxr, None, cell)
+            for cell_str in cell_text.splitlines():
+                if cell_str == "" or cell_str == wxr.wtp.title:
+                    continue
+                add_it_conj_form(
+                    word_entry,
+                    cell_str,
+                    page_title,
+                    row_header,
+                    col_index,
+                    col_headers,
+                )
+
+
+def add_it_conj_form(
+    word_entry: WordEntry,
+    form_str: str,
+    page_title: str,
+    row_header: str,
+    col_index: int,
+    col_headers: list[TableHeader],
+) -> None:
+    """Append one conjugated form to ``word_entry.forms``.
+
+    The new ``Form`` records ``page_title`` as its source.  Its raw tags
+    are the row header (when not empty) followed by the text of every
+    column header whose span ``[col_index, col_index + colspan)`` contains
+    the given ``col_index``.  ``translate_raw_tags`` is applied before the
+    form is appended.
+    """
+    form = Form(form=form_str, source=page_title)
+    if row_header != "":
+        form.raw_tags.append(row_header)
+    form.raw_tags.extend(
+        header.text
+        for header in col_headers
+        if header.col_index <= col_index < header.col_index + header.colspan
+    )
+    translate_raw_tags(form)
+    word_entry.forms.append(form)
diff --git a/src/wiktextract/extractor/it/models.py b/src/wiktextract/extractor/it/models.py
@@ -47,6 +47,7 @@ class Form(ItalianBaseModel):
     form: str = ""
     tags: list[str] = []
     raw_tags: list[str] = []
+    source: str = ""
 
 
 class Sound(ItalianBaseModel):

diff --git a/src/wiktextract/extractor/it/tag_form_line.py b/src/wiktextract/extractor/it/tag_form_line.py
@@ -2,7 +2,11 @@
 
 from ...page import clean_node
 from ...wxr_context import WiktextractContext
-from .inflection import extract_it_decl_agg_template, extract_tabs_template
+from .inflection import (
+    extract_appendix_conjugation_page,
+    extract_it_decl_agg_template,
+    extract_tabs_template,
+)
 from .models import Form, WordEntry
 from .tags import translate_raw_tags
 
@@ -23,6 +27,8 @@ def extract_tag_form_line_nodes(
                 extract_it_decl_agg_template(wxr, word_entry, node)
             elif node.template_name.lower() == "a cmp":
                 extract_a_cmp_template(wxr, word_entry, node)
+            elif node.template_name.lower() == "pn":
+                extract_pn_template(wxr, word_entry, node)
 
 
 ITALIC_TAGS = {
@@ -95,3 +101,29 @@ def extract_a_cmp_template(
                         form.raw_tags.append(raw_tag)
                     translate_raw_tags(form)
                     word_entry.forms.append(form)
+
+
+def extract_pn_template(
+    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
+) -> None:
+    """Follow the conjugation link rendered by a "Pn" template.
+
+    Only templates that have a parameter named "c", or a parameter whose
+    cleaned value is "c", are processed.  The template is expanded, and
+    for every link inside a ``<small>`` element whose first argument
+    starts with "Appendice:Coniugazioni/", the linked page is passed to
+    ``extract_appendix_conjugation_page``.
+    """
+    # https://it.wiktionary.org/wiki/Template:Pn
+    # `or` short-circuits, so a value is only cleaned when its key is not
+    # "c"; `any` stops at the first matching parameter.
+    has_c_arg = any(
+        key == "c" or clean_node(wxr, None, value) == "c"
+        for key, value in t_node.template_parameters.items()
+    )
+    if not has_c_arg:
+        return
+    wikitext = wxr.wtp.node_to_wikitext(t_node)
+    expanded_node = wxr.wtp.parse(wikitext, expand_all=True)
+    for small_tag in expanded_node.find_html("small"):
+        for link_node in small_tag.find_child(NodeKind.LINK):
+            if len(link_node.largs) == 0:
+                continue
+            link_str = clean_node(wxr, None, link_node.largs[0])
+            if link_str.startswith("Appendice:Coniugazioni/"):
+                extract_appendix_conjugation_page(wxr, word_entry, link_str)
diff --git a/tests/test_it_forms.py b/tests/test_it_forms.py
@@ -112,3 +112,79 @@ def test_a_cmp(self):
                 {"form": "most dire", "tags": ["superlative"]},
             ],
         )
+
+    def test_pn_template(self):
+        """Forms are pulled from the appendix page linked by ``{{Pn|c}}``.
+
+        The fixture registers a stub "Pn" template that links to an
+        appendix conjugation page, the appendix page itself (containing
+        only ``{{It-conj}}``) and a small "It-conj" table.  The expected
+        forms carry the appendix page title in their ``source`` field.
+        """
+        self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
+        # Stub expansion of Pn: bold title plus the link to the appendix.
+        self.wxr.wtp.add_page(
+            "Template:Pn",
+            10,
+            "'''dire'''<small>&nbsp;([[Appendice:Coniugazioni/Italiano/dire|vai alla coniugazione]])</small>",
+        )
+        self.wxr.wtp.add_page(
+            "Appendice:Coniugazioni/Italiano/dire", 100, "{{It-conj}}"
+        )
+        # Reduced conjugation table: one row with a nested table and one
+        # "imperativo" section with plain cells.
+        self.wxr.wtp.add_page(
+            "Template:It-conj",
+            10,
+            """{|
+|-
+|-
+! colspan="1" rowspan="2" | persona
+! colspan="3" | singolare
+! colspan="3" | plurale
+|-
+! prima
+|-
+! indicativo
+! io
+|-
+! passato prossimo
+| <div>
+  {|
+  |-
+  | [[ho]] [[detto#Italiano|detto]]</br>[[sono]] [[detto#Italiano|detto]]
+  |}</div>
+|-
+! colspan="1" rowspan="2" | imperativo
+! -
+! tu
+|-
+|
+|[[di’#Italiano|di’]],</br> non [[dire#Italiano|dire]]
+|}""",
+        )
+        data = parse_page(
+            self.wxr,
+            "dire",
+            """== {{-it-}} ==
+===Verbo===
+{{Pn|c}} 3° coniugazione
+# [[esternare]] ciò che si pensa parlando""",
+        )
+        self.assertEqual(
+            data[0]["forms"],
+            [
+                {
+                    "form": "ho detto",
+                    "raw_tags": ["passato prossimo", "prima", "io"],
+                    "tags": ["singular"],
+                    "source": "Appendice:Coniugazioni/Italiano/dire",
+                },
+                {
+                    "form": "sono detto",
+                    "raw_tags": ["passato prossimo", "prima", "io"],
+                    "tags": ["singular"],
+                    "source": "Appendice:Coniugazioni/Italiano/dire",
+                },
+                {
+                    "form": "di’",
+                    "raw_tags": ["imperativo", "tu"],
+                    "source": "Appendice:Coniugazioni/Italiano/dire",
+                },
+                {
+                    "form": "non dire",
+                    "raw_tags": ["imperativo", "tu"],
+                    "source": "Appendice:Coniugazioni/Italiano/dire",
+                },
+            ],
+        )