From 6ad657c1f26944292955eb9482d42f9c76e4c619 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Thu, 19 Dec 2024 11:16:35 +0800 Subject: [PATCH 1/2] [it] extract "A cmp" adj forms template --- src/wiktextract/extractor/it/tag_form_line.py | 27 ++++++++++++++- src/wiktextract/extractor/it/tags.py | 8 ++++- tests/test_it_forms.py | 33 +++++++++++++++++-- 3 files changed, 64 insertions(+), 4 deletions(-) diff --git a/src/wiktextract/extractor/it/tag_form_line.py b/src/wiktextract/extractor/it/tag_form_line.py index b7e1da16..71b2767b 100644 --- a/src/wiktextract/extractor/it/tag_form_line.py +++ b/src/wiktextract/extractor/it/tag_form_line.py @@ -4,6 +4,7 @@ from ...wxr_context import WiktextractContext from .inflection import extract_it_decl_agg_template, extract_tabs_template from .models import Form, WordEntry +from .tags import translate_raw_tags def extract_tag_form_line_nodes( @@ -16,10 +17,12 @@ def extract_tag_form_line_nodes( elif isinstance(node, TemplateNode): if node.template_name.lower() == "tabs": extract_tabs_template(wxr, word_entry, node) - elif node.template_name.lower() in FORM_LINK_TEMPLATES.keys(): + elif node.template_name.lower() in FORM_LINK_TEMPLATES: extract_form_link_template(wxr, word_entry, node) elif node.template_name.lower().startswith("it-decl-agg"): extract_it_decl_agg_template(wxr, word_entry, node) + elif node.template_name.lower() == "a cmp": + extract_a_cmp_template(wxr, word_entry, node) ITALIC_TAGS = { @@ -70,3 +73,25 @@ def extract_form_link_template( if form != "": word_entry.forms.append(Form(form=form, tags=["plural"])) arg_name += 1 + + +def extract_a_cmp_template( + wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode +) -> None: + # https://it.wiktionary.org/wiki/Template:A_cmp + expanded_node = wxr.wtp.parse( + wxr.wtp.node_to_wikitext(t_node), expand_all=True + ) + raw_tag = "" + for node in expanded_node.find_child(NodeKind.ITALIC | NodeKind.BOLD): + match node.kind: + case NodeKind.ITALIC: + raw_tag = clean_node(wxr, None, node) + case NodeKind.BOLD: + form_str = clean_node(wxr, None, node) + if form_str not in ["", wxr.wtp.title]: + form = Form(form=form_str) + if raw_tag != "": + form.raw_tags.append(raw_tag) + translate_raw_tags(form) + word_entry.forms.append(form) diff --git a/src/wiktextract/extractor/it/tags.py b/src/wiktextract/extractor/it/tags.py index 036ae1f9..1d9d5939 100644 --- a/src/wiktextract/extractor/it/tags.py +++ b/src/wiktextract/extractor/it/tags.py @@ -12,8 +12,14 @@ "m e f": ["masculine", "feminine"], } +FORM_LINE_TEMPLATE_TAGS = { + # https://it.wiktionary.org/wiki/Template:A_cmp + "comparativo": "comparative", + "superlativo": "superlative", +} + -TAGS = {**TABLE_TAGS} +TAGS = {**TABLE_TAGS, **FORM_LINE_TEMPLATE_TAGS} def translate_raw_tags(data: WordEntry) -> None: diff --git a/tests/test_it_forms.py b/tests/test_it_forms.py index 8ebeddf9..75a572ea 100644 --- a/tests/test_it_forms.py +++ b/tests/test_it_forms.py @@ -58,7 +58,10 @@ def test_linkp_template(self): def test_it_decl_agg(self): self.wxr.wtp.add_page("Template:-it-", 10, "Italiano") - self.wxr.wtp.add_page("Template:It-decl-agg4", 10, """{| + self.wxr.wtp.add_page( + "Template:It-decl-agg4", + 10, + """{| |- align="center" |   !bgcolor="#FFFFE0" color="#000"| ''[[singolare]]''  @@ -69,7 +72,8 @@ def test_it_decl_agg(self): !bgcolor="#FFFFE0" color="#000"| ''[[maschile]]''  |  [[libero]]   |  [[liberi]]   -|}""") +|}""", + ) data = parse_page( self.wxr, "libero", @@ -83,3 +87,28 @@ def test_it_decl_agg(self): data[0]["forms"], [{"form": "liberi", "tags": ["positive", "masculine", "plural"]}], ) + + def test_a_cmp(self): + self.wxr.wtp.add_page("Template:-en-", 10, "Inglese") + self.wxr.wtp.add_page( + "Template:A cmp", + 10, + "(''comparativo'' '''[[direr]]''', '''more dire''', ''superlativo'' '''[[direst]]''', '''most dire''')", + ) + data = parse_page( + self.wxr, + "dire", + """== {{-en-}} == +===Aggettivo=== +{{Pn}} {{A cmp|direr|c2=more dire|direst|s2=most dire}} +# [[sinistro]]""", + ) + self.assertEqual( + data[0]["forms"], + [ + {"form": "direr", "tags": ["comparative"]}, + {"form": "more dire", "tags": ["comparative"]}, + {"form": "direst", "tags": ["superlative"]}, + {"form": "most dire", "tags": ["superlative"]}, + ], + ) From f54aa3c33d4c54766447c35f0cabc6fd4fb78e81 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Thu, 19 Dec 2024 17:06:55 +0800 Subject: [PATCH 2/2] [it] extract "it-conj" table template in appendix conjugation pages --- src/wiktextract/extractor/it/inflection.py | 147 +++++++++++++++++- src/wiktextract/extractor/it/models.py | 1 + src/wiktextract/extractor/it/tag_form_line.py | 34 +++- tests/test_it_forms.py | 76 +++++++++ 4 files changed, 256 insertions(+), 2 deletions(-) diff --git a/src/wiktextract/extractor/it/inflection.py b/src/wiktextract/extractor/it/inflection.py index 768a5c86..e84621a6 100644 --- a/src/wiktextract/extractor/it/inflection.py +++ b/src/wiktextract/extractor/it/inflection.py @@ -1,4 +1,7 @@ -from wikitextprocessor import NodeKind, TemplateNode +import re +from dataclasses import dataclass + +from wikitextprocessor import NodeKind, TemplateNode, WikiNode from ...page import clean_node from ...wxr_context import WiktextractContext @@ -76,3 +79,145 @@ def extract_it_decl_agg_template( translate_raw_tags(form) word_entry.forms.append(form) col_index += 1 + + +def extract_appendix_conjugation_page( + wxr: WiktextractContext, word_entry: WordEntry, page_title: str +) -> None: + # https://it.wiktionary.org/wiki/Appendice:Coniugazioni + page_text = wxr.wtp.get_page_body(page_title, 100) + if page_text is None: + return + root = wxr.wtp.parse(page_text) + for t_node in root.find_child(NodeKind.TEMPLATE): + if t_node.template_name.lower() == "it-conj": + extract_it_conj_template(wxr, word_entry, t_node, page_title) + + +@dataclass +class TableHeader: + text: str + col_index: int + colspan: int + row_index: int + rowspan: int + + +def extract_it_conj_template( + wxr: WiktextractContext, + word_entry: WordEntry, + t_node: TemplateNode, + page_title: str, +) -> None: + # https://it.wiktionary.org/wiki/Template:It-conj + expanded_node = wxr.wtp.parse( + wxr.wtp.node_to_wikitext(t_node), expand_all=True + ) + for table in expanded_node.find_child(NodeKind.TABLE): + col_headers = [] + row_header = "" + for row in table.find_child(NodeKind.TABLE_ROW): + col_index = 0 + for cell in row.find_child( + NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL + ): + match cell.kind: + case NodeKind.TABLE_HEADER_CELL: + header_str = clean_node(wxr, None, cell) + if header_str in ["persona", "indicativo"]: + continue + elif header_str in ["condizionale", "congiuntivo"]: + col_headers.clear() + continue + elif header_str == "imperativo": + col_headers.clear() + row_header = "imperativo" + continue + + if row.contain_node(NodeKind.TABLE_CELL): + row_header = header_str + else: + colspan = 1 + colspan_str = cell.attrs.get("colspan", "1") + if re.fullmatch(r"\d+", colspan_str): + colspan = int(colspan_str) + col_headers.append( + TableHeader( + header_str, col_index, colspan, 0, 0 + ) + ) + col_index += colspan + case NodeKind.TABLE_CELL: + cell_has_table = False + for cell_table in cell.find_child_recursively( + NodeKind.TABLE + ): + extract_it_conj_cell_table( + wxr, + word_entry, + cell_table, + row_header, + col_headers, + page_title, + ) + cell_has_table = True + if not cell_has_table: + for form_str in clean_node( + wxr, None, cell + ).splitlines(): + form_str = form_str.strip(", ") + if form_str.startswith("verbo di "): + continue # first row + if form_str not in ["", wxr.wtp.title]: + add_it_conj_form( + word_entry, + form_str, + page_title, + row_header, + col_index, + col_headers, + ) + col_index += 1 + + +def extract_it_conj_cell_table( + wxr: WiktextractContext, + word_entry: WordEntry, + table_node: WikiNode, + row_header: str, + col_headers: list[TableHeader], + page_title: str, +) -> None: + for row in table_node.find_child(NodeKind.TABLE_ROW): + for col_index, cell in enumerate(row.find_child(NodeKind.TABLE_CELL)): + for cell_str in clean_node(wxr, None, cell).splitlines(): + if cell_str not in ["", wxr.wtp.title]: + add_it_conj_form( + word_entry, + cell_str, + page_title, + row_header, + col_index, + col_headers, + ) + + +def add_it_conj_form( + word_entry: WordEntry, + form_str: str, + page_title: str, + row_header: str, + col_index: int, + col_headers: list[TableHeader], +) -> None: + form = Form(form=form_str, source=page_title) + if row_header != "": + form.raw_tags.append(row_header) + for col_header in col_headers: + if ( + col_index >= col_header.col_index + and col_index < col_header.col_index + col_header.colspan + ): + form.raw_tags.append(col_header.text) + translate_raw_tags(form) + word_entry.forms.append(form) diff --git a/src/wiktextract/extractor/it/models.py b/src/wiktextract/extractor/it/models.py index 44cf2f7f..71963949 100644 --- a/src/wiktextract/extractor/it/models.py +++ b/src/wiktextract/extractor/it/models.py @@ -47,6 +47,7 @@ class Form(ItalianBaseModel): form: str = "" tags: list[str] = [] raw_tags: list[str] = [] + source: str = "" class Sound(ItalianBaseModel): diff --git a/src/wiktextract/extractor/it/tag_form_line.py b/src/wiktextract/extractor/it/tag_form_line.py index 71b2767b..331640f1 100644 --- a/src/wiktextract/extractor/it/tag_form_line.py +++ b/src/wiktextract/extractor/it/tag_form_line.py @@ -2,7 +2,11 @@ from ...page import clean_node from ...wxr_context import WiktextractContext -from .inflection import extract_it_decl_agg_template, extract_tabs_template +from .inflection import ( + extract_appendix_conjugation_page, + extract_it_decl_agg_template, + extract_tabs_template, +) from .models import Form, WordEntry from .tags import translate_raw_tags @@ -23,6 +27,8 @@ def extract_tag_form_line_nodes( extract_it_decl_agg_template(wxr, word_entry, node) elif node.template_name.lower() == "a cmp": extract_a_cmp_template(wxr, word_entry, node) + elif node.template_name.lower() == "pn": + extract_pn_template(wxr, word_entry, node) ITALIC_TAGS = { @@ -95,3 +101,29 @@ def extract_a_cmp_template( form.raw_tags.append(raw_tag) translate_raw_tags(form) word_entry.forms.append(form) + + +def extract_pn_template( + wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode +) -> None: + # https://it.wiktionary.org/wiki/Template:Pn + has_c_arg = False + for arg_key, arg_value in t_node.template_parameters.items(): + if arg_key == "c": + has_c_arg = True + break + arg_value_str = clean_node(wxr, None, arg_value) + if arg_value_str == "c": + has_c_arg = True + break + if not has_c_arg: + return + expanded_node = wxr.wtp.parse( + wxr.wtp.node_to_wikitext(t_node), expand_all=True + ) + for small_tag in expanded_node.find_html("small"): + for link_node in small_tag.find_child(NodeKind.LINK): + if len(link_node.largs) > 0: + link_str = clean_node(wxr, None, link_node.largs[0]) + if link_str.startswith("Appendice:Coniugazioni/"): + extract_appendix_conjugation_page(wxr, word_entry, link_str) diff --git a/tests/test_it_forms.py b/tests/test_it_forms.py index 75a572ea..a9287767 100644 --- a/tests/test_it_forms.py +++ b/tests/test_it_forms.py @@ -112,3 +112,79 @@ def test_a_cmp(self): {"form": "most dire", "tags": ["superlative"]}, ], ) + + def test_pn_template(self): + self.wxr.wtp.add_page("Template:-it-", 10, "Italiano") + self.wxr.wtp.add_page( + "Template:Pn", + 10, + "'''dire''' ([[Appendice:Coniugazioni/Italiano/dire|vai alla coniugazione]])", + ) + self.wxr.wtp.add_page( + "Appendice:Coniugazioni/Italiano/dire", 100, "{{It-conj}}" + ) + self.wxr.wtp.add_page( + "Template:It-conj", + 10, + """{| +|- +|- +! colspan="1" rowspan="2" | persona +! colspan="3" | singolare +! colspan="3" | plurale +|- +! prima +|- +! indicativo +! io +|- +! passato prossimo +|
+ {| + |- + | [[ho]] [[detto#Italiano|detto]]
[[sono]] [[detto#Italiano|detto]] + |}
+|- +! colspan="1" rowspan="2" | imperativo +! - +! tu +|- +| +|[[di’#Italiano|di’]],
non [[dire#Italiano|dire]] +|}""", + ) + data = parse_page( + self.wxr, + "dire", + """== {{-it-}} == +===Verbo=== +{{Pn|c}} 3° coniugazione +# [[esternare]] ciò che si pensa parlando""", + ) + self.assertEqual( + data[0]["forms"], + [ + { + "form": "ho detto", + "raw_tags": ["passato prossimo", "prima", "io"], + "tags": ["singular"], + "source": "Appendice:Coniugazioni/Italiano/dire", + }, + { + "form": "sono detto", + "raw_tags": ["passato prossimo", "prima", "io"], + "tags": ["singular"], + "source": "Appendice:Coniugazioni/Italiano/dire", + }, + { + "form": "di’", + "raw_tags": ["imperativo", "tu"], + "source": "Appendice:Coniugazioni/Italiano/dire", + }, + { + "form": "non dire", + "raw_tags": ["imperativo", "tu"], + "source": "Appendice:Coniugazioni/Italiano/dire", + }, + ], + )