Skip to content

Commit

Permalink
Merge pull request #387 from xxyzz/fr
Browse files Browse the repository at this point in the history
Ignore "Commun" header cell in the French Wiktionary "sv-nom-c-ar" template
  • Loading branch information
xxyzz authored Oct 27, 2023
2 parents 9316f80 + 369ac3c commit f6a6412
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 3 deletions.
14 changes: 11 additions & 3 deletions src/wiktextract/extractor/fr/inflection.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from collections import defaultdict, deque
from copy import deepcopy
from dataclasses import dataclass
from typing import Dict, List

Expand Down Expand Up @@ -26,6 +27,7 @@ def extract_inflection(
"forme", # br-flex-adj
"temps", # en-conj-rég,
"cas", # lt_décl_as, ro-nom-tab(lower case)
"commun", # sv-nom-c-ar
}
)
IGNORE_TABLE_CELL = frozenset(
Expand Down Expand Up @@ -74,11 +76,11 @@ def process_inflection_table(
)
)
and row_node_child.attrs.get("style") != "display:none"
and "invisible" not in row_node_child.attrs.get("class", "")
]
current_row_has_data_cell = any(
isinstance(cell, WikiNode)
and cell.kind == NodeKind.TABLE_CELL
and "invisible" not in cell.attrs.get("class", "")
for cell in table_row_nodes
)
row_headers = []
Expand Down Expand Up @@ -144,7 +146,10 @@ def process_inflection_table(
table_cell_line != page_data[-1].get("word")
and table_cell_line not in IGNORE_TABLE_CELL
):
form_data["form"] = table_cell_line
if "form" not in form_data:
form_data["form"] = table_cell_line
else:
form_data["form"] += " " + table_cell_line
for colspan_header in colspan_headers:
if (
column_cell_index >= colspan_header.index
Expand All @@ -165,6 +170,9 @@ def process_inflection_table(
if len(row_headers) > 0:
form_data["tags"].extend(row_headers)
if "form" in form_data:
page_data[-1]["forms"].append(form_data)
for form in form_data["form"].split(" ou "):
new_form_data = deepcopy(form_data)
new_form_data["form"] = form
page_data[-1]["forms"].append(new_form_data)

column_cell_index += int(table_cell.attrs.get("colspan", 1))
60 changes: 60 additions & 0 deletions tests/test_fr_inflection.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,3 +343,63 @@ def test_ro_nom_tab(self, mock_node_to_wikitext):
{"form": "fenililor", "tags": ["Pluriel", "Vocatif"]},
],
)

# Test for the Swedish "sv-nom-c-ar" noun inflection table.
# Wtp.node_to_wikitext is patched so that it returns the table wikitext
# below; the first row holds only an "invisible" header cell and the
# second row starts with a "Commun" header cell.
@patch(
"wikitextprocessor.Wtp.node_to_wikitext",
return_value="""{| class="flextable flextable-sv"
! class="invisible" |
|-
! Commun
! Indéfini
! Défini
|-
! Singulier
| class="sing-indef" |<bdi lang="sv" xml:lang="sv" class="lang-sv">[[robot|robot]]</bdi>
| class="sing-def" |<bdi lang="sv" xml:lang="sv" class="lang-sv">[[roboten#sv|roboten]]</bdi>
|-
! Pluriel
| class="plur-indef" |<bdi lang="sv" xml:lang="sv" class="lang-sv">[[robotar#sv|robotar]]</bdi>
| class="plur-def" |<bdi lang="sv" xml:lang="sv" class="lang-sv">[[robotarna#sv|robotarna]]</bdi>
|}""",
)
def test_sv_nom_c_ar(self, mock_node_to_wikitext):
# https://fr.wiktionary.org/wiki/robot#Nom_commun_7
# Start with a single page entry for the word "robot" and no forms yet.
page_data = [defaultdict(list, {"word": "robot"})]
node = TemplateNode(0)
self.wxr.wtp.start_page("robot")
extract_inflection(self.wxr, page_data, node)
# Expected result: "Commun" never shows up as a tag, and every form is
# tagged with its column header followed by its row header.
# The indefinite singular cell "robot" yields no form entry -- presumably
# because it equals the page word; confirm in process_inflection_table.
self.assertEqual(
page_data[-1].get("forms"),
[
{"form": "roboten", "tags": ["Défini", "Singulier"]},
{"form": "robotar", "tags": ["Indéfini", "Pluriel"]},
{"form": "robotarna", "tags": ["Défini", "Pluriel"]},
],
)

# Test for a Czech declension table in which one data cell lists two
# alternative forms separated by "<br />" and the word "ou".
# Wtp.node_to_wikitext is patched so that it returns the table wikitext
# below.
@patch(
"wikitextprocessor.Wtp.node_to_wikitext",
return_value="""{|class="flextable"
|-
!scope="col"| Cas<nowiki />
!scope="col"| Singulier<nowiki />
!scope="col"| Pluriel
|-
!scope="row"| Nominatif<nowiki />
| [[robot#cs-nom|robot''' ''']]<nowiki />
| [[roboti#cs-flex-nom|robot'''i ''']]<br /><small>''ou''</small> [[robotové#cs-flex-nom|robot'''ové ''']]<nowiki />
|}""",
)
def test_cs_decl_nom_ma_dur(self, mock_node_to_wikitext):
# https://fr.wiktionary.org/wiki/robot#Nom_commun_1_2
# Start with a single page entry for the word "robot" and no forms yet.
page_data = [defaultdict(list, {"word": "robot"})]
node = TemplateNode(0)
self.wxr.wtp.start_page("robot")
extract_inflection(self.wxr, page_data, node)
# Expected result: the plural cell produces two separate form entries,
# one per alternative, each carrying the same tags; "Cas" never shows up
# as a tag.
# The singular cell "robot" yields no form entry -- presumably because it
# equals the page word; confirm in process_inflection_table.
self.assertEqual(
page_data[-1].get("forms"),
[
{"form": "roboti", "tags": ["Pluriel", "Nominatif"]},
{"form": "robotové", "tags": ["Pluriel", "Nominatif"]},
],
)

0 comments on commit f6a6412

Please sign in to comment.