Skip to content

Commit

Permalink
Merge pull request #348 from xxyzz/fr
Browse files Browse the repository at this point in the history
Some fixes for French Wiktionary
  • Loading branch information
xxyzz authored Sep 26, 2023
2 parents e0b24ef + ef2ed10 commit 91ae1d0
Show file tree
Hide file tree
Showing 8 changed files with 124 additions and 41 deletions.
51 changes: 37 additions & 14 deletions tests/test_fr_inflection.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
from collections import defaultdict
from unittest.mock import patch

from wikitextprocessor import NodeKind, WikiNode, Wtp
from wikitextprocessor import Wtp
from wikitextprocessor.parser import TemplateNode

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.fr.inflection import extract_inflection
Expand Down Expand Up @@ -37,7 +38,7 @@ def tearDown(self) -> None:
)
def test_fr_reg(self, mock_node_to_wikitext):
page_data = [defaultdict(list, {"word": "productrice"})]
node = WikiNode(NodeKind.TEMPLATE, 0)
node = TemplateNode(0)
self.wxr.wtp.start_page("productrice")
extract_inflection(self.wxr, page_data, node, "fr-rég")
self.assertEqual(
Expand All @@ -47,25 +48,25 @@ def test_fr_reg(self, mock_node_to_wikitext):

@patch(
"wikitextprocessor.Wtp.node_to_wikitext",
return_value="""{|
return_value="""{|class="flextable flextable-fr-mfsp"
|-
|class='invisible'|
!scope='col'| Singulier
!scope='col'| Pluriel
|- class='flextable-fr-m'
!scope='row'| Masculin
!scope="col"| Singulier
!scope="col"| Pluriel
|- class="flextable-fr-m"
!scope="row"| Masculin
|[[animal]]<br>[[Annexe:Prononciation/français|<span>\\a.ni.mal\\</span>]]
|[[animaux]]<br>[[Annexe:Prononciation/français|<span>\\a.ni.mo\\</span>]]
|- class='flextable-fr-f'
!scope='row'| Féminin
|- class="flextable-fr-f"
!scope="row"| Féminin
|[[animale]]<br>[[Annexe:Prononciation/français|<span>\\a.ni.mal\\</span>]]
|[[animales]]<br>[[Annexe:Prononciation/français|<span>\\a.ni.mal\\</span>]]
|}""",
)
def test_fr_accord_al(self, mock_node_to_wikitext):
# https://fr.wiktionary.org/wiki/animal#Adjectif
self.maxDiff = None
page_data = [defaultdict(list, {"word": "animal", "lang_code": "fr"})]
node = WikiNode(NodeKind.TEMPLATE, 0)
node = TemplateNode(0)
self.wxr.wtp.start_page("animal")
extract_inflection(self.wxr, page_data, node, "fr-accord-al")
self.assertEqual(
Expand Down Expand Up @@ -101,7 +102,7 @@ def test_fr_accord_al(self, mock_node_to_wikitext):
def test_multiple_lines_ipa(self, mock_node_to_wikitext):
# https://fr.wiktionary.org/wiki/ration#Nom_commun_2
page_data = [defaultdict(list, {"lang_code": "en", "word": "ration"})]
node = WikiNode(NodeKind.TEMPLATE, 0)
node = TemplateNode(0)
self.wxr.wtp.start_page("ration")
extract_inflection(self.wxr, page_data, node, "en-nom-rég")
self.assertEqual(
Expand All @@ -128,7 +129,7 @@ def test_multiple_lines_ipa(self, mock_node_to_wikitext):
def test_single_line_multiple_ipa(self, mock_node_to_wikitext):
# https://fr.wiktionary.org/wiki/ration#Verbe
page_data = [defaultdict(list, {"lang_code": "en", "word": "ration"})]
node = WikiNode(NodeKind.TEMPLATE, 0)
node = TemplateNode(0)
self.wxr.wtp.start_page("ration")
extract_inflection(self.wxr, page_data, node, "en-conj-rég")
self.assertEqual(
Expand All @@ -155,10 +156,32 @@ def test_single_line_multiple_ipa(self, mock_node_to_wikitext):
def test_invalid_ipa(self, mock_node_to_wikitext):
# https://fr.wiktionary.org/wiki/animal#Nom_commun_3
page_data = [defaultdict(list, {"lang_code": "en", "word": "animal"})]
node = WikiNode(NodeKind.TEMPLATE, 0)
node = TemplateNode(0)
self.wxr.wtp.start_page("animal")
extract_inflection(self.wxr, page_data, node, "ast-accord-mf")
self.assertEqual(
page_data[-1].get("forms"),
[{"tags": ["Pluriel"], "form": "animales"}],
)

@patch(
    "wikitextprocessor.Wtp.node_to_wikitext",
    return_value="""{| class="flextable"
|-
! Simplifié
| <bdi lang="zh-Hans" xml:lang="zh-Hans" class="lang-zh-Hans">[[一万#zh|一万]]</bdi>
|-
! Traditionnel
| <bdi lang="zh-Hant" xml:lang="zh-Hant" class="lang-zh-Hant">[[一萬#zh|一萬]]</bdi>
|}""",
)
def test_no_column_headers(self, mock_node_to_wikitext):
    # The mocked table has a header cell at the start of every row and no
    # dedicated column-header row. Wikitext taken from:
    # https://fr.wiktionary.org/wiki/一万#Nom_commun
    entry = defaultdict(list, {"lang_code": "zh", "word": "一万"})
    page_data = [entry]
    template_node = TemplateNode(0)
    self.wxr.wtp.start_page("一万")
    extract_inflection(self.wxr, page_data, template_node, "zh-formes")
    expected_forms = [{"tags": ["Traditionnel"], "form": "一萬"}]
    self.assertEqual(page_data[-1].get("forms"), expected_forms)
19 changes: 19 additions & 0 deletions tests/test_fr_linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,22 @@ def test_template_as_partial_tag(self):
}
],
)

def test_list_item_has_two_words(self):
    # One list item holding two comma-separated links must produce two
    # separate "derived" entries.
    page_data = [defaultdict(list)]
    self.wxr.wtp.start_page("masse")
    wikitext = (
        "==== {{S|dérivés}} ====\n* [[être à la masse]], [[mettre à la masse]]"
    )
    section_node = self.wxr.wtp.parse(wikitext).children[0]
    extract_linkage(self.wxr, page_data, section_node, "derived")
    expected_derived = [
        {"word": "être à la masse"},
        {"word": "mettre à la masse"},
    ]
    self.assertEqual(page_data, [{"derived": expected_derived}])
30 changes: 29 additions & 1 deletion tests/test_fr_translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def test_template_tag(self):
{
"code": "ar",
"lang": "Arabe",
"word": "مرحبا",
"word": "مرحبًا",
"roman": "mrḥbā",
"tags": ["Informel"],
},
Expand Down Expand Up @@ -128,3 +128,31 @@ def test_trad_template_gender_parameter(self):
}
],
)

def test_template_sense_parameter(self):
    # The first argument of "trad-début" is itself a template here; its
    # expanded text is expected in the "sense" field of the translation.
    wtp = self.wxr.wtp
    wtp.start_page("masse")
    wtp.add_page("Modèle:info lex", 10, body="(Finance)")
    wtp.add_page("Modèle:T", 10, body="Croate")
    wtp.add_page("Modèle:trad+", 10, body="masa")
    wikitext = """=== Traductions ===
{{trad-début|{{info lex|finance}}|12}}
* {{T|hr}} : {{trad+|hr|masa}}"""
    section_node = wtp.parse(wikitext).children[0]
    page_data = [defaultdict(list)]
    extract_translation(self.wxr, page_data, section_node)
    expected_translation = {
        "code": "hr",
        "lang": "Croate",
        "word": "masa",
        "sense": "(Finance)",
    }
    self.assertEqual(
        page_data, [{"translations": [expected_translation]}]
    )
2 changes: 1 addition & 1 deletion wiktextract/extractor/fr/etymology.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def find_pos_in_etymology_list(
if (
index == 0
and isinstance(node, TemplateNode)
and node.template_name == "lien-ancre-étym"
and node.template_name in ("lien-ancre-étym", "laé")
):
return clean_node(wxr, None, node).strip("()"), clean_node(
wxr, None, child_nodes[index + 1 :]
Expand Down
24 changes: 15 additions & 9 deletions wiktextract/extractor/fr/inflection.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,23 +46,29 @@ def process_inflection_table(
for row_num, table_row in enumerate(
table_node.find_child(NodeKind.TABLE_ROW)
):
if (
row_num != 0
and len(list(table_row.filter_empty_str_child()))
== len(column_headers) + 1
):
table_row_nodes = list(table_row.filter_empty_str_child())
first_row_has_data_cell = False
if row_num == 0:
first_row_has_data_cell = not any(
isinstance(cell, WikiNode)
and cell.kind == NodeKind.TABLE_CELL
for cell in table_row_nodes
)

if row_num != 0 and len(table_row_nodes) == len(column_headers) + 1:
# data row has one more column than header: "fr-accord-al" template
column_headers.insert(0, "")

row_header = ""
for column_num, table_cell in enumerate(
table_row.filter_empty_str_child()
):
for column_num, table_cell in enumerate(table_row_nodes):
form_data = defaultdict(list)
if isinstance(table_cell, WikiNode):
if table_cell.kind == NodeKind.TABLE_HEADER_CELL:
table_header_text = clean_node(wxr, None, table_cell)
if row_num == 0:
if row_num == 0 and first_row_has_data_cell:
# NOTE: despite its name, `first_row_has_data_cell` is True when the
# first row has no data cells; its header cells are then column
# headers. Otherwise they are row headers, not column headers.
column_headers.append(table_header_text)
elif (
column_num == 0
Expand Down
22 changes: 12 additions & 10 deletions wiktextract/extractor/fr/linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,8 @@ def extract_linkage(
for index, child_node in enumerate(
list_item_node.filter_empty_str_child()
):
if index == 0:
if (
isinstance(child_node, WikiNode)
and child_node.kind == NodeKind.TEMPLATE
):
if index == 0 or "word" not in linkage_data:
if isinstance(child_node, TemplateNode):
process_linkage_template(wxr, child_node, linkage_data)
else:
linkage_data["word"] = clean_node(wxr, None, child_node)
Expand All @@ -44,6 +41,11 @@ def extract_linkage(
):
tag = pending_tag + tag
pending_tag = ""
elif tag.strip() == ",":
# list item has more than one word
page_data[-1][linkage_type].append(linkage_data)
linkage_data = defaultdict(list)
continue
elif len(pending_tag) > 0:
pending_tag += tag
continue
Expand Down Expand Up @@ -74,11 +76,11 @@ def process_lien_template(
linkage_data: Dict[str, Union[str, List[str]]],
) -> None:
# link word template: https://fr.wiktionary.org/wiki/Modèle:lien
if "dif" in node.template_parameters: # displayed word
word = clean_node(wxr, None, node.template_parameters.get("dif"))
else:
word = clean_node(wxr, None, node.template_parameters.get(1))

word = clean_node(
wxr,
None,
node.template_parameters.get("dif", node.template_parameters.get(1)),
)
linkage_data["word"] = word
if "tr" in node.template_parameters:
linkage_data["roman"] = clean_node(
Expand Down
3 changes: 3 additions & 0 deletions wiktextract/extractor/fr/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,9 @@ def process_pos_block(
process_exemple_template(
wxr, child, page_data[-1]["senses"][-1]
)
elif template_name.startswith(("zh-mot", "ja-mot")):
# skip form line templates
continue
elif template_name.startswith(f"{lang_code}-"):
extract_inflection(wxr, page_data, child, template_name)
elif child.kind == NodeKind.BOLD:
Expand Down
14 changes: 8 additions & 6 deletions wiktextract/extractor/fr/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,12 +70,10 @@ def process_translation_templates(
return
elif template_node.template_name == "trad-début":
# translation box start: https://fr.wiktionary.org/wiki/Modèle:trad-début
translation_sense_wikitext = template_node.template_parameters.get(
1, ""
)
if len(translation_sense_wikitext) > 0:
sense_parameter = template_node.template_parameters.get(1)
if sense_parameter is not None:
base_translation_data["sense"] = clean_node(
wxr, None, translation_sense_wikitext
wxr, None, sense_parameter
)
elif template_node.template_name == "T":
# Translation language: https://fr.wiktionary.org/wiki/Modèle:T
Expand All @@ -86,7 +84,11 @@ def process_translation_templates(
elif template_node.template_name.startswith("trad"):
# Translation term: https://fr.wiktionary.org/wiki/Modèle:trad
translation_term = clean_node(
wxr, None, template_node.template_parameters.get(2)
wxr,
None,
template_node.template_parameters.get(
"dif", template_node.template_parameters.get(2)
),
)
translation_roman = clean_node(
wxr,
Expand Down

0 comments on commit 91ae1d0

Please sign in to comment.