Skip to content

Commit

Permalink
Merge pull request #346 from xxyzz/fr
Browse files Browse the repository at this point in the history
Extract French Wiktionary etymology list
  • Loading branch information
xxyzz authored Sep 25, 2023
2 parents c8f7d45 + dd4f358 commit be3fd6f
Show file tree
Hide file tree
Showing 8 changed files with 348 additions and 120 deletions.
154 changes: 149 additions & 5 deletions tests/test_fr_etymology.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@
from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.fr.page import extract_etymology
from wiktextract.extractor.fr.etymology import (
extract_etymology,
insert_etymology_data,
)
from wiktextract.thesaurus import close_thesaurus_db
from wiktextract.wxr_context import WiktextractContext

Expand All @@ -23,9 +26,150 @@ def tearDown(self) -> None:

def test_ebauche_etym(self):
    """An etymology section holding only the stub template yields no data."""
    # https://fr.wiktionary.org/wiki/Hörsaal
    # missing etymology template "ébauche-étym" should be ignored
    self.wxr.wtp.start_page("")
    root = self.wxr.wtp.parse(": {{ébauche-étym|de}}")
    # extract_etymology takes only the context and the section nodes and
    # returns the extracted data; it no longer receives page_data/base_data.
    etymology_data = extract_etymology(self.wxr, root.children)
    self.assertIsNone(etymology_data)

def test_list_etymologies(self):
    """Etymology lines listed under "*" POS bullets are grouped by that POS."""
    # https://fr.wiktionary.org/wiki/lenn
    self.wxr.wtp.start_page("lenn")
    wikitext = """* [[#br-nom-1|Nom commun 1 :]]
: Du vieux breton lin (« lac, étang ; liquide, humeur »).
: Du moyen breton lenn.
* [[#br-nom-2|Nom commun 2 :]]
:Du vieux breton lenn (« pièce de toile, voile, manteau, rideau »)."""
    parsed = self.wxr.wtp.parse(wikitext)

    # Single table of expected texts, reused by both assertions below.
    expected_by_pos = {
        "Nom commun 1": [
            "Du vieux breton lin (« lac, étang ; liquide, humeur »).",
            "Du moyen breton lenn.",
        ],
        "Nom commun 2": [
            "Du vieux breton lenn (« pièce de toile, voile, manteau, rideau »)."
        ],
    }

    extracted = extract_etymology(self.wxr, parsed.children)
    self.assertEqual(extracted, expected_by_pos)

    # One word entry per POS title, as the page extractor would produce.
    entries = [
        defaultdict(
            list,
            {"lang_code": "fr", "pos": "noun", "pos_title": title},
        )
        for title in expected_by_pos
    ]
    insert_etymology_data("fr", entries, extracted)
    self.assertEqual(
        entries,
        [
            {
                "lang_code": "fr",
                "pos": "noun",
                "pos_title": title,
                "etymology_texts": texts,
            }
            for title, texts in expected_by_pos.items()
        ],
    )

def test_indent_etymology_with_pos_template(self):
    """Items starting with the "lien-ancre-étym" template are keyed by its POS."""
    # https://fr.wiktionary.org/wiki/dame
    self.wxr.wtp.start_page("dame")
    self.wxr.wtp.add_page("Modèle:lien-ancre-étym", 10, "({{{2}}} {{{3}}})")
    wikitext = """: {{lien-ancre-étym|fr|Nom commun|1}} Du latin domina (« maîtresse de maison »).
: {{lien-ancre-étym|fr|Nom commun|2}} Du moyen néerlandais dam (« digue »).
: {{lien-ancre-étym|fr|Interjection|1}} Abréviation de « [[Notre-Dame]] ! » ou de « dame Dieu ! » (« [[Seigneur Dieu]] ! »).
"""
    parsed = self.wxr.wtp.parse(wikitext)

    latin_text = "Du latin domina (« maîtresse de maison »)."
    dutch_text = "Du moyen néerlandais dam (« digue »)."
    intj_text = "Abréviation de « Notre-Dame ! » ou de « dame Dieu ! » (« Seigneur Dieu ! »)."

    extracted = extract_etymology(self.wxr, parsed.children)
    self.assertEqual(
        extracted,
        {
            "Nom commun 1": [latin_text],
            "Nom commun 2": [dutch_text],
            "Interjection 1": [intj_text],
        },
    )

    # (pos, pos_title, expected text); the interjection entry has no index
    # in its POS title although the etymology section numbers it "1".
    entry_specs = [
        ("noun", "Nom commun 1", latin_text),
        ("noun", "Nom commun 2", dutch_text),
        ("intj", "Interjection", intj_text),
    ]
    page_data = [
        defaultdict(
            list,
            {"lang_code": "fr", "pos": pos, "pos_title": title},
        )
        for pos, title, _ in entry_specs
    ]
    insert_etymology_data("fr", page_data, extracted)
    self.assertEqual(
        page_data,
        [
            {
                "lang_code": "fr",
                "pos": pos,
                "pos_title": title,
                "etymology_texts": [text],
            }
            for pos, title, text in entry_specs
        ],
    )

def test_indent_etymology_with_italic_pos(self):
    """Items starting with an italic POS in parentheses are keyed by that POS."""
    # https://fr.wiktionary.org/wiki/hélas
    self.wxr.wtp.start_page("hélas")
    wikitext = """: (''[[#Interjection|Interjection]]'') XIIe siècle, elas ; composé de hé et de las, au sens ancien de « malheureux ».
: (''[[#fr-nom|Nom]]'') Par [[substantivation]] de l’interjection.
"""
    parsed = self.wxr.wtp.parse(wikitext)

    # The short italic title "Nom" is expected under the key "Nom commun".
    expected = {
        "Interjection": [
            "XIIe siècle, elas ; composé de hé et de las, au sens ancien de « malheureux »."
        ],
        "Nom commun": ["Par substantivation de l’interjection."],
    }
    extracted = extract_etymology(self.wxr, parsed.children)
    self.assertEqual(extracted, expected)
8 changes: 7 additions & 1 deletion tests/test_fr_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,13 +122,19 @@ def test_zh_exemple_template(self):
)
page_data = [defaultdict(list)]
process_pos_block(
self.wxr, page_data, defaultdict(list), root.children[0], "nom"
self.wxr,
page_data,
defaultdict(list),
root.children[0],
"nom",
"Nom commun",
)
self.assertEqual(
page_data,
[
{
"pos": "noun",
"pos_title": "Nom commun",
"senses": [
{
"glosses": ["Cheval."],
Expand Down
7 changes: 1 addition & 6 deletions tests/test_fr_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,6 @@ def setUp(self):
conf1 = WiktionaryConfig(
dump_file_lang_code="fr",
capture_language_codes=None,
capture_translations=True,
capture_pronunciation=True,
capture_linkages=True,
capture_compounds=True,
capture_redirects=True,
capture_examples=True,
)
self.wxr = WiktextractContext(Wtp(lang_code="fr"), conf1)

Expand Down Expand Up @@ -52,6 +46,7 @@ def test_fr_parse_page(self):
"lang": "Français",
"lang_code": "fr",
"pos": "noun",
"pos_title": "Nom commun",
"word": "exemple",
}
],
Expand Down
1 change: 1 addition & 0 deletions wiktextract/datautils.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,7 @@ def append_base_data(
# append new dictionary if the last dictionary has sense data and
# also has the same key
page_data.append(copy.deepcopy(base_data))
page_data[-1][field] = value
elif isinstance(page_data[-1].get(field), list):
page_data[-1][field] += value
else:
Expand Down
118 changes: 118 additions & 0 deletions wiktextract/extractor/fr/etymology.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
from collections import defaultdict
from typing import Dict, List, Optional, Tuple, Union

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import TemplateNode

from wiktextract.page import LEVEL_KINDS, clean_node
from wiktextract.wxr_context import WiktextractContext

# Maps a POS title (e.g. "Nom commun 1") to the etymology texts collected for
# it. NOTE: extract_etymology files texts that appear before any POS marker
# under the key None, and insert_etymology_data applies that entry to every
# matching word entry, so the key is effectively Optional[str].
EtymologyData = Dict[str, List[str]]


def extract_etymology(
    wxr: WiktextractContext,
    nodes: List[Union[WikiNode, str]],
) -> Optional[EtymologyData]:
    """
    Collect etymology texts from the nodes of an etymology section.

    Only the nodes before the first level (heading) node are examined.
    A "*" list sets the POS title used for the ":" lists that follow it; each
    item of a ":" list is stored under the POS title found at the start of the
    item, or under the current "*" title (None if there was none) otherwise.

    Returns None as soon as a ":" list contains the "ébauche-étym" template,
    otherwise the dictionary of POS title -> etymology texts.
    """
    texts_by_pos: EtymologyData = defaultdict(list)

    # keep the nodes after the etymology subtitle and before the next level
    # node
    section_nodes = []
    for candidate in nodes:
        if isinstance(candidate, WikiNode) and candidate.kind in LEVEL_KINDS:
            break
        section_nodes.append(candidate)

    current_pos: Optional[str] = None
    for list_node in section_nodes:
        if (
            not isinstance(list_node, WikiNode)
            or list_node.kind != NodeKind.LIST
        ):
            continue

        if list_node.sarg == "*":
            bullet_text = clean_node(wxr, None, list_node)
            current_pos = bullet_text.removeprefix("* ").removesuffix(" :")
            continue
        if list_node.sarg != ":":
            continue

        # ignore missing etymology template "ébauche-étym"
        if any(
            template.template_name == "ébauche-étym"
            for template in list_node.find_child_recursively(NodeKind.TEMPLATE)
        ):
            return None

        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            pos_and_text = find_pos_in_etymology_list(wxr, list_item)
            if pos_and_text is None:
                # no POS marker in the item: file it under the bullet title
                texts_by_pos[current_pos].append(
                    clean_node(wxr, None, list_item.children)
                )
            else:
                item_pos, item_text = pos_and_text
                texts_by_pos[item_pos].append(item_text)

    return texts_by_pos


def find_pos_in_etymology_list(
    wxr: WiktextractContext, list_item_node: WikiNode
) -> Optional[Tuple[str, str]]:
    """
    Return tuple of POS title and etymology text if the passed list item node
    starts with italic POS node or POS template, otherwise return None.

    Two leading forms are recognised (empty string children are skipped):

    * a "lien-ancre-étym" template as the first child; the POS title is the
      expanded template text without the surrounding parentheses;
    * a string ending with "(", an italic node, then a string starting with
      ")"; the POS title is the italic text, with "Nom" normalised to
      "Nom commun".

    Items that are too short to contain the italic form return None instead
    of raising IndexError.
    """
    child_nodes = list(list_item_node.filter_empty_str_child())
    if not child_nodes:
        return None

    first_node = child_nodes[0]
    if isinstance(first_node, str):
        # italic pos: needs "(" + italic node + ")..." so at least 3 children
        if len(child_nodes) < 3 or not first_node.endswith("("):
            return None
        italic_node, closing_text = child_nodes[1], child_nodes[2]
        if not (
            isinstance(italic_node, WikiNode)
            and italic_node.kind == NodeKind.ITALIC
            and isinstance(closing_text, str)
            and closing_text.startswith(")")
        ):
            return None
        pos_title = clean_node(wxr, None, italic_node)
        if pos_title == "Nom":
            pos_title = "Nom commun"
        # the remaining text still starts with the closing parenthesis
        return pos_title, clean_node(wxr, None, child_nodes[2:]).removeprefix(
            ") "
        )

    if (
        isinstance(first_node, TemplateNode)
        and first_node.template_name == "lien-ancre-étym"
    ):
        # the template expands to "(POS title)"
        return clean_node(wxr, None, first_node).strip("()"), clean_node(
            wxr, None, child_nodes[1:]
        )

    return None


def insert_etymology_data(
    lang_code: str, page_data: List[Dict], etymology_data: EtymologyData
) -> None:
    """
    Insert the etymology texts into every sense dictionary of `page_data`
    that matches the language and POS title.

    For each POS title in `etymology_data`:

    * None: the texts are added to all dictionaries of the language;
    * a title present in `page_data`: added to all dictionaries with that
      title;
    * otherwise the title is retried without a trailing " 1", because the
      etymology section may number a POS that has no index in its own title.

    Each dictionary receives its own copy of the list under the
    "etymology_texts" key, so later edits to one entry do not leak into other
    entries or into `etymology_data`. `page_data` is modified in place.
    """
    # group by pos title; several entries may share the same title
    senses_by_pos = defaultdict(list)
    for sense_data in page_data:
        if sense_data.get("lang_code") == lang_code:
            senses_by_pos[sense_data.get("pos_title")].append(sense_data)

    for pos_title, etymology_texts in etymology_data.items():
        if pos_title is None:  # add to all sense dictionaries
            targets = [
                sense_data
                for group in senses_by_pos.values()
                for sense_data in group
            ]
        elif pos_title in senses_by_pos:
            targets = senses_by_pos[pos_title]
        else:
            # an index number is added in the etymology section but not added
            # in POS title; .get() avoids creating an empty group
            targets = senses_by_pos.get(pos_title.removesuffix(" 1"), [])

        for sense_data in targets:
            sense_data["etymology_texts"] = list(etymology_texts)
2 changes: 1 addition & 1 deletion wiktextract/extractor/fr/gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def extract_gloss(
gloss_data = defaultdict(list)
gloss_start = 0
# process modifier, theme templates before gloss text
# https://fr.wiktionary.org/wiki/Wiktionnaire:Liste de tous les modèles/Précisions de sens
# https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Précisions_de_sens
if (
len(gloss_nodes) > 0
and isinstance(gloss_nodes[0], WikiNode)
Expand Down
2 changes: 1 addition & 1 deletion wiktextract/extractor/fr/inflection.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

from .pronunciation import is_ipa_text, insert_ipa
from .pronunciation import insert_ipa, is_ipa_text


def extract_inflection(
Expand Down
Loading

0 comments on commit be3fd6f

Please sign in to comment.