From d46946868ce1801e4e4fb3e8e67f9d3d2e8f6304 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Mon, 30 Oct 2023 16:32:10 +0800 Subject: [PATCH] Extract nested gloss and example lists in French Wiktionary --- src/wiktextract/extractor/fr/gloss.py | 17 +++++---- src/wiktextract/extractor/fr/inflection.py | 3 +- tests/test_fr_gloss.py | 41 ++++++++++++++++++++++ tests/test_fr_linkage.py | 4 +-- 4 files changed, 54 insertions(+), 11 deletions(-) diff --git a/src/wiktextract/extractor/fr/gloss.py b/src/wiktextract/extractor/fr/gloss.py index 56ccea4b..c63a1abb 100644 --- a/src/wiktextract/extractor/fr/gloss.py +++ b/src/wiktextract/extractor/fr/gloss.py @@ -11,6 +11,7 @@ def extract_gloss( wxr: WiktextractContext, page_data: List[Dict], list_node: WikiNode, + parent_glosses: List[str] = [], ) -> None: for list_item_node in list_node.find_child(NodeKind.LIST_ITEM): gloss_nodes = list( @@ -63,19 +64,23 @@ def extract_gloss( if index not in tag_indexes ] gloss_text = clean_node(wxr, gloss_data, gloss_only_nodes) - gloss_data["glosses"] = [gloss_text] - extract_examples(wxr, gloss_data, list_item_node) + gloss_data["glosses"] = parent_glosses + [gloss_text] page_data[-1]["senses"].append(gloss_data) + for nest_gloss_list in list_item_node.find_child(NodeKind.LIST): + if nest_gloss_list.sarg.endswith("#"): + extract_gloss( + wxr, page_data, nest_gloss_list, gloss_data["glosses"] + ) + elif nest_gloss_list.sarg.endswith("*"): + extract_examples(wxr, gloss_data, nest_gloss_list) def extract_examples( wxr: WiktextractContext, gloss_data: Dict, - gloss_list_node: WikiNode, + example_list_node: WikiNode, ) -> None: - for example_node in gloss_list_node.find_child_recursively( - NodeKind.LIST_ITEM - ): + for example_node in example_list_node.find_child(NodeKind.LIST_ITEM): example_node_children = list(example_node.filter_empty_str_child()) if len(example_node_children) == 0: continue diff --git a/src/wiktextract/extractor/fr/inflection.py b/src/wiktextract/extractor/fr/inflection.py index 9479a222..8d48501c 100644 --- a/src/wiktextract/extractor/fr/inflection.py +++ b/src/wiktextract/extractor/fr/inflection.py @@ -79,8 +79,7 @@ def process_inflection_table( and "invisible" not in row_node_child.attrs.get("class", "") ] current_row_has_data_cell = any( - isinstance(cell, WikiNode) - and cell.kind == NodeKind.TABLE_CELL + isinstance(cell, WikiNode) and cell.kind == NodeKind.TABLE_CELL for cell in table_row_nodes ) row_headers = [] diff --git a/tests/test_fr_gloss.py b/tests/test_fr_gloss.py index ff6a1395..b559a4d6 100644 --- a/tests/test_fr_gloss.py +++ b/tests/test_fr_gloss.py @@ -257,3 +257,44 @@ def test_template_is_not_tag(self, mock_get_page): } ], ) + + def test_nest_gloss(self): + self.maxDiff = None + self.wxr.wtp.start_page("eau") + root = self.wxr.wtp.parse( + """# [[fluide|Fluides]], [[sérosité]]s qui se trouvent ou qui se forment dans le [[corps]] de l’[[homme]] ou de l’[[animal]]. +#* example 1 +## [[salive|Salive]]. +##* nest example + """ + ) + page_data = [defaultdict(list)] + extract_gloss(self.wxr, page_data, root.children[0]) + self.assertEqual( + page_data[-1]["senses"], + [ + { + "examples": [ + { + "text": "example 1", + "type": "example", + } + ], + "glosses": [ + "Fluides, sérosités qui se trouvent ou qui se forment dans le corps de l’homme ou de l’animal." + ], + }, + { + "examples": [ + { + "text": "nest example", + "type": "example", + } + ], + "glosses": [ + "Fluides, sérosités qui se trouvent ou qui se forment dans le corps de l’homme ou de l’animal.", + "Salive.", + ], + }, + ], + ) diff --git a/tests/test_fr_linkage.py b/tests/test_fr_linkage.py index b4e2f1aa..498faf3f 100644 --- a/tests/test_fr_linkage.py +++ b/tests/test_fr_linkage.py @@ -183,9 +183,7 @@ def test_derives_autres_langues_section(self): def test_words_divided_by_slash(self): page_data = [defaultdict(list)] self.wxr.wtp.start_page("eau") - root = self.wxr.wtp.parse( - "* [[benoîte d’eau]] / [[benoite d’eau]]" - ) + root = self.wxr.wtp.parse("* [[benoîte d’eau]] / [[benoite d’eau]]") extract_linkage(self.wxr, page_data, root, "dérivés") self.assertEqual( page_data,