From 2b4f32d13ce61d4021496ca561c3bb7074923177 Mon Sep 17 00:00:00 2001
From: xxyzz
Date: Fri, 13 Oct 2023 15:06:46 +0800
Subject: [PATCH 1/4] =?UTF-8?q?Ignore=20empty=20string=20template=20parame?=
 =?UTF-8?q?ter=20of=20the=20"trad-d=C3=A9but"=20template?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/wiktextract/clean.py                    |   2 +-
 src/wiktextract/extractor/fr/translation.py |   6 +-
 src/wiktextract/form_descriptions.py        |   4 +-
 src/wiktextract/inflection.py               | 129 ++++++++++----------
 src/wiktextract/pronunciations.py           |   2 +-
 tests/test_clean.py                         |   2 +-
 6 files changed, 72 insertions(+), 73 deletions(-)

diff --git a/src/wiktextract/clean.py b/src/wiktextract/clean.py
index 8b1e9fec..b5c37ff6 100644
--- a/src/wiktextract/clean.py
+++ b/src/wiktextract/clean.py
@@ -1131,7 +1131,7 @@ def expand_group(v):
         if a == "2":
             v = "√"
         elif a == "3":
-            v = "∛",
+            v = "∛"
         elif a == "4":
             v = "∜"
         else:
diff --git a/src/wiktextract/extractor/fr/translation.py b/src/wiktextract/extractor/fr/translation.py
index 618bb6e3..7d98528c 100644
--- a/src/wiktextract/extractor/fr/translation.py
+++ b/src/wiktextract/extractor/fr/translation.py
@@ -72,9 +72,9 @@ def process_translation_templates(
         # translation box start: https://fr.wiktionary.org/wiki/Modèle:trad-début
         sense_parameter = template_node.template_parameters.get(1)
         if sense_parameter is not None:
-            base_translation_data["sense"] = clean_node(
-                wxr, None, sense_parameter
-            )
+            sense_text = clean_node(wxr, None, sense_parameter)
+            if len(sense_text) > 0:
+                base_translation_data["sense"] = sense_text
     elif template_node.template_name == "T":
         # Translation language: https://fr.wiktionary.org/wiki/Modèle:T
         base_translation_data["code"] = template_node.template_parameters.get(1)
diff --git a/src/wiktextract/form_descriptions.py b/src/wiktextract/form_descriptions.py
index c2938cc6..6ef230a3 100644
--- a/src/wiktextract/form_descriptions.py
+++ b/src/wiktextract/form_descriptions.py
@@ -169,7 +169,7 @@
     r"\s+\((({}): ([^()]|\([^()]+\))+)\)"
     .format("|".join(re.escape(x.removeprefix("?"))
                      for x in sorted(xlat_head_map.values(),
-                                     key=lambda x: len(x),
+                                     key=len,
                                      reverse=True)
                      if x and not x.startswith("class-"))))

@@ -179,7 +179,7 @@
     "|".join(re.escape(x) for x in
              # The sort is to put longer ones first, preferring them in
              # the regexp match
-             sorted(xlat_head_map.keys(), key=lambda x: len(x),
+             sorted(xlat_head_map.keys(), key=len,
                     reverse=True)))

 head_final_re = re.compile(head_final_re_text + "$")
diff --git a/src/wiktextract/inflection.py b/src/wiktextract/inflection.py
index ec8a0037..2ef3ea51 100644
--- a/src/wiktextract/inflection.py
+++ b/src/wiktextract/inflection.py
@@ -262,7 +262,7 @@ def set_debug_cell_text(text):
     # Arabic, but that is being handled elsewhere now.
 ])

-class InflCell(object):
+class InflCell:
     """Cell in an inflection table."""
     __slots__ = (
         "text",
@@ -292,7 +292,7 @@ def __repr__(self):
         return str(self)


-class HdrSpan(object):
+class HdrSpan:
     """Saved information about a header cell/span during the parsing of a
     table."""
     __slots__ = (
@@ -569,7 +569,7 @@ def extract_cell_content(lang, word, col):
             col = col[:-1]
         else:
             break
-    
+
     # Check for another form of note definition
     if (len(col) > 2 and col[1] in (")", " ", ":") and
         col[0].isdigit() and
@@ -1200,7 +1200,7 @@ def parse_simple_table(wxr, tablecontext, word, lang, pos,
     for x in titles:
         assert isinstance(x, str)
-    
+
     # print("PARSE_SIMPLE_TABLE: TITLES:", titles)
     if debug_cell_text:
         print("ROWS:")
@@ -1249,7 +1249,7 @@ def parse_simple_table(wxr, tablecontext, word, lang, pos,
     # for row in rows:
     #     print("  ", row)
-    
+
    # Parse definitions for references (from table itself and from text
    # after it)
    def_ht = {}
@@ -1333,7 +1333,7 @@ def add_new_hdrspan(col, hdrspans, store_new_hdrspan,
            # later with "dummy-load-stored-hdrspans".
            if store_new_hdrspan:
                tablecontext.stored_hdrspans.append(hdrspan)
-    
+
            # Handle headers that are above left-side header
            # columns and are followed by personal pronouns in
            # remaining columns (basically headers that
@@ -1354,7 +1354,7 @@ def add_new_hdrspan(col, hdrspans, store_new_hdrspan,
                later_allowed = later_allowed | set(["dummy"])
                # dummy2 has different behavior than plain dummy
                # and does not belong here.
-    
+
                # print("col0_cats={} later_cats={} "
                #       "fol_by_nonempty={} col_idx={} end={} "
                #       "tagsets={}"
@@ -1524,7 +1524,7 @@ def handle_mixed_lines(alts):
            # First is base and the rest is IPA alternatives
            alts = list((alts[0], "", alts[i])
                        for i in range(1, len(alts)))
-    
+
        # Check for romanizations, forms first, romanizations under
        elif (len(alts) % 2 == 0 and
              not any("(" in x for x in alts) and
@@ -1668,12 +1668,12 @@ def handle_parens(form, roman, clitic, extra_tags):
            form = (form[:m.start()] + subst + form[m.end():]).strip()
        return form, roman, clitic
-    
+
    def merge_row_and_column_tags(form, some_has_covered_text):
        # Merge column tags and row tags. We give preference
        # to moods etc coming from rowtags (cf. austteigen/German/Verb
        # imperative forms).
-    
+
        # In certain cases, what a tag means depends on whether
        # it is a row or column header. Depending on the language,
        # we replace certain tags with others if they're in
@@ -1710,19 +1710,19 @@ def merge_row_and_column_tags(form, some_has_covered_text):
                            for tt in old_tags)):
                        continue
                    tags.add(t)
-    
+
                # Extract language-specific tags from the
                # form. This may also adjust the form.
                form, lang_tags = lang_specific_tags(lang, pos, form)
                tags.update(lang_tags)
-    
+
                # For non-finite verb forms, see if they have
                # a gender/class suffix
                if pos == "verb" and any(valid_tags[t] == "non-finite"
                                         for t in tags):
                    form, tt = parse_head_final_tags(wxr, lang, form)
                    tags.update(tt)
-    
+
                # Remove "personal" tag if have nth person; these
                # come up with e.g. reconhecer/Portuguese/Verb. But
                # not if we also have "pronoun"
@@ -1732,20 +1732,20 @@ def merge_row_and_column_tags(form, some_has_covered_text):
                         ["first-person", "second-person",
                          "third-person"])):
                    tags.remove("personal")
-    
+
                # If we have impersonal, remove person and number.
                # This happens with e.g. viajar/Portuguese/Verb
                if "impersonal" in tags:
                    tags = tags - set(["first-person", "second-person",
                                       "third-person",
                                       "singular", "plural"])
-    
+
                # Remove unnecessary "positive" tag from verb forms
                if pos == "verb" and "positive" in tags:
                    if "negative" in tags:
                        tags.remove("negative")
                    tags.remove("positive")
-    
+
                # Many Russian (and other Slavic) inflection tables
                # have animate/inanimate distinction that generates
                # separate entries for neuter/feminine, but the
@@ -1758,14 +1758,14 @@ def merge_row_and_column_tags(form, some_has_covered_text):
                        "masculine" not in tags and
                        "plural" not in tags):
                        tags.remove(t1)
-    
+
                # German adjective tables contain "(keiner)" etc
                # for mixed declension plural. When the adjective
                # disappears and it becomes just one word, remove
                # the "includes-article" tag. e.g. eiskalt/German
                if "includes-article" in tags and " " not in form:
                    tags.remove("includes-article")
-    
+
                # Handle ignored forms. We mark that the form was
                # provided. This is important information; some words
                # just do not have a certain form. However, there also
@@ -1784,7 +1784,7 @@ def merge_row_and_column_tags(form, some_has_covered_text):
                    form = "-"
                elif col_idx in has_covering_hdr:
                    some_has_covered_text = True
-    
+
                # Handle ambiguous object concord. If a header
                # gives the "dummy-object-concord"-tag to a word,
                # replace person, number and gender tags with
@@ -1798,7 +1798,7 @@ def merge_row_and_column_tags(form, some_has_covered_text):
                        if subtag in tags:
                            tags.remove(subtag)
                        tags.add(objtag)
-    
+
                # Remove the dummy mood tag that we sometimes
                # use to block adding other mood and related
                # tags
@@ -1813,7 +1813,7 @@ def merge_row_and_column_tags(form, some_has_covered_text):
                    "dummy-reset-stored-hdrspans",
                    "dummy-section-header",
                ])
-    
+
                # Perform language-specific tag replacements according
                # to rules in a table.
                lang_tag_mappings = get_lang_conf(lang,
@@ -1822,13 +1822,13 @@ def merge_row_and_column_tags(form, some_has_covered_text):
                    for pre, post in lang_tag_mappings.items():
                        if all(t in tags for t in pre):
                            tags = (tags - set(pre)) | set(post)
-    
+
                # Warn if there are entries with empty tags
                if not tags:
                    wxr.wtp.debug("inflection table: empty tags for {}"
                                  .format(form),
                                  sortid="inflection/1826")
-    
+
                # Warn if form looks like IPA
                ########## XXX ########
                # Because IPA is its own unicode block, we could also
@@ -1844,14 +1844,14 @@ def merge_row_and_column_tags(form, some_has_covered_text):
                              "form={} tags={}"
                              .format(form, tags),
                              sortid="inflection/1840")
-    
+
                # Note that this checks `form`, not `in tags`
                if form == "dummy-ignored-text-cell":
                    continue
-    
+
                if "dummy-remove-this-cell" in tags:
                    continue
-    
+
                # Add the form
                tags = list(sorted(tags))
                dt = {"form": form, "tags": tags, "source": source}
@@ -1866,7 +1866,7 @@ def merge_row_and_column_tags(form, some_has_covered_text):
                          "source": source}
                    ret.append(dt)
        return ret, form, some_has_covered_text
-    
+
    # First extract definitions from cells
    # See defs_ht for footnote defs stuff
    for row in rows:
@@ -2072,7 +2072,7 @@ def merge_row_and_column_tags(form, some_has_covered_text):
            if any("dummy-load-stored-hdrspans" in ts for ts in v):
                hdrspans.extend(tablecontext.stored_hdrspans)
-    
+
            if any("dummy-reset-stored-hdrspans" in ts for ts in v):
                tablecontext.stored_hdrspans = []
@@ -2081,7 +2081,7 @@ def merge_row_and_column_tags(form, some_has_covered_text):
                store_new_hdrspan = True
            else:
                store_new_hdrspan = False
-    
+
            new_coltags = list(x for x in new_coltags
                               if not any(t in noinherit_tags for t in x))
            # print("new_coltags={} previously_seen={} all_hdr_tags={}"
@@ -2091,13 +2091,13 @@ def merge_row_and_column_tags(form, some_has_covered_text):
                = add_new_hdrspan(col, hdrspans, store_new_hdrspan,
                                  col0_followed_by_nonempty, col0_hdrspan)
-    
+
            continue

        # These values are ignored, at least for now
        if re.match(r"^(# |\(see )", col):
            continue
-    
+
        if any("dummy-skip-this" in ts for ts in rowtags):
            continue  # Skip this cell
@@ -2117,7 +2117,7 @@ def merge_row_and_column_tags(form, some_has_covered_text):
            get_lang_conf(lang, "ignore_top_left_text_cell") == True
        ):
            continue  # Skip text at top left, as in Icelandic, Faroese
-    
+
        # if col0_hdrspan is not None:
        #     print("COL0 FOLLOWED NONHDR: {!r} by {!r}"
        #           .format(col0_hdrspan.text, col))
@@ -2135,16 +2135,16 @@ def merge_row_and_column_tags(form, some_has_covered_text):
        # newline.
        col = re.sub(r"[ \t\r]+", " ", col)
        # Split the cell text into alternatives
-    
+
        col, alts, split_extra_tags = \
            split_text_into_alts(col)
        # Some cells have mixed form content, like text and romanization,
        # or text and IPA. Handle these.
        alts = handle_mixed_lines(alts)
-    
+
        alts = list((x, combined_coltags) for x in alts)
-    
+
        # Generate forms from the alternatives
        # alts is a list of (tuple of forms, tuple of tags)
        for (form, base_roman, ipa), coltags in alts:
@@ -2180,7 +2180,7 @@ def merge_row_and_column_tags(form, some_has_covered_text):
                                                   word, base_roman)
                extra_tags.extend(hdr_tags)
-    
+
            # Do some additional cleanup on the cell.
            form = re.sub(r"^\s*,\s*", "", form)
            form = re.sub(r"\s*,\s*$", "", form)
@@ -2192,7 +2192,7 @@ def merge_row_and_column_tags(form, some_has_covered_text):
            # Look for parentheses that have semantic meaning
            form, et = find_semantic_parens(form)
            extra_tags.extend(et)
-    
+
            # Handle parentheses in the table element. We parse
            # tags anywhere and romanizations anywhere but beginning.
            roman = base_roman
@@ -2212,7 +2212,7 @@ def merge_row_and_column_tags(form, some_has_covered_text):
            if paren is not None:
                form, roman, clitic = handle_parens(form, roman,
                                                    clitic, extra_tags)
-    
+
            # Ignore certain forms that are not really forms,
            # unless they're really, really close to the article title
            if form in ("", "unchanged",
@@ -2243,7 +2243,7 @@ def merge_row_and_column_tags(form, some_has_covered_text):
                merge_row_and_column_tags(form, some_has_covered_text)
            ret.extend(merge_ret)
-    
+
        # End of row.
        rownum += 1
    # For certain languages, if the row was empty, reset
@@ -2307,7 +2307,7 @@ def merge_row_and_column_tags(form, some_has_covered_text):
                had_noun = False
                continue  # Skip the articles
-    
+
            dt = dt.copy()
            dt["tags"] = tags
            new_ret.append(dt)
@@ -2337,13 +2337,13 @@ def merge_row_and_column_tags(form, some_has_covered_text):
                    continue
                if key_tag not in tags:
                    skip_this = True
-    
+
            if skip_this:
                continue
            new_ret.append(cell_data)
-    
+
        ret = new_ret
-    
+
    # Post-process English inflection tables, addding "multiword-construction"
    # when the number of words has increased.
    if lang == "English" and pos == "verb":
@@ -2375,7 +2375,7 @@ def merge_row_and_column_tags(form, some_has_covered_text):
                ret = [dt] + [tn] + ret
            else:
                ret = [dt] + ret
-    
+
    return ret

def handle_generic_table(wxr, tablecontext, data,
@@ -2584,7 +2584,7 @@ def determine_header(wxr, tablecontext, lang, word, pos,
            is_title = True
    return is_title, hdr_expansion, target, celltext

-class TableContext(object):
+class TableContext:
    """Saved context used when parsing a table and its subtables."""
    __slot__ = (
        "stored_hdrspans",
@@ -2623,7 +2623,7 @@ def handle_wikitext_or_html_table(wxr, word, lang, pos,
    if not tablecontext:
        tablecontext = TableContext()
-    
+
    def handle_table1(wxr, tablecontext, word, lang, pos,
                      data, tree, titles, source, after, depth):
        """Helper function allowing the 'flattening' out of the table
@@ -2640,7 +2640,7 @@ def handle_table1(wxr, tablecontext, word, lang, pos,
        assert isinstance(after, str)
        assert isinstance(depth, int)
        # print("HANDLE_WIKITEXT_TABLE", titles)
-    
+
        col_gap_data = []  # Filling for columns with rowspan > 1
                           # col_gap_data contains None or InflCell
        vertical_still_left = []  # Number of remaining rows for which to fill
@@ -2652,7 +2652,7 @@ def handle_table1(wxr, tablecontext, word, lang, pos,
        rows = []
        sub_ret = []
-    
+
        for node in tree.children:
            if not isinstance(node, WikiNode):
                continue
@@ -2660,7 +2660,7 @@ def handle_table1(wxr, tablecontext, word, lang, pos,
                kind = node.sarg
            else:
                kind = node.kind
-    
+
            # print("  {}".format(node))
            if kind in (NodeKind.TABLE_CAPTION, "caption"):
                # print("  CAPTION:", node)
@@ -2671,7 +2671,7 @@ def handle_table1(wxr, tablecontext, word, lang, pos,
                # have more data. The hidden data duplicates these rows, so
                # we skip it and just process the hidden data.
                continue
-    
+
            # Parse a table row.
            row = []
            style = None
@@ -2693,7 +2693,7 @@ def handle_table1(wxr, tablecontext, word, lang, pos,
                                    "th", "td"):
                    print("  UNEXPECTED ROW CONTENT: {}".format(col))
                    continue
-    
+
                while (len(row) < len(vertical_still_left) and
                       vertical_still_left[len(row)] > 0):
                    # vertical_still_left is [...0, 0, 2...] for each column.
@@ -2705,7 +2705,7 @@ def handle_table1(wxr, tablecontext, word, lang, pos,
                    # and rowspan and colspan are just to generate the "fill-
                    vertical_still_left[len(row)] -= 1
                    row.append(col_gap_data[len(row)])
-    
+
                    # appending row is how "indexing" is
                    # done here; something is appended,
                    # like a filler-cell here or a "start"
@@ -2719,7 +2719,7 @@ def handle_table1(wxr, tablecontext, word, lang, pos,
                    # except when a new rowspan is needed,
                    # at the same time that
                    # vertical_still_left gets reassigned.
-    
+
                try:
                    rowspan = int(col.attrs.get("rowspan", "1"))  # 🡙
                    colspan = int(col.attrs.get("colspan", "1"))  # 🡘
@@ -2727,7 +2727,7 @@ def handle_table1(wxr, tablecontext, word, lang, pos,
                    rowspan = 1
                    colspan = 1
                # print("COL:", col)
-    
+
                # Process any nested tables recursively.
                tables, rest = recursively_extract(col,
                                                   lambda x: isinstance(x, WikiNode)
@@ -2735,11 +2735,11 @@ def handle_table1(wxr, tablecontext, word, lang, pos,
                                                   (x.kind == NodeKind.TABLE or
                                                    x.sarg == "table"))
-    
+
                # Clean the rest of the cell.
                celltext = clean_node(wxr, None, rest)
                # print("CLEANED:", celltext)
-    
+
                # Handle nested tables.
                for tbl in tables:
                    # Some nested tables (e.g., croí/Irish) have subtitles
@@ -2761,12 +2761,12 @@ def handle_table1(wxr, tablecontext, word, lang, pos,
                        titles = []
                        after = ""
                    sub_ret.extend(subtbl)
-    
+
                # This magic value is used as part of header detection
                cellstyle = (col.attrs.get("style", "") + "//" +
                             col.attrs.get("class", "") + "//" +
                             str(kind))
-    
+
                if not row:  # if first column in row
                    style = cellstyle
                target = None
@@ -2781,7 +2781,7 @@ def handle_table1(wxr, tablecontext, word, lang, pos,
                                     row, col, celltext, titletext,
                                     cols_headered,
                                     None, cellstyle)
-    
+
                if is_title:
                    # If this cell gets a "*" tag, make the whole column
                    # below it (toggling it in cols_headered = [F, F, T...])
@@ -2825,7 +2825,7 @@ def handle_table1(wxr, tablecontext, word, lang, pos,
                    # future, or None
                    vertical_still_left[len(row)] = rowspan - 1
                    # A counter for how many gaps🡙 are still left to be
-                    # filled (row.append or 
+                    # filled (row.append or
                    # row[col_gap_data[len(row)] =>
                    # rows), it is not reset to [], but decremented to 0
                    # each time a row gets something from col_gap_data.
@@ -2857,11 +2857,11 @@ def handle_table1(wxr, tablecontext, word, lang, pos,
        else:
            main_ret = [(rows, titles, after, depth)]
        return main_ret
-    
+
    new_rows = handle_table1(wxr, tablecontext, word, lang, pos,
                             data, tree, titles, source, after, 0)
-    
+
    # Now we have a table that has been parsed into rows and columns of
    # InflCell objects. Parse the inflection table from that format.
    if new_rows:
@@ -2878,13 +2878,13 @@ def handle_html_table(wxr, word, lang, pos, data, tree, titles, source,
    """A passer-on function for html-tables, XXX, remove these?"""
    handle_wikitext_or_html_table(wxr, word, lang, pos, data, tree,
                                  titles, source, after, tablecontext)
-    
+

def handle_wikitext_table(wxr, word, lang, pos,
                          data, tree, titles, source, after,
                          tablecontext=None):
    """A passer-on function for html-tables, XXX, remove these?"""
    handle_wikitext_or_html_table(wxr, word, lang, pos, data,
                                  tree, titles, source, after, tablecontext)
-    
+

def parse_inflection_section(wxr,
                             data,
@@ -3017,4 +3017,3 @@ def recurse(node, titles, navframe=False):
            f.write(section + "\n")
        text = wxr.wtp.node_to_wikitext(tree)
        f.write(text + "\n")
-
diff --git a/src/wiktextract/pronunciations.py b/src/wiktextract/pronunciations.py
index a600b76b..99cd4fa7 100644
--- a/src/wiktextract/pronunciations.py
+++ b/src/wiktextract/pronunciations.py
@@ -25,7 +25,7 @@
 pron_romanization_re = re.compile(
     "(?m)^(" +
     "|".join(re.escape(x) for x in
-             sorted(pron_romanizations.keys(), key=lambda x: len(x),
+             sorted(pron_romanizations.keys(), key=len,
                     reverse=True)) +
     ")([^\n]+)")

diff --git a/tests/test_clean.py b/tests/test_clean.py
index 7d4379c9..344633d2 100644
--- a/tests/test_clean.py
+++ b/tests/test_clean.py
@@ -21,7 +21,7 @@ def test_pos(self):
         poses = self.wxr.config.POS_TYPES
         self.assertTrue(isinstance(poses, set))
         for pos_type in ["noun", "verb", "pron", "adj", "adv", "num"]:
-            self.assertTrue(pos_type in poses)
+            self.assertIn(pos_type, poses)
         self.assertLess(len(poses), 50)

     def test_cv_plain(self):

From 7e4451f670f25bc5ebf63cc8c67d742cf92dabb0 Mon Sep 17 00:00:00 2001
From: xxyzz
Date: Fri, 13 Oct 2023 16:55:34 +0800
Subject: [PATCH 2/4] Don't extract thesaurus pages or analyze templates for
 French Wiktionary

French Wiktionary's thesaurus pages contain too many words that are not
synonyms and don't follow a common page layout, and word entry pages
already have a synonyms section. Disabling this feature removes many
error messages produced by the default English thesaurus extractor.

Pre-expanding templates just makes extracting French Wiktionary harder;
I haven't seen any template that should be pre-expanded.
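
For illustration only (an editor's sketch, not part of this patch): the new
per-edition switches are plain boolean attributes on WiktionaryConfig, and
load_edition_settings() lets an edition override them by shipping a
config.json next to its other data files. A minimal standalone version of
that override logic, where EditionConfig is a hypothetical stand-in class:

    import json
    from pathlib import Path

    class EditionConfig:
        def __init__(self, data_folder: Path) -> None:
            # Defaults, mirroring WiktionaryConfig.__init__ in this patch.
            self.analyze_templates = True
            self.extract_thesaurus_pages = True
            # data/fr/config.json ships {"analyze_templates": false,
            # "extract_thesaurus_pages": false}, so the French edition
            # turns both features off without any code changes.
            file_path = data_folder / "config.json"
            if file_path.exists():
                with file_path.open(encoding="utf-8") as f:
                    for key, value in json.load(f).items():
                        setattr(self, key, value)

    # Run from the repository root:
    # config = EditionConfig(Path("src/wiktextract/data/fr"))
    # assert config.extract_thesaurus_pages is False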
---
 languages/get_data.py                |  3 +-
 src/wiktextract/config.py            | 61 ++++++++--------------
 src/wiktextract/data/fr/config.json  |  4 ++
 src/wiktextract/extractor/fr/page.py |  6 ---
 src/wiktextract/page.py              |  3 +-
 src/wiktextract/wiktionary.py        |  6 ++-
 src/wiktextract/wiktwords.py         |  4 +-
 src/wiktextract/wxr_context.py       | 16 +++++---
 tests/test_fr_etymology.py           |  4 --
 tests/test_fr_form_line.py           |  4 --
 tests/test_fr_gloss.py               |  4 --
 tests/test_fr_inflection.py          |  4 --
 tests/test_fr_linkage.py             |  4 --
 tests/test_fr_page.py                |  4 --
 tests/test_fr_pronunciation.py       |  4 --
 tests/test_fr_translation.py         |  4 --
 16 files changed, 43 insertions(+), 92 deletions(-)
 create mode 100644 src/wiktextract/data/fr/config.json

diff --git a/languages/get_data.py b/languages/get_data.py
index 9de776c0..4730602e 100644
--- a/languages/get_data.py
+++ b/languages/get_data.py
@@ -83,7 +83,8 @@ def get_lang_data(lang_code: str, dump_file: str, db_path: Path | None) -> None:
     ) as fout:
         json.dump(data, fout, indent=2, ensure_ascii=False, sort_keys=True)
     wxr.wtp.close_db_conn()
-    close_thesaurus_db(wxr.thesaurus_db_path, wxr.thesaurus_db_conn)
+    if wxr.config.extract_thesaurus_pages:
+        close_thesaurus_db(wxr.thesaurus_db_path, wxr.thesaurus_db_conn)


 if __name__ == "__main__":
diff --git a/src/wiktextract/config.py b/src/wiktextract/config.py
index 77c30c2f..8d80276c 100644
--- a/src/wiktextract/config.py
+++ b/src/wiktextract/config.py
@@ -6,28 +6,15 @@
 import collections
 import json
 import sys
-from typing import TYPE_CHECKING, Callable, Optional
+from typing import Callable, Optional
+
+from wikitextprocessor.core import CollatedErrorReturnData

 if sys.version_info < (3, 10):
     from importlib_resources import files
 else:
     from importlib.resources import files

-if TYPE_CHECKING:
-    from wikitextprocessor.core import StatsData
-
-
-def int_dict():
-    return collections.defaultdict(int)
-
-
-def int_dict_dict():
-    return collections.defaultdict(int_dict)
-
-
-def list_dict():
-    return collections.defaultdict(list)
-

 class WiktionaryConfig:
     """This class holds configuration data for Wiktionary parsing."""
@@ -66,6 +53,8 @@ class WiktionaryConfig:
         "LANGUAGES_BY_NAME",
         "LANGUAGES_BY_CODE",
         "FORM_OF_TEMPLATES",
+        "analyze_templates",
+        "extract_thesaurus_pages",
     )

     def __init__(
@@ -130,38 +119,13 @@ def __init__(
             self.set_attr_from_json(
                 "FORM_OF_TEMPLATES", "form_of_templates.json"
             )
-        if dump_file_lang_code == "fr":
-            self.set_attr_from_json("FR_FORM_TABLES", "form_tables.json")
         if dump_file_lang_code == "de":
             self.set_attr_from_json("DE_FORM_TABLES", "form_templates.json")
+        self.analyze_templates = True  # find templates that need pre-expand
+        self.extract_thesaurus_pages = True
+        self.load_edition_settings()

-    def to_kwargs(self):
-        return {
-            "dump_file_lang_code": self.dump_file_lang_code,
-            "capture_language_codes": self.capture_language_codes,
-            "capture_translations": self.capture_translations,
-            "capture_pronunciation": self.capture_pronunciation,
-            "capture_linkages": self.capture_linkages,
-            "capture_compounds": self.capture_compounds,
-            "capture_redirects": self.capture_redirects,
-            "capture_examples": self.capture_examples,
-            "capture_etymologies": self.capture_etymologies,
-            "capture_inflections": self.capture_inflections,
-            "capture_descendants": self.capture_descendants,
-            "verbose": self.verbose,
-            "expand_tables": self.expand_tables,
-        }
-
-    def to_return(self) -> "StatsData":
-        return {
-            "num_pages": self.num_pages,
-            "language_counts": self.language_counts,
-            "pos_counts": self.pos_counts,
-            "section_counts": self.section_counts,
-        }
-
-    def merge_return(self, ret):
-        assert isinstance(ret, dict)
+    def merge_return(self, ret: CollatedErrorReturnData):
         if "num_pages" in ret:
             self.num_pages += ret["num_pages"]
         for k, v in ret["language_counts"].items():
@@ -271,3 +235,10 @@ def alias_info(name, new_code, kind, old_code, use_code, not_use_code):
             )
         else:
             self.LANGUAGES_BY_NAME[lang_name] = lang_code
+
+    def load_edition_settings(self):
+        file_path = self.data_folder / "config.json"
+        if file_path.exists():
+            with file_path.open(encoding="utf-8") as f:
+                for key, value in json.load(f).items():
+                    setattr(self, key, value)
diff --git a/src/wiktextract/data/fr/config.json b/src/wiktextract/data/fr/config.json
new file mode 100644
index 00000000..91a7ba44
--- /dev/null
+++ b/src/wiktextract/data/fr/config.json
@@ -0,0 +1,4 @@
+{
+  "analyze_templates": false,
+  "extract_thesaurus_pages": false
+}
diff --git a/src/wiktextract/extractor/fr/page.py b/src/wiktextract/extractor/fr/page.py
index 0f797e4a..73a2b1b3 100644
--- a/src/wiktextract/extractor/fr/page.py
+++ b/src/wiktextract/extractor/fr/page.py
@@ -152,12 +152,6 @@ def parse_page(
         page_text,
         pre_expand=True,
         additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
-        do_not_pre_expand={
-            "trad-début",  # don't expand translation start/end tempaltes
-            "trad-fin",
-            "(",  # similar to "trad-debut", pre-expand breaks node structre
-            ")",
-        },
     )

     page_data = []
diff --git a/src/wiktextract/page.py b/src/wiktextract/page.py
index 4c5cea9d..d7d43c4d 100644
--- a/src/wiktextract/page.py
+++ b/src/wiktextract/page.py
@@ -36,7 +36,8 @@ def parse_page(
     captured."""
     page_extractor_mod = import_extractor_module(wxr.wtp.lang_code, "page")
     page_data = page_extractor_mod.parse_page(wxr, page_title, page_text)
-    inject_linkages(wxr, page_data)
+    if wxr.config.extract_thesaurus_pages:
+        inject_linkages(wxr, page_data)
     if wxr.config.dump_file_lang_code == "en":
         process_categories(wxr, page_data)
     remove_duplicate_data(page_data)
diff --git a/src/wiktextract/wiktionary.py b/src/wiktextract/wiktionary.py
index 9a65a3d5..21988c90 100644
--- a/src/wiktextract/wiktionary.py
+++ b/src/wiktextract/wiktionary.py
@@ -119,6 +119,7 @@ def parse_wiktionary(
         override_folders,
         skip_extract_dump,
         save_pages_path,
+        not wxr.config.analyze_templates,
     )

     if not phase1_only:
@@ -178,7 +179,10 @@ def reprocess_wiktionary(

     # Extract thesaurus data. This iterates over thesaurus pages,
     # but is very fast.
-    if thesaurus_linkage_number(wxr.thesaurus_db_conn) == 0:
+    if (
+        wxr.config.extract_thesaurus_pages
+        and thesaurus_linkage_number(wxr.thesaurus_db_conn) == 0
+    ):
         extract_thesaurus_data(wxr, num_processes)

     emitted = set()
diff --git a/src/wiktextract/wiktwords.py b/src/wiktextract/wiktwords.py
index fc4b4740..4cdc5067 100755
--- a/src/wiktextract/wiktwords.py
+++ b/src/wiktextract/wiktwords.py
@@ -83,6 +83,7 @@ def process_single_page(
     # is disabled by default to speed up single page testing.
     if (
         args.use_thesaurus
+        and wxr.config.extract_thesaurus_pages
         and thesaurus_linkage_number(wxr.thesaurus_db_conn) == 0
     ):
         extract_thesaurus_data(wxr)
@@ -507,7 +508,8 @@ def main():
             json.dump(tree, f, indent=2, sort_keys=True)

     wxr.wtp.close_db_conn()
-    close_thesaurus_db(wxr.thesaurus_db_path, wxr.thesaurus_db_conn)
+    if wxr.config.extract_thesaurus_pages:
+        close_thesaurus_db(wxr.thesaurus_db_path, wxr.thesaurus_db_conn)

     if args.profile:
         pr.disable()
diff --git a/src/wiktextract/wxr_context.py b/src/wiktextract/wxr_context.py
index a6b395d0..794fc777 100644
--- a/src/wiktextract/wxr_context.py
+++ b/src/wiktextract/wxr_context.py
@@ -28,15 +28,20 @@ def __init__(self, wtp: Wtp, config: WiktionaryConfig):
         self.thesaurus_db_path = wtp.db_path.with_stem(
             f"{wtp.db_path.stem}_thesaurus"
         )
-        self.thesaurus_db_conn = init_thesaurus_db(self.thesaurus_db_path)
+        self.thesaurus_db_conn = (
+            init_thesaurus_db(self.thesaurus_db_path)
+            if config.extract_thesaurus_pages
+            else None
+        )

     def reconnect_databases(self, check_same_thread: bool = True) -> None:
         # `multiprocessing.pool.Pool.imap()` runs in another thread, if the db
         # connection is used to create iterable data for `imap`,
         # `check_same_thread` must be `False`.
-        self.thesaurus_db_conn = sqlite3.connect(
-            self.thesaurus_db_path, check_same_thread=check_same_thread
-        )
+        if self.config.extract_thesaurus_pages:
+            self.thesaurus_db_conn = sqlite3.connect(
+                self.thesaurus_db_path, check_same_thread=check_same_thread
+            )
         self.wtp.db_conn = sqlite3.connect(
             self.wtp.db_path, check_same_thread=check_same_thread
         )
@@ -44,7 +49,8 @@ def reconnect_databases(self, check_same_thread: bool = True) -> None:
     def remove_unpicklable_objects(self) -> None:
         # remove these variables before passing the `WiktextractContext` object
         # to worker processes
-        self.thesaurus_db_conn.close()
+        if self.config.extract_thesaurus_pages:
+            self.thesaurus_db_conn.close()
         self.thesaurus_db_conn = None
         self.wtp.db_conn.close()
         self.wtp.db_conn = None
diff --git a/tests/test_fr_etymology.py b/tests/test_fr_etymology.py
index 7275cd61..ee25cd3f 100644
--- a/tests/test_fr_etymology.py
+++ b/tests/test_fr_etymology.py
@@ -8,7 +8,6 @@
     extract_etymology,
     insert_etymology_data,
 )
-from wiktextract.thesaurus import close_thesaurus_db
 from wiktextract.wxr_context import WiktextractContext


@@ -20,9 +19,6 @@ def setUp(self) -> None:

     def tearDown(self) -> None:
         self.wxr.wtp.close_db_conn()
-        close_thesaurus_db(
-            self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
-        )

     def test_ebauche_etym(self):
         # https://fr.wiktionary.org/wiki/Hörsaal
diff --git a/tests/test_fr_form_line.py b/tests/test_fr_form_line.py
index 5c8ba1d6..f7bbcd72 100644
--- a/tests/test_fr_form_line.py
+++ b/tests/test_fr_form_line.py
@@ -9,7 +9,6 @@
     extract_form_line,
     process_zh_mot_template,
 )
-from wiktextract.thesaurus import close_thesaurus_db
 from wiktextract.wxr_context import WiktextractContext


@@ -21,9 +20,6 @@ def setUp(self) -> None:

     def tearDown(self) -> None:
         self.wxr.wtp.close_db_conn()
-        close_thesaurus_db(
-            self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
-        )

     @patch(
         "wiktextract.extractor.fr.pronunciation.clean_node",
diff --git a/tests/test_fr_gloss.py b/tests/test_fr_gloss.py
index 5f5d11d6..845bb34a 100644
--- a/tests/test_fr_gloss.py
+++ b/tests/test_fr_gloss.py
@@ -7,7 +7,6 @@
 from wiktextract.config import WiktionaryConfig
 from wiktextract.extractor.fr.gloss import extract_gloss
 from wiktextract.extractor.fr.page import process_pos_block
-from wiktextract.thesaurus import close_thesaurus_db
 from wiktextract.wxr_context import WiktextractContext


@@ -19,9 +18,6 @@ def setUp(self) -> None:

     def tearDown(self) -> None:
         self.wxr.wtp.close_db_conn()
-        close_thesaurus_db(
-            self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
-        )

     @patch(
         "wikitextprocessor.Wtp.get_page",
diff --git a/tests/test_fr_inflection.py b/tests/test_fr_inflection.py
index abb1042f..f793aea7 100644
--- a/tests/test_fr_inflection.py
+++ b/tests/test_fr_inflection.py
@@ -7,7 +7,6 @@

 from wiktextract.config import WiktionaryConfig
 from wiktextract.extractor.fr.inflection import extract_inflection
-from wiktextract.thesaurus import close_thesaurus_db
 from wiktextract.wxr_context import WiktextractContext


@@ -19,9 +18,6 @@ def setUp(self) -> None:

     def tearDown(self) -> None:
         self.wxr.wtp.close_db_conn()
-        close_thesaurus_db(
-            self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
-        )

     @patch(
         "wikitextprocessor.Wtp.node_to_wikitext",
diff --git a/tests/test_fr_linkage.py b/tests/test_fr_linkage.py
index 3b0fbb5f..6b8b2f70 100644
--- a/tests/test_fr_linkage.py
+++ b/tests/test_fr_linkage.py
@@ -5,7 +5,6 @@

 from wiktextract.config import WiktionaryConfig
 from wiktextract.extractor.fr.linkage import extract_linkage
-from wiktextract.thesaurus import close_thesaurus_db
 from wiktextract.wxr_context import WiktextractContext


@@ -17,9 +16,6 @@ def setUp(self) -> None:

     def tearDown(self) -> None:
         self.wxr.wtp.close_db_conn()
-        close_thesaurus_db(
-            self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
-        )

     def test_tags(self):
         page_data = [defaultdict(list)]
diff --git a/tests/test_fr_page.py b/tests/test_fr_page.py
index c481027b..e372d5e4 100644
--- a/tests/test_fr_page.py
+++ b/tests/test_fr_page.py
@@ -8,7 +8,6 @@

 from wiktextract.config import WiktionaryConfig
 from wiktextract.extractor.fr.page import parse_page
-from wiktextract.thesaurus import close_thesaurus_db
 from wiktextract.wxr_context import WiktextractContext


@@ -23,9 +22,6 @@ def setUp(self):

     def tearDown(self) -> None:
         self.wxr.wtp.close_db_conn()
-        close_thesaurus_db(
-            self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
-        )

     def test_fr_parse_page(self):
         self.wxr.wtp.add_page("Modèle:langue", 10, "Français")
diff --git a/tests/test_fr_pronunciation.py b/tests/test_fr_pronunciation.py
index f2b665ca..fc771cb9 100644
--- a/tests/test_fr_pronunciation.py
+++ b/tests/test_fr_pronunciation.py
@@ -6,7 +6,6 @@

 from wiktextract.config import WiktionaryConfig
 from wiktextract.extractor.fr.pronunciation import extract_pronunciation
-from wiktextract.thesaurus import close_thesaurus_db
 from wiktextract.wxr_context import WiktextractContext


@@ -18,9 +17,6 @@ def setUp(self) -> None:

     def tearDown(self) -> None:
         self.wxr.wtp.close_db_conn()
-        close_thesaurus_db(
-            self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
-        )

     def test_pron_list(self):
         page_data = [
diff --git a/tests/test_fr_translation.py b/tests/test_fr_translation.py
index b687018f..a161d7fa 100644
--- a/tests/test_fr_translation.py
+++ b/tests/test_fr_translation.py
@@ -5,7 +5,6 @@

 from wiktextract.config import WiktionaryConfig
 from wiktextract.extractor.fr.translation import extract_translation
-from wiktextract.thesaurus import close_thesaurus_db
 from wiktextract.wxr_context import WiktextractContext


@@ -17,9 +16,6 @@ def setUp(self) -> None:

     def tearDown(self) -> None:
         self.wxr.wtp.close_db_conn()
-        close_thesaurus_db(
-            self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
-        )

     def test_italic_tag(self):
         self.wxr.wtp.start_page("")
From b2b54d39cfc051b7e3213851d08932bba4f59d9e Mon Sep 17 00:00:00 2001
From: xxyzz
Date: Mon, 16 Oct 2023 15:53:51 +0800
Subject: [PATCH 3/4] Only add the italic node as a tag if it's between
 parentheses

---
 src/wiktextract/extractor/fr/gloss.py      | 28 ++++++++++++++++------
 src/wiktextract/extractor/fr/inflection.py |  5 +++-
 tests/test_fr_gloss.py                     | 19 +++++++++++++++
 3 files changed, 44 insertions(+), 8 deletions(-)

diff --git a/src/wiktextract/extractor/fr/gloss.py b/src/wiktextract/extractor/fr/gloss.py
index bcb03994..c3eb1f8c 100644
--- a/src/wiktextract/extractor/fr/gloss.py
+++ b/src/wiktextract/extractor/fr/gloss.py
@@ -43,15 +43,29 @@ def extract_gloss(
         )

         gloss_only_nodes = []
-        # extract italic tags
-        for node in gloss_nodes[gloss_start:]:
-            if isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
+        tag_indexes = set()
+        for index, node in enumerate(gloss_nodes[gloss_start:], gloss_start):
+            # if an italic node is between parentheses then it's a tag; also
+            # don't add the parenthesis strings to `gloss_only_nodes`
+            if (
+                isinstance(node, WikiNode)
+                and node.kind == NodeKind.ITALIC
+                and index > gloss_start
+                and isinstance(gloss_nodes[index - 1], str)
+                and gloss_nodes[index - 1].strip() == "("
+                and index + 1 < len(gloss_nodes)
+                and isinstance(gloss_nodes[index + 1], str)
+                and gloss_nodes[index + 1].strip() == ")"
+            ):
                 gloss_data["tags"].append(clean_node(wxr, None, node))
+                tag_indexes |= {index - 1, index, index + 1}
                 continue
-            elif isinstance(node, str) and node.strip() in ["(", ")"]:
-                # remove parentheses around italic node
-                continue
-            gloss_only_nodes.append(node)
+
+        gloss_only_nodes = [
+            node
+            for index, node in enumerate(gloss_nodes[gloss_start:], gloss_start)
+            if index not in tag_indexes
+        ]
         gloss_text = clean_node(wxr, gloss_data, gloss_only_nodes)
         gloss_data["glosses"] = [gloss_text]
         extract_examples(wxr, gloss_data, list_item_node)
diff --git a/src/wiktextract/extractor/fr/inflection.py b/src/wiktextract/extractor/fr/inflection.py
index c9ee14e3..d65f3d88 100644
--- a/src/wiktextract/extractor/fr/inflection.py
+++ b/src/wiktextract/extractor/fr/inflection.py
@@ -36,6 +36,7 @@ def extract_inflection(
         }
     )

+
 @dataclass
 class ColspanHeader:
     text: str
@@ -123,7 +124,9 @@ def process_inflection_table(
                     )
                 else:
                     column_headers.append(table_header_text)
-                column_cell_index += int(table_cell.attrs.get("colspan", 1))
+                column_cell_index += int(
+                    table_cell.attrs.get("colspan", 1)
+                )
             elif row_num > 0:
                 row_headers.append(table_header_text)
                 if "rowspan" in table_cell.attrs:
diff --git a/tests/test_fr_gloss.py b/tests/test_fr_gloss.py
index 845bb34a..95d601c0 100644
--- a/tests/test_fr_gloss.py
+++ b/tests/test_fr_gloss.py
@@ -196,3 +196,22 @@ def test_italic_tag(self):
             }
         ],
     )
+
+    def test_not_italic_tag(self):
+        # https://fr.wiktionary.org/wiki/bec-en-ciseaux
+        self.wxr.wtp.start_page("bec-en-ciseaux")
+        root = self.wxr.wtp.parse(
+            "# [[oiseau|Oiseau]] aquatique de taille moyenne du genre ''[[Rhynchops]]''."
+        )
+        page_data = [defaultdict(list)]
+        extract_gloss(self.wxr, page_data, root.children[0])
+        self.assertEqual(
+            page_data,
+            [
+                {
+                    "senses": [
+                        {"glosses": ["Oiseau aquatique de taille moyenne du genre Rhynchops."]}
+                    ]
+                }
+            ],
+        )

From d7d04bbd4ee06881a71f3a21be9984b4278aa092 Mon Sep 17 00:00:00 2001
From: xxyzz
Date: Mon, 16 Oct 2023 17:12:58 +0800
Subject: [PATCH 4/4] Preserve whitespace in gloss nodes

---
 src/wiktextract/extractor/fr/gloss.py | 31 ++++++++++++++++++---------
 tests/test_fr_gloss.py                | 20 +++++++++++++++++
 2 files changed, 41 insertions(+), 10 deletions(-)

diff --git a/src/wiktextract/extractor/fr/gloss.py b/src/wiktextract/extractor/fr/gloss.py
index c3eb1f8c..f93fd0cd 100644
--- a/src/wiktextract/extractor/fr/gloss.py
+++ b/src/wiktextract/extractor/fr/gloss.py
@@ -14,19 +14,30 @@ def extract_gloss(
     list_node: WikiNode,
 ) -> None:
     for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
-        gloss_nodes = list(list_item_node.invert_find_child(NodeKind.LIST))
+        gloss_nodes = list(
+            list_item_node.invert_find_child(
+                NodeKind.LIST, include_empty_str=True
+            )
+        )
+        # remove a leading whitespace-only string from the list item nodes
+        if (
+            len(gloss_nodes) > 0
+            and isinstance(gloss_nodes[0], str)
+            and len(gloss_nodes[0].strip()) == 0
+        ):
+            gloss_nodes = gloss_nodes[1:]
+
         gloss_data = defaultdict(list)
         gloss_start = 0
         # process modifier, theme tempaltes before gloss text
         # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Précisions_de_sens
-        if (
-            len(gloss_nodes) > 0
-            and isinstance(gloss_nodes[0], WikiNode)
-            and gloss_nodes[0].kind == NodeKind.TEMPLATE
-        ):
+        if len(gloss_nodes) > 0 and isinstance(gloss_nodes[0], TemplateNode):
             gloss_start = 1
             for index, gloss_node in enumerate(gloss_nodes[1:], 1):
-                if (
+                if isinstance(gloss_node, str) and len(gloss_node.strip()) == 0:
+                    # ignore empty string
+                    gloss_start = index + 1
+                elif (
                     not isinstance(gloss_node, WikiNode)
                     or gloss_node.kind != NodeKind.TEMPLATE
                     # template "variante de" is not a modifier
@@ -38,9 +49,9 @@ def extract_gloss(
                 else:
                     gloss_start = index + 1
             for tag_node in gloss_nodes[:gloss_start]:
-                gloss_data["tags"].append(
-                    clean_node(wxr, gloss_data, tag_node).strip("()")
-                )
+                tag = clean_node(wxr, gloss_data, tag_node).strip("() ")
+                if len(tag) > 0:
+                    gloss_data["tags"].append(tag)
diff --git a/tests/test_fr_gloss.py b/tests/test_fr_gloss.py
index 95d601c0..84eccd86 100644
--- a/tests/test_fr_gloss.py
+++ b/tests/test_fr_gloss.py
@@ -215,3 +215,23 @@ def test_not_italic_tag(self):
             }
         ],
     )
+
+    def test_preserve_space_between_tags(self):
+        # https://fr.wiktionary.org/wiki/becs-en-ciseaux
+        # the space between the italic node and the link node should be preserved
+        self.wxr.wtp.start_page("becs-en-ciseaux")
+        root = self.wxr.wtp.parse(
+            "# ''Pluriel de'' [[bec-en-ciseaux]]."
+        )
+        page_data = [defaultdict(list)]
+        extract_gloss(self.wxr, page_data, root.children[0])
+        self.assertEqual(
+            page_data,
+            [
+                {
+                    "senses": [
+                        {"glosses": ["Pluriel de bec-en-ciseaux."]}
+                    ]
+                }
+            ],
+        )
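
Editor's note, appended for illustration (not part of the patches): a short
sketch of the whitespace rule from PATCH 4/4 for a gloss like
"''Pluriel de'' [[bec-en-ciseaux]].", using placeholder strings where the
real code holds WikiNode objects:

    # With include_empty_str=True, the list item keeps its whitespace-only
    # string children, roughly:
    gloss_nodes = [" ", "<italic: Pluriel de>", " ", "<link: bec-en-ciseaux>", "."]

    # Only a leading all-whitespace string is dropped; the separator between
    # the italic node and the link survives, so the gloss renders as
    # "Pluriel de bec-en-ciseaux." rather than "Pluriel debec-en-ciseaux.".
    if (
        len(gloss_nodes) > 0
        and isinstance(gloss_nodes[0], str)
        and len(gloss_nodes[0].strip()) == 0
    ):
        gloss_nodes = gloss_nodes[1:]

    assert gloss_nodes == ["<italic: Pluriel de>", " ", "<link: bec-en-ciseaux>", "."]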