Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Disable analyze templates and extract thesaurus pages features for French Wiktionary #362

Merged
merged 4 commits into from
Oct 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion languages/get_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,8 @@ def get_lang_data(lang_code: str, dump_file: str, db_path: Path | None) -> None:
) as fout:
json.dump(data, fout, indent=2, ensure_ascii=False, sort_keys=True)
wxr.wtp.close_db_conn()
close_thesaurus_db(wxr.thesaurus_db_path, wxr.thesaurus_db_conn)
if wxr.config.extract_thesaurus_pages:
close_thesaurus_db(wxr.thesaurus_db_path, wxr.thesaurus_db_conn)


if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion src/wiktextract/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -1131,7 +1131,7 @@ def expand_group(v):
if a == "2":
v = "√"
elif a == "3":
v = "∛",
v = "∛"
elif a == "4":
v = "∜"
else:
Expand Down
61 changes: 16 additions & 45 deletions src/wiktextract/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,28 +6,15 @@
import collections
import json
import sys
from typing import TYPE_CHECKING, Callable, Optional
from typing import Callable, Optional

from wikitextprocessor.core import CollatedErrorReturnData

if sys.version_info < (3, 10):
from importlib_resources import files
else:
from importlib.resources import files

if TYPE_CHECKING:
from wikitextprocessor.core import StatsData


def int_dict():
return collections.defaultdict(int)


def int_dict_dict():
return collections.defaultdict(int_dict)


def list_dict():
return collections.defaultdict(list)


class WiktionaryConfig:
"""This class holds configuration data for Wiktionary parsing."""
Expand Down Expand Up @@ -66,6 +53,8 @@ class WiktionaryConfig:
"LANGUAGES_BY_NAME",
"LANGUAGES_BY_CODE",
"FORM_OF_TEMPLATES",
"analyze_templates",
"extract_thesaurus_pages",
)

def __init__(
Expand Down Expand Up @@ -130,38 +119,13 @@ def __init__(
self.set_attr_from_json(
"FORM_OF_TEMPLATES", "form_of_templates.json"
)
if dump_file_lang_code == "fr":
self.set_attr_from_json("FR_FORM_TABLES", "form_tables.json")
if dump_file_lang_code == "de":
self.set_attr_from_json("DE_FORM_TABLES", "form_templates.json")
self.analyze_templates = True # find templates that need pre-expand
self.extract_thesaurus_pages = True
self.load_edition_settings()

def to_kwargs(self):
return {
"dump_file_lang_code": self.dump_file_lang_code,
"capture_language_codes": self.capture_language_codes,
"capture_translations": self.capture_translations,
"capture_pronunciation": self.capture_pronunciation,
"capture_linkages": self.capture_linkages,
"capture_compounds": self.capture_compounds,
"capture_redirects": self.capture_redirects,
"capture_examples": self.capture_examples,
"capture_etymologies": self.capture_etymologies,
"capture_inflections": self.capture_inflections,
"capture_descendants": self.capture_descendants,
"verbose": self.verbose,
"expand_tables": self.expand_tables,
}

def to_return(self) -> "StatsData":
return {
"num_pages": self.num_pages,
"language_counts": self.language_counts,
"pos_counts": self.pos_counts,
"section_counts": self.section_counts,
}

def merge_return(self, ret):
assert isinstance(ret, dict)
def merge_return(self, ret: CollatedErrorReturnData):
if "num_pages" in ret:
self.num_pages += ret["num_pages"]
for k, v in ret["language_counts"].items():
Expand Down Expand Up @@ -271,3 +235,10 @@ def alias_info(name, new_code, kind, old_code, use_code, not_use_code):
)
else:
self.LANGUAGES_BY_NAME[lang_name] = lang_code

def load_edition_settings(self) -> None:
    """Apply per-edition overrides to this configuration.

    Looks for a ``config.json`` file inside the edition's data folder
    (``self.data_folder``).  When the file exists, every top-level
    key/value pair in it is set as an attribute on this config object,
    overriding the defaults; when it does not exist, nothing changes.
    """
    settings_file = self.data_folder / "config.json"
    if not settings_file.exists():
        return
    with settings_file.open(encoding="utf-8") as settings:
        overrides = json.load(settings)
    for option, value in overrides.items():
        setattr(self, option, value)
4 changes: 4 additions & 0 deletions src/wiktextract/data/fr/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"analyze_templates": false,
"extract_thesaurus_pages": false
}
59 changes: 42 additions & 17 deletions src/wiktextract/extractor/fr/gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,30 @@ def extract_gloss(
list_node: WikiNode,
) -> None:
for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
gloss_nodes = list(list_item_node.invert_find_child(NodeKind.LIST))
gloss_nodes = list(
list_item_node.invert_find_child(
NodeKind.LIST, include_empty_str=True
)
)
# remove the first empty space in list item nodes
if (
len(gloss_nodes) > 0
and isinstance(gloss_nodes[0], str)
and len(gloss_nodes[0].strip()) == 0
):
gloss_nodes = gloss_nodes[1:]

gloss_data = defaultdict(list)
gloss_start = 0
# process modifier and theme templates before the gloss text
# https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Précisions_de_sens
if (
len(gloss_nodes) > 0
and isinstance(gloss_nodes[0], WikiNode)
and gloss_nodes[0].kind == NodeKind.TEMPLATE
):
if len(gloss_nodes) > 0 and isinstance(gloss_nodes[0], TemplateNode):
gloss_start = 1
for index, gloss_node in enumerate(gloss_nodes[1:], 1):
if (
if isinstance(gloss_node, str) and len(gloss_node.strip()) == 0:
# ignore empty string
gloss_start = index + 1
elif (
not isinstance(gloss_node, WikiNode)
or gloss_node.kind != NodeKind.TEMPLATE
# template "variante de" is not a modifier
Expand All @@ -38,20 +49,34 @@ def extract_gloss(
else:
gloss_start = index + 1
for tag_node in gloss_nodes[:gloss_start]:
gloss_data["tags"].append(
clean_node(wxr, gloss_data, tag_node).strip("()")
)
tag = clean_node(wxr, gloss_data, tag_node).strip("() ")
if len(tag) > 0:
gloss_data["tags"].append(tag)

gloss_only_nodes = []
# extract italic tags
for node in gloss_nodes[gloss_start:]:
if isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
tag_indexes = set()
for index, node in enumerate(gloss_nodes[gloss_start:], gloss_start):
# if an italic node is between parentheses then it's a tag; also
# don't add the parenthesis strings to `gloss_only_nodes`
if (
isinstance(node, WikiNode)
and node.kind == NodeKind.ITALIC
and index > gloss_start
and isinstance(gloss_nodes[index - 1], str)
and gloss_nodes[index - 1].strip() == "("
and index + 1 < len(gloss_nodes)
and isinstance(gloss_nodes[index + 1], str)
and gloss_nodes[index + 1].strip() == ")"
):
gloss_data["tags"].append(clean_node(wxr, None, node))
tag_indexes |= {index - 1, index, index + 1}
continue
elif isinstance(node, str) and node.strip() in ["(", ")"]:
# remove parentheses around italic node
continue
gloss_only_nodes.append(node)

gloss_only_nodes = [
node
for index, node in enumerate(gloss_nodes[gloss_start:], gloss_start)
if index not in tag_indexes
]
gloss_text = clean_node(wxr, gloss_data, gloss_only_nodes)
gloss_data["glosses"] = [gloss_text]
extract_examples(wxr, gloss_data, list_item_node)
Expand Down
5 changes: 4 additions & 1 deletion src/wiktextract/extractor/fr/inflection.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def extract_inflection(
}
)


@dataclass
class ColspanHeader:
text: str
Expand Down Expand Up @@ -123,7 +124,9 @@ def process_inflection_table(
)
else:
column_headers.append(table_header_text)
column_cell_index += int(table_cell.attrs.get("colspan", 1))
column_cell_index += int(
table_cell.attrs.get("colspan", 1)
)
elif row_num > 0:
row_headers.append(table_header_text)
if "rowspan" in table_cell.attrs:
Expand Down
6 changes: 0 additions & 6 deletions src/wiktextract/extractor/fr/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,12 +152,6 @@ def parse_page(
page_text,
pre_expand=True,
additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
do_not_pre_expand={
"trad-début", # don't expand translation start/end templates
"trad-fin",
"(", # similar to "trad-début", pre-expand breaks node structure
")",
},
)

page_data = []
Expand Down
6 changes: 3 additions & 3 deletions src/wiktextract/extractor/fr/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,9 @@ def process_translation_templates(
# translation box start: https://fr.wiktionary.org/wiki/Modèle:trad-début
sense_parameter = template_node.template_parameters.get(1)
if sense_parameter is not None:
base_translation_data["sense"] = clean_node(
wxr, None, sense_parameter
)
sense_text = clean_node(wxr, None, sense_parameter)
if len(sense_text) > 0:
base_translation_data["sense"] = sense_text
elif template_node.template_name == "T":
# Translation language: https://fr.wiktionary.org/wiki/Modèle:T
base_translation_data["code"] = template_node.template_parameters.get(1)
Expand Down
4 changes: 2 additions & 2 deletions src/wiktextract/form_descriptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@
r"\s+\((({}): ([^()]|\([^()]+\))+)\)"
.format("|".join(re.escape(x.removeprefix("?"))
for x in sorted(xlat_head_map.values(),
key=lambda x: len(x),
key=len,
reverse=True)
if x and not x.startswith("class-"))))

Expand All @@ -179,7 +179,7 @@
"|".join(re.escape(x) for x in
# The sort is to put longer ones first, preferring them in
# the regexp match
sorted(xlat_head_map.keys(), key=lambda x: len(x),
sorted(xlat_head_map.keys(), key=len,
reverse=True)))
head_final_re = re.compile(head_final_re_text + "$")

Expand Down
Loading