Don't extract thesaurus pages or analyze templates for French Wiktionary

French Wiktionary's thesaurus pages contain too many words that are not synonyms and don't use a common page layout, and word entry pages already have a synonym section. Disabling this feature removes many error messages from the default English thesaurus extractor. Pre-expanding templates just makes extracting French Wiktionary harder, and I haven't seen any template that should be pre-expanded.
tatuylonen · Oct 13, 2023 · 7e4451f · 7e4451f
1 parent 2b4f32d
commit 7e4451f
Show file tree

Hide file tree

Showing 16 changed files with 43 additions and 92 deletions.
diff --git a/languages/get_data.py b/languages/get_data.py
@@ -83,7 +83,8 @@ def get_lang_data(lang_code: str, dump_file: str, db_path: Path | None) -> None:
     ) as fout:
         json.dump(data, fout, indent=2, ensure_ascii=False, sort_keys=True)
     wxr.wtp.close_db_conn()
-    close_thesaurus_db(wxr.thesaurus_db_path, wxr.thesaurus_db_conn)
+    if wxr.config.extract_thesaurus_pages:
+        close_thesaurus_db(wxr.thesaurus_db_path, wxr.thesaurus_db_conn)
 
 
 if __name__ == "__main__":

diff --git a/src/wiktextract/config.py b/src/wiktextract/config.py
@@ -6,28 +6,15 @@
 import collections
 import json
 import sys
-from typing import TYPE_CHECKING, Callable, Optional
+from typing import Callable, Optional
+
+from wikitextprocessor.core import CollatedErrorReturnData
 
 if sys.version_info < (3, 10):
     from importlib_resources import files
 else:
     from importlib.resources import files
 
-if TYPE_CHECKING:
-    from wikitextprocessor.core import StatsData
-
-
-def int_dict():
-    return collections.defaultdict(int)
-
-
-def int_dict_dict():
-    return collections.defaultdict(int_dict)
-
-
-def list_dict():
-    return collections.defaultdict(list)
-
 
 class WiktionaryConfig:
     """This class holds configuration data for Wiktionary parsing."""
@@ -66,6 +53,8 @@ class WiktionaryConfig:
         "LANGUAGES_BY_NAME",
         "LANGUAGES_BY_CODE",
         "FORM_OF_TEMPLATES",
+        "analyze_templates",
+        "extract_thesaurus_pages",
     )
 
     def __init__(
@@ -130,38 +119,13 @@ def __init__(
             self.set_attr_from_json(
                 "FORM_OF_TEMPLATES", "form_of_templates.json"
             )
-        if dump_file_lang_code == "fr":
-            self.set_attr_from_json("FR_FORM_TABLES", "form_tables.json")
         if dump_file_lang_code == "de":
             self.set_attr_from_json("DE_FORM_TABLES", "form_templates.json")
+        self.analyze_templates = True  # find templates that need pre-expand
+        self.extract_thesaurus_pages = True
+        self.load_edition_settings()
 
-    def to_kwargs(self):
-        return {
-            "dump_file_lang_code": self.dump_file_lang_code,
-            "capture_language_codes": self.capture_language_codes,
-            "capture_translations": self.capture_translations,
-            "capture_pronunciation": self.capture_pronunciation,
-            "capture_linkages": self.capture_linkages,
-            "capture_compounds": self.capture_compounds,
-            "capture_redirects": self.capture_redirects,
-            "capture_examples": self.capture_examples,
-            "capture_etymologies": self.capture_etymologies,
-            "capture_inflections": self.capture_inflections,
-            "capture_descendants": self.capture_descendants,
-            "verbose": self.verbose,
-            "expand_tables": self.expand_tables,
-        }
-
-    def to_return(self) -> "StatsData":
-        return {
-            "num_pages": self.num_pages,
-            "language_counts": self.language_counts,
-            "pos_counts": self.pos_counts,
-            "section_counts": self.section_counts,
-        }
-
-    def merge_return(self, ret):
-        assert isinstance(ret, dict)
+    def merge_return(self, ret: CollatedErrorReturnData):
         if "num_pages" in ret:
             self.num_pages += ret["num_pages"]
             for k, v in ret["language_counts"].items():
@@ -271,3 +235,10 @@ def alias_info(name, new_code, kind, old_code, use_code, not_use_code):
                         )
                 else:
                     self.LANGUAGES_BY_NAME[lang_name] = lang_code
+
+    def load_edition_settings(self):
+        file_path = self.data_folder / "config.json"
+        if file_path.exists():
+            with file_path.open(encoding="utf-8") as f:
+                for key, value in json.load(f).items():
+                    setattr(self, key, value)
diff --git a/src/wiktextract/data/fr/config.json b/src/wiktextract/data/fr/config.json
@@ -0,0 +1,4 @@
+{
+  "analyze_templates": false,
+  "extract_thesaurus_pages": false
+}
diff --git a/src/wiktextract/extractor/fr/page.py b/src/wiktextract/extractor/fr/page.py
@@ -152,12 +152,6 @@ def parse_page(
         page_text,
         pre_expand=True,
         additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
-        do_not_pre_expand={
-            "trad-début",  # don't expand translation start/end tempaltes
-            "trad-fin",
-            "(",  # similar to "trad-debut", pre-expand breaks node structre
-            ")",
-        },
     )
 
     page_data = []

diff --git a/src/wiktextract/page.py b/src/wiktextract/page.py
@@ -36,7 +36,8 @@ def parse_page(
     captured."""
     page_extractor_mod = import_extractor_module(wxr.wtp.lang_code, "page")
     page_data = page_extractor_mod.parse_page(wxr, page_title, page_text)
-    inject_linkages(wxr, page_data)
+    if wxr.config.extract_thesaurus_pages:
+        inject_linkages(wxr, page_data)
     if wxr.config.dump_file_lang_code == "en":
         process_categories(wxr, page_data)
     remove_duplicate_data(page_data)

diff --git a/src/wiktextract/wiktionary.py b/src/wiktextract/wiktionary.py
@@ -119,6 +119,7 @@ def parse_wiktionary(
         override_folders,
         skip_extract_dump,
         save_pages_path,
+        not wxr.config.analyze_templates,
     )
 
     if not phase1_only:
@@ -178,7 +179,10 @@ def reprocess_wiktionary(
 
     # Extract thesaurus data. This iterates over thesaurus pages,
     # but is very fast.
-    if thesaurus_linkage_number(wxr.thesaurus_db_conn) == 0:
+    if (
+        wxr.config.extract_thesaurus_pages
+        and thesaurus_linkage_number(wxr.thesaurus_db_conn) == 0
+    ):
         extract_thesaurus_data(wxr, num_processes)
 
     emitted = set()

diff --git a/src/wiktextract/wiktwords.py b/src/wiktextract/wiktwords.py
@@ -83,6 +83,7 @@ def process_single_page(
     # is disabled by default to speed up single page testing.
     if (
         args.use_thesaurus
+        and wxr.config.extract_thesaurus_pages
         and thesaurus_linkage_number(wxr.thesaurus_db_conn) == 0
     ):
         extract_thesaurus_data(wxr)
@@ -507,7 +508,8 @@ def main():
             json.dump(tree, f, indent=2, sort_keys=True)
 
     wxr.wtp.close_db_conn()
-    close_thesaurus_db(wxr.thesaurus_db_path, wxr.thesaurus_db_conn)
+    if wxr.config.extract_thesaurus_pages:
+        close_thesaurus_db(wxr.thesaurus_db_path, wxr.thesaurus_db_conn)
 
     if args.profile:
         pr.disable()

diff --git a/src/wiktextract/wxr_context.py b/src/wiktextract/wxr_context.py
@@ -28,23 +28,29 @@ def __init__(self, wtp: Wtp, config: WiktionaryConfig):
         self.thesaurus_db_path = wtp.db_path.with_stem(
             f"{wtp.db_path.stem}_thesaurus"
         )
-        self.thesaurus_db_conn = init_thesaurus_db(self.thesaurus_db_path)
+        self.thesaurus_db_conn = (
+            init_thesaurus_db(self.thesaurus_db_path)
+            if config.extract_thesaurus_pages
+            else None
+        )
 
     def reconnect_databases(self, check_same_thread: bool = True) -> None:
         # `multiprocessing.pool.Pool.imap()` runs in another thread, if the db
         # connection is used to create iterable data for `imap`,
         # `check_same_thread` must be `False`.
-        self.thesaurus_db_conn = sqlite3.connect(
-            self.thesaurus_db_path, check_same_thread=check_same_thread
-        )
+        if self.config.extract_thesaurus_pages:
+            self.thesaurus_db_conn = sqlite3.connect(
+                self.thesaurus_db_path, check_same_thread=check_same_thread
+            )
         self.wtp.db_conn = sqlite3.connect(
             self.wtp.db_path, check_same_thread=check_same_thread
         )
 
     def remove_unpicklable_objects(self) -> None:
         # remove these variables before passing the `WiktextractContext` object
         # to worker processes
-        self.thesaurus_db_conn.close()
+        if self.config.extract_thesaurus_pages:
+            self.thesaurus_db_conn.close()
         self.thesaurus_db_conn = None
         self.wtp.db_conn.close()
         self.wtp.db_conn = None

diff --git a/tests/test_fr_etymology.py b/tests/test_fr_etymology.py
@@ -8,7 +8,6 @@
     extract_etymology,
     insert_etymology_data,
 )
-from wiktextract.thesaurus import close_thesaurus_db
 from wiktextract.wxr_context import WiktextractContext
 
 
@@ -20,9 +19,6 @@ def setUp(self) -> None:
 
     def tearDown(self) -> None:
         self.wxr.wtp.close_db_conn()
-        close_thesaurus_db(
-            self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
-        )
 
     def test_ebauche_etym(self):
         # https://fr.wiktionary.org/wiki/Hörsaal

diff --git a/tests/test_fr_form_line.py b/tests/test_fr_form_line.py
@@ -9,7 +9,6 @@
     extract_form_line,
     process_zh_mot_template,
 )
-from wiktextract.thesaurus import close_thesaurus_db
 from wiktextract.wxr_context import WiktextractContext
 
 
@@ -21,9 +20,6 @@ def setUp(self) -> None:
 
     def tearDown(self) -> None:
         self.wxr.wtp.close_db_conn()
-        close_thesaurus_db(
-            self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
-        )
 
     @patch(
         "wiktextract.extractor.fr.pronunciation.clean_node",

diff --git a/tests/test_fr_gloss.py b/tests/test_fr_gloss.py
@@ -7,7 +7,6 @@
 from wiktextract.config import WiktionaryConfig
 from wiktextract.extractor.fr.gloss import extract_gloss
 from wiktextract.extractor.fr.page import process_pos_block
-from wiktextract.thesaurus import close_thesaurus_db
 from wiktextract.wxr_context import WiktextractContext
 
 
@@ -19,9 +18,6 @@ def setUp(self) -> None:
 
     def tearDown(self) -> None:
         self.wxr.wtp.close_db_conn()
-        close_thesaurus_db(
-            self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
-        )
 
     @patch(
         "wikitextprocessor.Wtp.get_page",

diff --git a/tests/test_fr_inflection.py b/tests/test_fr_inflection.py
@@ -7,7 +7,6 @@
 
 from wiktextract.config import WiktionaryConfig
 from wiktextract.extractor.fr.inflection import extract_inflection
-from wiktextract.thesaurus import close_thesaurus_db
 from wiktextract.wxr_context import WiktextractContext
 
 
@@ -19,9 +18,6 @@ def setUp(self) -> None:
 
     def tearDown(self) -> None:
         self.wxr.wtp.close_db_conn()
-        close_thesaurus_db(
-            self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
-        )
 
     @patch(
         "wikitextprocessor.Wtp.node_to_wikitext",

diff --git a/tests/test_fr_linkage.py b/tests/test_fr_linkage.py
@@ -5,7 +5,6 @@
 
 from wiktextract.config import WiktionaryConfig
 from wiktextract.extractor.fr.linkage import extract_linkage
-from wiktextract.thesaurus import close_thesaurus_db
 from wiktextract.wxr_context import WiktextractContext
 
 
@@ -17,9 +16,6 @@ def setUp(self) -> None:
 
     def tearDown(self) -> None:
         self.wxr.wtp.close_db_conn()
-        close_thesaurus_db(
-            self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
-        )
 
     def test_tags(self):
         page_data = [defaultdict(list)]

diff --git a/tests/test_fr_page.py b/tests/test_fr_page.py
@@ -8,7 +8,6 @@
 
 from wiktextract.config import WiktionaryConfig
 from wiktextract.extractor.fr.page import parse_page
-from wiktextract.thesaurus import close_thesaurus_db
 from wiktextract.wxr_context import WiktextractContext
 
 
@@ -23,9 +22,6 @@ def setUp(self):
 
     def tearDown(self) -> None:
         self.wxr.wtp.close_db_conn()
-        close_thesaurus_db(
-            self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
-        )
 
     def test_fr_parse_page(self):
         self.wxr.wtp.add_page("Modèle:langue", 10, "Français")

diff --git a/tests/test_fr_pronunciation.py b/tests/test_fr_pronunciation.py
@@ -6,7 +6,6 @@
 
 from wiktextract.config import WiktionaryConfig
 from wiktextract.extractor.fr.pronunciation import extract_pronunciation
-from wiktextract.thesaurus import close_thesaurus_db
 from wiktextract.wxr_context import WiktextractContext
 
 
@@ -18,9 +17,6 @@ def setUp(self) -> None:
 
     def tearDown(self) -> None:
         self.wxr.wtp.close_db_conn()
-        close_thesaurus_db(
-            self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
-        )
 
     def test_pron_list(self):
         page_data = [

diff --git a/tests/test_fr_translation.py b/tests/test_fr_translation.py
@@ -5,7 +5,6 @@
 
 from wiktextract.config import WiktionaryConfig
 from wiktextract.extractor.fr.translation import extract_translation
-from wiktextract.thesaurus import close_thesaurus_db
 from wiktextract.wxr_context import WiktextractContext
 
 
@@ -17,9 +16,6 @@ def setUp(self) -> None:
 
     def tearDown(self) -> None:
         self.wxr.wtp.close_db_conn()
-        close_thesaurus_db(
-            self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
-        )
 
     def test_italic_tag(self):
         self.wxr.wtp.start_page("")