From 7e4451f670f25bc5ebf63cc8c67d742cf92dabb0 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Fri, 13 Oct 2023 16:55:34 +0800 Subject: [PATCH] Don't extract thesaurus and analyze templates for French Wiktionary French Wiktionary's thesaurus pages contain too many words that are not synonyms and don't use a common page layout. And word entry pages already have a synonym section. Disabling this feature removes many error messages from the default English thesaurus extractor. Pre-expanding templates just makes extracting French Wiktionary harder; I haven't seen any template that should be pre-expanded. --- languages/get_data.py | 3 +- src/wiktextract/config.py | 61 ++++++++-------------------- src/wiktextract/data/fr/config.json | 4 ++ src/wiktextract/extractor/fr/page.py | 6 --- src/wiktextract/page.py | 3 +- src/wiktextract/wiktionary.py | 6 ++- src/wiktextract/wiktwords.py | 4 +- src/wiktextract/wxr_context.py | 16 +++++--- tests/test_fr_etymology.py | 4 -- tests/test_fr_form_line.py | 4 -- tests/test_fr_gloss.py | 4 -- tests/test_fr_inflection.py | 4 -- tests/test_fr_linkage.py | 4 -- tests/test_fr_page.py | 4 -- tests/test_fr_pronunciation.py | 4 -- tests/test_fr_translation.py | 4 -- 16 files changed, 43 insertions(+), 92 deletions(-) create mode 100644 src/wiktextract/data/fr/config.json diff --git a/languages/get_data.py b/languages/get_data.py index 9de776c0..4730602e 100644 --- a/languages/get_data.py +++ b/languages/get_data.py @@ -83,7 +83,8 @@ def get_lang_data(lang_code: str, dump_file: str, db_path: Path | None) -> None: ) as fout: json.dump(data, fout, indent=2, ensure_ascii=False, sort_keys=True) wxr.wtp.close_db_conn() - close_thesaurus_db(wxr.thesaurus_db_path, wxr.thesaurus_db_conn) + if wxr.config.extract_thesaurus_pages: + close_thesaurus_db(wxr.thesaurus_db_path, wxr.thesaurus_db_conn) if __name__ == "__main__": diff --git a/src/wiktextract/config.py b/src/wiktextract/config.py index 77c30c2f..8d80276c 100644 --- a/src/wiktextract/config.py +++ 
b/src/wiktextract/config.py @@ -6,28 +6,15 @@ import collections import json import sys -from typing import TYPE_CHECKING, Callable, Optional +from typing import Callable, Optional + +from wikitextprocessor.core import CollatedErrorReturnData if sys.version_info < (3, 10): from importlib_resources import files else: from importlib.resources import files -if TYPE_CHECKING: - from wikitextprocessor.core import StatsData - - -def int_dict(): - return collections.defaultdict(int) - - -def int_dict_dict(): - return collections.defaultdict(int_dict) - - -def list_dict(): - return collections.defaultdict(list) - class WiktionaryConfig: """This class holds configuration data for Wiktionary parsing.""" @@ -66,6 +53,8 @@ class WiktionaryConfig: "LANGUAGES_BY_NAME", "LANGUAGES_BY_CODE", "FORM_OF_TEMPLATES", + "analyze_templates", + "extract_thesaurus_pages", ) def __init__( @@ -130,38 +119,13 @@ def __init__( self.set_attr_from_json( "FORM_OF_TEMPLATES", "form_of_templates.json" ) - if dump_file_lang_code == "fr": - self.set_attr_from_json("FR_FORM_TABLES", "form_tables.json") if dump_file_lang_code == "de": self.set_attr_from_json("DE_FORM_TABLES", "form_templates.json") + self.analyze_templates = True # find templates that need pre-expand + self.extract_thesaurus_pages = True + self.load_edition_settings() - def to_kwargs(self): - return { - "dump_file_lang_code": self.dump_file_lang_code, - "capture_language_codes": self.capture_language_codes, - "capture_translations": self.capture_translations, - "capture_pronunciation": self.capture_pronunciation, - "capture_linkages": self.capture_linkages, - "capture_compounds": self.capture_compounds, - "capture_redirects": self.capture_redirects, - "capture_examples": self.capture_examples, - "capture_etymologies": self.capture_etymologies, - "capture_inflections": self.capture_inflections, - "capture_descendants": self.capture_descendants, - "verbose": self.verbose, - "expand_tables": self.expand_tables, - } - - def to_return(self) 
-> "StatsData": - return { - "num_pages": self.num_pages, - "language_counts": self.language_counts, - "pos_counts": self.pos_counts, - "section_counts": self.section_counts, - } - - def merge_return(self, ret): - assert isinstance(ret, dict) + def merge_return(self, ret: CollatedErrorReturnData): if "num_pages" in ret: self.num_pages += ret["num_pages"] for k, v in ret["language_counts"].items(): @@ -271,3 +235,10 @@ def alias_info(name, new_code, kind, old_code, use_code, not_use_code): ) else: self.LANGUAGES_BY_NAME[lang_name] = lang_code + + def load_edition_settings(self): + file_path = self.data_folder / "config.json" + if file_path.exists(): + with file_path.open(encoding="utf-8") as f: + for key, value in json.load(f).items(): + setattr(self, key, value) diff --git a/src/wiktextract/data/fr/config.json b/src/wiktextract/data/fr/config.json new file mode 100644 index 00000000..91a7ba44 --- /dev/null +++ b/src/wiktextract/data/fr/config.json @@ -0,0 +1,4 @@ +{ + "analyze_templates": false, + "extract_thesaurus_pages": false +} diff --git a/src/wiktextract/extractor/fr/page.py b/src/wiktextract/extractor/fr/page.py index 0f797e4a..73a2b1b3 100644 --- a/src/wiktextract/extractor/fr/page.py +++ b/src/wiktextract/extractor/fr/page.py @@ -152,12 +152,6 @@ def parse_page( page_text, pre_expand=True, additional_expand=ADDITIONAL_EXPAND_TEMPLATES, - do_not_pre_expand={ - "trad-début", # don't expand translation start/end tempaltes - "trad-fin", - "(", # similar to "trad-debut", pre-expand breaks node structre - ")", - }, ) page_data = [] diff --git a/src/wiktextract/page.py b/src/wiktextract/page.py index 4c5cea9d..d7d43c4d 100644 --- a/src/wiktextract/page.py +++ b/src/wiktextract/page.py @@ -36,7 +36,8 @@ def parse_page( captured.""" page_extractor_mod = import_extractor_module(wxr.wtp.lang_code, "page") page_data = page_extractor_mod.parse_page(wxr, page_title, page_text) - inject_linkages(wxr, page_data) + if wxr.config.extract_thesaurus_pages: + 
inject_linkages(wxr, page_data) if wxr.config.dump_file_lang_code == "en": process_categories(wxr, page_data) remove_duplicate_data(page_data) diff --git a/src/wiktextract/wiktionary.py b/src/wiktextract/wiktionary.py index 9a65a3d5..21988c90 100644 --- a/src/wiktextract/wiktionary.py +++ b/src/wiktextract/wiktionary.py @@ -119,6 +119,7 @@ def parse_wiktionary( override_folders, skip_extract_dump, save_pages_path, + not wxr.config.analyze_templates, ) if not phase1_only: @@ -178,7 +179,10 @@ def reprocess_wiktionary( # Extract thesaurus data. This iterates over thesaurus pages, # but is very fast. - if thesaurus_linkage_number(wxr.thesaurus_db_conn) == 0: + if ( + wxr.config.extract_thesaurus_pages + and thesaurus_linkage_number(wxr.thesaurus_db_conn) == 0 + ): extract_thesaurus_data(wxr, num_processes) emitted = set() diff --git a/src/wiktextract/wiktwords.py b/src/wiktextract/wiktwords.py index fc4b4740..4cdc5067 100755 --- a/src/wiktextract/wiktwords.py +++ b/src/wiktextract/wiktwords.py @@ -83,6 +83,7 @@ def process_single_page( # is disabled by default to speed up single page testing. 
if ( args.use_thesaurus + and wxr.config.extract_thesaurus_pages and thesaurus_linkage_number(wxr.thesaurus_db_conn) == 0 ): extract_thesaurus_data(wxr) @@ -507,7 +508,8 @@ def main(): json.dump(tree, f, indent=2, sort_keys=True) wxr.wtp.close_db_conn() - close_thesaurus_db(wxr.thesaurus_db_path, wxr.thesaurus_db_conn) + if wxr.config.extract_thesaurus_pages: + close_thesaurus_db(wxr.thesaurus_db_path, wxr.thesaurus_db_conn) if args.profile: pr.disable() diff --git a/src/wiktextract/wxr_context.py b/src/wiktextract/wxr_context.py index a6b395d0..794fc777 100644 --- a/src/wiktextract/wxr_context.py +++ b/src/wiktextract/wxr_context.py @@ -28,15 +28,20 @@ def __init__(self, wtp: Wtp, config: WiktionaryConfig): self.thesaurus_db_path = wtp.db_path.with_stem( f"{wtp.db_path.stem}_thesaurus" ) - self.thesaurus_db_conn = init_thesaurus_db(self.thesaurus_db_path) + self.thesaurus_db_conn = ( + init_thesaurus_db(self.thesaurus_db_path) + if config.extract_thesaurus_pages + else None + ) def reconnect_databases(self, check_same_thread: bool = True) -> None: # `multiprocessing.pool.Pool.imap()` runs in another thread, if the db # connection is used to create iterable data for `imap`, # `check_same_thread` must be `False`. 
- self.thesaurus_db_conn = sqlite3.connect( - self.thesaurus_db_path, check_same_thread=check_same_thread - ) + if self.config.extract_thesaurus_pages: + self.thesaurus_db_conn = sqlite3.connect( + self.thesaurus_db_path, check_same_thread=check_same_thread + ) self.wtp.db_conn = sqlite3.connect( self.wtp.db_path, check_same_thread=check_same_thread ) @@ -44,7 +49,8 @@ def reconnect_databases(self, check_same_thread: bool = True) -> None: def remove_unpicklable_objects(self) -> None: # remove these variables before passing the `WiktextractContext` object # to worker processes - self.thesaurus_db_conn.close() + if self.config.extract_thesaurus_pages: + self.thesaurus_db_conn.close() self.thesaurus_db_conn = None self.wtp.db_conn.close() self.wtp.db_conn = None diff --git a/tests/test_fr_etymology.py b/tests/test_fr_etymology.py index 7275cd61..ee25cd3f 100644 --- a/tests/test_fr_etymology.py +++ b/tests/test_fr_etymology.py @@ -8,7 +8,6 @@ extract_etymology, insert_etymology_data, ) -from wiktextract.thesaurus import close_thesaurus_db from wiktextract.wxr_context import WiktextractContext @@ -20,9 +19,6 @@ def setUp(self) -> None: def tearDown(self) -> None: self.wxr.wtp.close_db_conn() - close_thesaurus_db( - self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn - ) def test_ebauche_etym(self): # https://fr.wiktionary.org/wiki/Hörsaal diff --git a/tests/test_fr_form_line.py b/tests/test_fr_form_line.py index 5c8ba1d6..f7bbcd72 100644 --- a/tests/test_fr_form_line.py +++ b/tests/test_fr_form_line.py @@ -9,7 +9,6 @@ extract_form_line, process_zh_mot_template, ) -from wiktextract.thesaurus import close_thesaurus_db from wiktextract.wxr_context import WiktextractContext @@ -21,9 +20,6 @@ def setUp(self) -> None: def tearDown(self) -> None: self.wxr.wtp.close_db_conn() - close_thesaurus_db( - self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn - ) @patch( "wiktextract.extractor.fr.pronunciation.clean_node", diff --git a/tests/test_fr_gloss.py 
b/tests/test_fr_gloss.py index 5f5d11d6..845bb34a 100644 --- a/tests/test_fr_gloss.py +++ b/tests/test_fr_gloss.py @@ -7,7 +7,6 @@ from wiktextract.config import WiktionaryConfig from wiktextract.extractor.fr.gloss import extract_gloss from wiktextract.extractor.fr.page import process_pos_block -from wiktextract.thesaurus import close_thesaurus_db from wiktextract.wxr_context import WiktextractContext @@ -19,9 +18,6 @@ def setUp(self) -> None: def tearDown(self) -> None: self.wxr.wtp.close_db_conn() - close_thesaurus_db( - self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn - ) @patch( "wikitextprocessor.Wtp.get_page", diff --git a/tests/test_fr_inflection.py b/tests/test_fr_inflection.py index abb1042f..f793aea7 100644 --- a/tests/test_fr_inflection.py +++ b/tests/test_fr_inflection.py @@ -7,7 +7,6 @@ from wiktextract.config import WiktionaryConfig from wiktextract.extractor.fr.inflection import extract_inflection -from wiktextract.thesaurus import close_thesaurus_db from wiktextract.wxr_context import WiktextractContext @@ -19,9 +18,6 @@ def setUp(self) -> None: def tearDown(self) -> None: self.wxr.wtp.close_db_conn() - close_thesaurus_db( - self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn - ) @patch( "wikitextprocessor.Wtp.node_to_wikitext", diff --git a/tests/test_fr_linkage.py b/tests/test_fr_linkage.py index 3b0fbb5f..6b8b2f70 100644 --- a/tests/test_fr_linkage.py +++ b/tests/test_fr_linkage.py @@ -5,7 +5,6 @@ from wiktextract.config import WiktionaryConfig from wiktextract.extractor.fr.linkage import extract_linkage -from wiktextract.thesaurus import close_thesaurus_db from wiktextract.wxr_context import WiktextractContext @@ -17,9 +16,6 @@ def setUp(self) -> None: def tearDown(self) -> None: self.wxr.wtp.close_db_conn() - close_thesaurus_db( - self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn - ) def test_tags(self): page_data = [defaultdict(list)] diff --git a/tests/test_fr_page.py b/tests/test_fr_page.py index c481027b..e372d5e4 100644 --- 
a/tests/test_fr_page.py +++ b/tests/test_fr_page.py @@ -8,7 +8,6 @@ from wiktextract.config import WiktionaryConfig from wiktextract.extractor.fr.page import parse_page -from wiktextract.thesaurus import close_thesaurus_db from wiktextract.wxr_context import WiktextractContext @@ -23,9 +22,6 @@ def setUp(self): def tearDown(self) -> None: self.wxr.wtp.close_db_conn() - close_thesaurus_db( - self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn - ) def test_fr_parse_page(self): self.wxr.wtp.add_page("Modèle:langue", 10, "Français") diff --git a/tests/test_fr_pronunciation.py b/tests/test_fr_pronunciation.py index f2b665ca..fc771cb9 100644 --- a/tests/test_fr_pronunciation.py +++ b/tests/test_fr_pronunciation.py @@ -6,7 +6,6 @@ from wiktextract.config import WiktionaryConfig from wiktextract.extractor.fr.pronunciation import extract_pronunciation -from wiktextract.thesaurus import close_thesaurus_db from wiktextract.wxr_context import WiktextractContext @@ -18,9 +17,6 @@ def setUp(self) -> None: def tearDown(self) -> None: self.wxr.wtp.close_db_conn() - close_thesaurus_db( - self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn - ) def test_pron_list(self): page_data = [ diff --git a/tests/test_fr_translation.py b/tests/test_fr_translation.py index b687018f..a161d7fa 100644 --- a/tests/test_fr_translation.py +++ b/tests/test_fr_translation.py @@ -5,7 +5,6 @@ from wiktextract.config import WiktionaryConfig from wiktextract.extractor.fr.translation import extract_translation -from wiktextract.thesaurus import close_thesaurus_db from wiktextract.wxr_context import WiktextractContext @@ -17,9 +16,6 @@ def setUp(self) -> None: def tearDown(self) -> None: self.wxr.wtp.close_db_conn() - close_thesaurus_db( - self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn - ) def test_italic_tag(self): self.wxr.wtp.start_page("")