Skip to content

Commit

Permalink
Don't extract thesaurus and analyze templates for French Wiktionary
Browse files Browse the repository at this point in the history
French Wiktionary's thesaurus pages contain too many words that are
not synonyms and don't use a common page layout. Word entry
pages already have a synonym section. Disabling this feature removes many
error messages from the default English thesaurus extractor.

Pre-expanding templates just makes extracting French Wiktionary harder;
I haven't seen any template that should be pre-expanded.
  • Loading branch information
xxyzz committed Oct 13, 2023
1 parent 2b4f32d commit 7e4451f
Show file tree
Hide file tree
Showing 16 changed files with 43 additions and 92 deletions.
3 changes: 2 additions & 1 deletion languages/get_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,8 @@ def get_lang_data(lang_code: str, dump_file: str, db_path: Path | None) -> None:
) as fout:
json.dump(data, fout, indent=2, ensure_ascii=False, sort_keys=True)
wxr.wtp.close_db_conn()
close_thesaurus_db(wxr.thesaurus_db_path, wxr.thesaurus_db_conn)
if wxr.config.extract_thesaurus_pages:
close_thesaurus_db(wxr.thesaurus_db_path, wxr.thesaurus_db_conn)


if __name__ == "__main__":
Expand Down
61 changes: 16 additions & 45 deletions src/wiktextract/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,28 +6,15 @@
import collections
import json
import sys
from typing import TYPE_CHECKING, Callable, Optional
from typing import Callable, Optional

from wikitextprocessor.core import CollatedErrorReturnData

if sys.version_info < (3, 10):
from importlib_resources import files
else:
from importlib.resources import files

if TYPE_CHECKING:
from wikitextprocessor.core import StatsData


def int_dict() -> collections.defaultdict:
    """Return a new ``defaultdict`` whose missing keys default to ``0``."""
    return collections.defaultdict(int)


def int_dict_dict() -> collections.defaultdict:
    """Return a new two-level ``defaultdict``.

    Missing outer keys are created by calling ``int_dict``, so each inner
    mapping in turn defaults its missing keys to ``0``.
    """
    return collections.defaultdict(int_dict)


def list_dict() -> collections.defaultdict:
    """Return a new ``defaultdict`` whose missing keys default to ``[]``."""
    return collections.defaultdict(list)


class WiktionaryConfig:
"""This class holds configuration data for Wiktionary parsing."""
Expand Down Expand Up @@ -66,6 +53,8 @@ class WiktionaryConfig:
"LANGUAGES_BY_NAME",
"LANGUAGES_BY_CODE",
"FORM_OF_TEMPLATES",
"analyze_templates",
"extract_thesaurus_pages",
)

def __init__(
Expand Down Expand Up @@ -130,38 +119,13 @@ def __init__(
self.set_attr_from_json(
"FORM_OF_TEMPLATES", "form_of_templates.json"
)
if dump_file_lang_code == "fr":
self.set_attr_from_json("FR_FORM_TABLES", "form_tables.json")
if dump_file_lang_code == "de":
self.set_attr_from_json("DE_FORM_TABLES", "form_templates.json")
self.analyze_templates = True # find templates that need pre-expand
self.extract_thesaurus_pages = True
self.load_edition_settings()

def to_kwargs(self) -> dict:
    """Return selected configuration options as a plain dict.

    The keys are the option names and the values are read from the
    identically named attributes of this instance.
    """
    return {
        "dump_file_lang_code": self.dump_file_lang_code,
        "capture_language_codes": self.capture_language_codes,
        "capture_translations": self.capture_translations,
        "capture_pronunciation": self.capture_pronunciation,
        "capture_linkages": self.capture_linkages,
        "capture_compounds": self.capture_compounds,
        "capture_redirects": self.capture_redirects,
        "capture_examples": self.capture_examples,
        "capture_etymologies": self.capture_etymologies,
        "capture_inflections": self.capture_inflections,
        "capture_descendants": self.capture_descendants,
        "verbose": self.verbose,
        "expand_tables": self.expand_tables,
    }

def to_return(self) -> "StatsData":
    """Return the statistics counters currently stored on this instance.

    The result holds ``num_pages``, ``language_counts``, ``pos_counts`` and
    ``section_counts``; the values are the instance's own objects, not
    copies.
    """
    return {
        "num_pages": self.num_pages,
        "language_counts": self.language_counts,
        "pos_counts": self.pos_counts,
        "section_counts": self.section_counts,
    }

def merge_return(self, ret):
assert isinstance(ret, dict)
def merge_return(self, ret: CollatedErrorReturnData):
if "num_pages" in ret:
self.num_pages += ret["num_pages"]
for k, v in ret["language_counts"].items():
Expand Down Expand Up @@ -271,3 +235,10 @@ def alias_info(name, new_code, kind, old_code, use_code, not_use_code):
)
else:
self.LANGUAGES_BY_NAME[lang_name] = lang_code

def load_edition_settings(self) -> None:
    """Apply overrides from the optional ``config.json`` in the data folder.

    Looks for ``config.json`` under ``self.data_folder``. If the file does
    not exist, nothing is changed. Otherwise the file is parsed as JSON and
    every top-level key/value pair is assigned onto this instance with
    ``setattr``, so the JSON document must be an object.
    """
    file_path = self.data_folder / "config.json"
    if file_path.exists():
        with file_path.open(encoding="utf-8") as f:
            for key, value in json.load(f).items():
                setattr(self, key, value)
4 changes: 4 additions & 0 deletions src/wiktextract/data/fr/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"analyze_templates": false,
"extract_thesaurus_pages": false
}
6 changes: 0 additions & 6 deletions src/wiktextract/extractor/fr/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,12 +152,6 @@ def parse_page(
page_text,
pre_expand=True,
additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
do_not_pre_expand={
"trad-début", # don't expand translation start/end templates
"trad-fin",
"(", # similar to "trad-début", pre-expand breaks node structure
")",
},
)

page_data = []
Expand Down
3 changes: 2 additions & 1 deletion src/wiktextract/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ def parse_page(
captured."""
page_extractor_mod = import_extractor_module(wxr.wtp.lang_code, "page")
page_data = page_extractor_mod.parse_page(wxr, page_title, page_text)
inject_linkages(wxr, page_data)
if wxr.config.extract_thesaurus_pages:
inject_linkages(wxr, page_data)
if wxr.config.dump_file_lang_code == "en":
process_categories(wxr, page_data)
remove_duplicate_data(page_data)
Expand Down
6 changes: 5 additions & 1 deletion src/wiktextract/wiktionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ def parse_wiktionary(
override_folders,
skip_extract_dump,
save_pages_path,
not wxr.config.analyze_templates,
)

if not phase1_only:
Expand Down Expand Up @@ -178,7 +179,10 @@ def reprocess_wiktionary(

# Extract thesaurus data. This iterates over thesaurus pages,
# but is very fast.
if thesaurus_linkage_number(wxr.thesaurus_db_conn) == 0:
if (
wxr.config.extract_thesaurus_pages
and thesaurus_linkage_number(wxr.thesaurus_db_conn) == 0
):
extract_thesaurus_data(wxr, num_processes)

emitted = set()
Expand Down
4 changes: 3 additions & 1 deletion src/wiktextract/wiktwords.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ def process_single_page(
# is disabled by default to speed up single page testing.
if (
args.use_thesaurus
and wxr.config.extract_thesaurus_pages
and thesaurus_linkage_number(wxr.thesaurus_db_conn) == 0
):
extract_thesaurus_data(wxr)
Expand Down Expand Up @@ -507,7 +508,8 @@ def main():
json.dump(tree, f, indent=2, sort_keys=True)

wxr.wtp.close_db_conn()
close_thesaurus_db(wxr.thesaurus_db_path, wxr.thesaurus_db_conn)
if wxr.config.extract_thesaurus_pages:
close_thesaurus_db(wxr.thesaurus_db_path, wxr.thesaurus_db_conn)

if args.profile:
pr.disable()
Expand Down
16 changes: 11 additions & 5 deletions src/wiktextract/wxr_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,23 +28,29 @@ def __init__(self, wtp: Wtp, config: WiktionaryConfig):
self.thesaurus_db_path = wtp.db_path.with_stem(
f"{wtp.db_path.stem}_thesaurus"
)
self.thesaurus_db_conn = init_thesaurus_db(self.thesaurus_db_path)
self.thesaurus_db_conn = (
init_thesaurus_db(self.thesaurus_db_path)
if config.extract_thesaurus_pages
else None
)

def reconnect_databases(self, check_same_thread: bool = True) -> None:
# `multiprocessing.pool.Pool.imap()` runs in another thread, if the db
# connection is used to create iterable data for `imap`,
# `check_same_thread` must be `False`.
self.thesaurus_db_conn = sqlite3.connect(
self.thesaurus_db_path, check_same_thread=check_same_thread
)
if self.config.extract_thesaurus_pages:
self.thesaurus_db_conn = sqlite3.connect(
self.thesaurus_db_path, check_same_thread=check_same_thread
)
self.wtp.db_conn = sqlite3.connect(
self.wtp.db_path, check_same_thread=check_same_thread
)

def remove_unpicklable_objects(self) -> None:
# remove these variables before passing the `WiktextractContext` object
# to worker processes
self.thesaurus_db_conn.close()
if self.config.extract_thesaurus_pages:
self.thesaurus_db_conn.close()
self.thesaurus_db_conn = None
self.wtp.db_conn.close()
self.wtp.db_conn = None
Expand Down
4 changes: 0 additions & 4 deletions tests/test_fr_etymology.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
extract_etymology,
insert_etymology_data,
)
from wiktextract.thesaurus import close_thesaurus_db
from wiktextract.wxr_context import WiktextractContext


Expand All @@ -20,9 +19,6 @@ def setUp(self) -> None:

def tearDown(self) -> None:
self.wxr.wtp.close_db_conn()
close_thesaurus_db(
self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
)

def test_ebauche_etym(self):
# https://fr.wiktionary.org/wiki/Hörsaal
Expand Down
4 changes: 0 additions & 4 deletions tests/test_fr_form_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
extract_form_line,
process_zh_mot_template,
)
from wiktextract.thesaurus import close_thesaurus_db
from wiktextract.wxr_context import WiktextractContext


Expand All @@ -21,9 +20,6 @@ def setUp(self) -> None:

def tearDown(self) -> None:
self.wxr.wtp.close_db_conn()
close_thesaurus_db(
self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
)

@patch(
"wiktextract.extractor.fr.pronunciation.clean_node",
Expand Down
4 changes: 0 additions & 4 deletions tests/test_fr_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.fr.gloss import extract_gloss
from wiktextract.extractor.fr.page import process_pos_block
from wiktextract.thesaurus import close_thesaurus_db
from wiktextract.wxr_context import WiktextractContext


Expand All @@ -19,9 +18,6 @@ def setUp(self) -> None:

def tearDown(self) -> None:
self.wxr.wtp.close_db_conn()
close_thesaurus_db(
self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
)

@patch(
"wikitextprocessor.Wtp.get_page",
Expand Down
4 changes: 0 additions & 4 deletions tests/test_fr_inflection.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.fr.inflection import extract_inflection
from wiktextract.thesaurus import close_thesaurus_db
from wiktextract.wxr_context import WiktextractContext


Expand All @@ -19,9 +18,6 @@ def setUp(self) -> None:

def tearDown(self) -> None:
self.wxr.wtp.close_db_conn()
close_thesaurus_db(
self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
)

@patch(
"wikitextprocessor.Wtp.node_to_wikitext",
Expand Down
4 changes: 0 additions & 4 deletions tests/test_fr_linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.fr.linkage import extract_linkage
from wiktextract.thesaurus import close_thesaurus_db
from wiktextract.wxr_context import WiktextractContext


Expand All @@ -17,9 +16,6 @@ def setUp(self) -> None:

def tearDown(self) -> None:
self.wxr.wtp.close_db_conn()
close_thesaurus_db(
self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
)

def test_tags(self):
page_data = [defaultdict(list)]
Expand Down
4 changes: 0 additions & 4 deletions tests/test_fr_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.fr.page import parse_page
from wiktextract.thesaurus import close_thesaurus_db
from wiktextract.wxr_context import WiktextractContext


Expand All @@ -23,9 +22,6 @@ def setUp(self):

def tearDown(self) -> None:
self.wxr.wtp.close_db_conn()
close_thesaurus_db(
self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
)

def test_fr_parse_page(self):
self.wxr.wtp.add_page("Modèle:langue", 10, "Français")
Expand Down
4 changes: 0 additions & 4 deletions tests/test_fr_pronunciation.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.fr.pronunciation import extract_pronunciation
from wiktextract.thesaurus import close_thesaurus_db
from wiktextract.wxr_context import WiktextractContext


Expand All @@ -18,9 +17,6 @@ def setUp(self) -> None:

def tearDown(self) -> None:
self.wxr.wtp.close_db_conn()
close_thesaurus_db(
self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
)

def test_pron_list(self):
page_data = [
Expand Down
4 changes: 0 additions & 4 deletions tests/test_fr_translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.fr.translation import extract_translation
from wiktextract.thesaurus import close_thesaurus_db
from wiktextract.wxr_context import WiktextractContext


Expand All @@ -17,9 +16,6 @@ def setUp(self) -> None:

def tearDown(self) -> None:
self.wxr.wtp.close_db_conn()
close_thesaurus_db(
self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
)

def test_italic_tag(self):
self.wxr.wtp.start_page("")
Expand Down

0 comments on commit 7e4451f

Please sign in to comment.