Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Disable analyze templates and extract thesaurus pages features for French Wiktionary #362

Merged
merged 4 commits into from
Oct 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion languages/get_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,8 @@ def get_lang_data(lang_code: str, dump_file: str, db_path: Path | None) -> None:
) as fout:
json.dump(data, fout, indent=2, ensure_ascii=False, sort_keys=True)
wxr.wtp.close_db_conn()
close_thesaurus_db(wxr.thesaurus_db_path, wxr.thesaurus_db_conn)
if wxr.config.extract_thesaurus_pages:
close_thesaurus_db(wxr.thesaurus_db_path, wxr.thesaurus_db_conn)


if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion src/wiktextract/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -1131,7 +1131,7 @@ def expand_group(v):
if a == "2":
v = "√"
elif a == "3":
v = "∛",
v = "∛"
elif a == "4":
v = "∜"
else:
Expand Down
61 changes: 16 additions & 45 deletions src/wiktextract/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,28 +6,15 @@
import collections
import json
import sys
from typing import TYPE_CHECKING, Callable, Optional
from typing import Callable, Optional

from wikitextprocessor.core import CollatedErrorReturnData

if sys.version_info < (3, 10):
from importlib_resources import files
else:
from importlib.resources import files

if TYPE_CHECKING:
from wikitextprocessor.core import StatsData


def int_dict():
return collections.defaultdict(int)


def int_dict_dict():
return collections.defaultdict(int_dict)


def list_dict():
return collections.defaultdict(list)


class WiktionaryConfig:
"""This class holds configuration data for Wiktionary parsing."""
Expand Down Expand Up @@ -66,6 +53,8 @@ class WiktionaryConfig:
"LANGUAGES_BY_NAME",
"LANGUAGES_BY_CODE",
"FORM_OF_TEMPLATES",
"analyze_templates",
"extract_thesaurus_pages",
)

def __init__(
Expand Down Expand Up @@ -130,38 +119,13 @@ def __init__(
self.set_attr_from_json(
"FORM_OF_TEMPLATES", "form_of_templates.json"
)
if dump_file_lang_code == "fr":
self.set_attr_from_json("FR_FORM_TABLES", "form_tables.json")
if dump_file_lang_code == "de":
self.set_attr_from_json("DE_FORM_TABLES", "form_templates.json")
self.analyze_templates = True # find templates that need pre-expand
self.extract_thesaurus_pages = True
self.load_edition_settings()

def to_kwargs(self):
return {
"dump_file_lang_code": self.dump_file_lang_code,
"capture_language_codes": self.capture_language_codes,
"capture_translations": self.capture_translations,
"capture_pronunciation": self.capture_pronunciation,
"capture_linkages": self.capture_linkages,
"capture_compounds": self.capture_compounds,
"capture_redirects": self.capture_redirects,
"capture_examples": self.capture_examples,
"capture_etymologies": self.capture_etymologies,
"capture_inflections": self.capture_inflections,
"capture_descendants": self.capture_descendants,
"verbose": self.verbose,
"expand_tables": self.expand_tables,
}

def to_return(self) -> "StatsData":
return {
"num_pages": self.num_pages,
"language_counts": self.language_counts,
"pos_counts": self.pos_counts,
"section_counts": self.section_counts,
}

def merge_return(self, ret):
assert isinstance(ret, dict)
def merge_return(self, ret: CollatedErrorReturnData):
if "num_pages" in ret:
self.num_pages += ret["num_pages"]
for k, v in ret["language_counts"].items():
Expand Down Expand Up @@ -271,3 +235,10 @@ def alias_info(name, new_code, kind, old_code, use_code, not_use_code):
)
else:
self.LANGUAGES_BY_NAME[lang_name] = lang_code

def load_edition_settings(self) -> None:
    """Apply per-edition overrides to this configuration.

    Looks for a ``config.json`` file inside the edition's data folder
    (``self.data_folder``).  When the file exists, every top-level
    key/value pair in it is set as an attribute on this config object,
    overriding the defaults; when it does not exist, nothing changes.
    """
    settings_file = self.data_folder / "config.json"
    if not settings_file.exists():
        return
    with settings_file.open(encoding="utf-8") as settings:
        overrides = json.load(settings)
    for option, value in overrides.items():
        setattr(self, option, value)
4 changes: 4 additions & 0 deletions src/wiktextract/data/fr/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"analyze_templates": false,
"extract_thesaurus_pages": false
}
59 changes: 42 additions & 17 deletions src/wiktextract/extractor/fr/gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,30 @@ def extract_gloss(
list_node: WikiNode,
) -> None:
for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
gloss_nodes = list(list_item_node.invert_find_child(NodeKind.LIST))
gloss_nodes = list(
list_item_node.invert_find_child(
NodeKind.LIST, include_empty_str=True
)
)
# remove the first empty space in list item nodes
if (
len(gloss_nodes) > 0
and isinstance(gloss_nodes[0], str)
and len(gloss_nodes[0].strip()) == 0
):
gloss_nodes = gloss_nodes[1:]

gloss_data = defaultdict(list)
gloss_start = 0
# process modifier and theme templates before the gloss text
# https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Précisions_de_sens
if (
len(gloss_nodes) > 0
and isinstance(gloss_nodes[0], WikiNode)
and gloss_nodes[0].kind == NodeKind.TEMPLATE
):
if len(gloss_nodes) > 0 and isinstance(gloss_nodes[0], TemplateNode):
gloss_start = 1
for index, gloss_node in enumerate(gloss_nodes[1:], 1):
if (
if isinstance(gloss_node, str) and len(gloss_node.strip()) == 0:
# ignore empty string
gloss_start = index + 1
elif (
not isinstance(gloss_node, WikiNode)
or gloss_node.kind != NodeKind.TEMPLATE
# template "variante de" is not a modifier
Expand All @@ -38,20 +49,34 @@ def extract_gloss(
else:
gloss_start = index + 1
for tag_node in gloss_nodes[:gloss_start]:
gloss_data["tags"].append(
clean_node(wxr, gloss_data, tag_node).strip("()")
)
tag = clean_node(wxr, gloss_data, tag_node).strip("() ")
if len(tag) > 0:
gloss_data["tags"].append(tag)

gloss_only_nodes = []
# extract italic tags
for node in gloss_nodes[gloss_start:]:
if isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
tag_indexes = set()
for index, node in enumerate(gloss_nodes[gloss_start:], gloss_start):
# if an italic node is between parentheses then it's a tag; also
# don't add the parenthesis strings to `gloss_only_nodes`
if (
isinstance(node, WikiNode)
and node.kind == NodeKind.ITALIC
and index > gloss_start
and isinstance(gloss_nodes[index - 1], str)
and gloss_nodes[index - 1].strip() == "("
and index + 1 < len(gloss_nodes)
and isinstance(gloss_nodes[index + 1], str)
and gloss_nodes[index + 1].strip() == ")"
):
gloss_data["tags"].append(clean_node(wxr, None, node))
tag_indexes |= {index - 1, index, index + 1}
continue
elif isinstance(node, str) and node.strip() in ["(", ")"]:
# remove parentheses around italic node
continue
gloss_only_nodes.append(node)

gloss_only_nodes = [
node
for index, node in enumerate(gloss_nodes[gloss_start:], gloss_start)
if index not in tag_indexes
]
gloss_text = clean_node(wxr, gloss_data, gloss_only_nodes)
gloss_data["glosses"] = [gloss_text]
extract_examples(wxr, gloss_data, list_item_node)
Expand Down
5 changes: 4 additions & 1 deletion src/wiktextract/extractor/fr/inflection.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def extract_inflection(
}
)


@dataclass
class ColspanHeader:
text: str
Expand Down Expand Up @@ -123,7 +124,9 @@ def process_inflection_table(
)
else:
column_headers.append(table_header_text)
column_cell_index += int(table_cell.attrs.get("colspan", 1))
column_cell_index += int(
table_cell.attrs.get("colspan", 1)
)
elif row_num > 0:
row_headers.append(table_header_text)
if "rowspan" in table_cell.attrs:
Expand Down
6 changes: 0 additions & 6 deletions src/wiktextract/extractor/fr/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,12 +152,6 @@ def parse_page(
page_text,
pre_expand=True,
additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
do_not_pre_expand={
"trad-début", # don't expand translation start/end templates
"trad-fin",
"(", # similar to "trad-début", pre-expand breaks node structure
")",
},
)

page_data = []
Expand Down
6 changes: 3 additions & 3 deletions src/wiktextract/extractor/fr/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,9 @@ def process_translation_templates(
# translation box start: https://fr.wiktionary.org/wiki/Modèle:trad-début
sense_parameter = template_node.template_parameters.get(1)
if sense_parameter is not None:
base_translation_data["sense"] = clean_node(
wxr, None, sense_parameter
)
sense_text = clean_node(wxr, None, sense_parameter)
if len(sense_text) > 0:
base_translation_data["sense"] = sense_text
elif template_node.template_name == "T":
# Translation language: https://fr.wiktionary.org/wiki/Modèle:T
base_translation_data["code"] = template_node.template_parameters.get(1)
Expand Down
4 changes: 2 additions & 2 deletions src/wiktextract/form_descriptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@
r"\s+\((({}): ([^()]|\([^()]+\))+)\)"
.format("|".join(re.escape(x.removeprefix("?"))
for x in sorted(xlat_head_map.values(),
key=lambda x: len(x),
key=len,
reverse=True)
if x and not x.startswith("class-"))))

Expand All @@ -179,7 +179,7 @@
"|".join(re.escape(x) for x in
# The sort is to put longer ones first, preferring them in
# the regexp match
sorted(xlat_head_map.keys(), key=lambda x: len(x),
sorted(xlat_head_map.keys(), key=len,
reverse=True)))
head_final_re = re.compile(head_final_re_text + "$")

Expand Down
Loading