From 2b4f32d13ce61d4021496ca561c3bb7074923177 Mon Sep 17 00:00:00 2001
From: xxyzz
Date: Fri, 13 Oct 2023 15:06:46 +0800
Subject: [PATCH 1/4] =?UTF-8?q?Ignore=20empty=20string=20template=20parame?=
 =?UTF-8?q?ter=20of=20the=20"trad-d=C3=A9but"=20template?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/wiktextract/clean.py                    |   2 +-
 src/wiktextract/extractor/fr/translation.py |   6 +-
 src/wiktextract/form_descriptions.py        |   4 +-
 src/wiktextract/inflection.py               | 129 ++++++++++----------
 src/wiktextract/pronunciations.py           |   2 +-
 tests/test_clean.py                         |   2 +-
 6 files changed, 72 insertions(+), 73 deletions(-)

diff --git a/src/wiktextract/clean.py b/src/wiktextract/clean.py
index 8b1e9fec..b5c37ff6 100644
--- a/src/wiktextract/clean.py
+++ b/src/wiktextract/clean.py
@@ -1131,7 +1131,7 @@ def expand_group(v):
         if a == "2":
             v = "√"
         elif a == "3":
-            v = "∛",
+            v = "∛"
         elif a == "4":
             v = "∜"
         else:
diff --git a/src/wiktextract/extractor/fr/translation.py b/src/wiktextract/extractor/fr/translation.py
index 618bb6e3..7d98528c 100644
--- a/src/wiktextract/extractor/fr/translation.py
+++ b/src/wiktextract/extractor/fr/translation.py
@@ -72,9 +72,9 @@ def process_translation_templates(
         # translation box start: https://fr.wiktionary.org/wiki/Modèle:trad-début
         sense_parameter = template_node.template_parameters.get(1)
         if sense_parameter is not None:
-            base_translation_data["sense"] = clean_node(
-                wxr, None, sense_parameter
-            )
+            sense_text = clean_node(wxr, None, sense_parameter)
+            if len(sense_text) > 0:
+                base_translation_data["sense"] = sense_text
     elif template_node.template_name == "T":
         # Translation language: https://fr.wiktionary.org/wiki/Modèle:T
         base_translation_data["code"] = template_node.template_parameters.get(1)
diff --git a/src/wiktextract/form_descriptions.py b/src/wiktextract/form_descriptions.py
index c2938cc6..6ef230a3 100644
--- a/src/wiktextract/form_descriptions.py
+++ b/src/wiktextract/form_descriptions.py
@@ -169,7 +169,7 @@
     r"\s+\((({}): ([^()]|\([^()]+\))+)\)"
     .format("|".join(re.escape(x.removeprefix("?"))
                      for x in sorted(xlat_head_map.values(),
-                                     key=lambda x: len(x),
+                                     key=len,
                                      reverse=True)
                      if x and not x.startswith("class-"))))

@@ -179,7 +179,7 @@
     "|".join(re.escape(x) for x in
              # The sort is to put longer ones first, preferring them in
              # the regexp match
-             sorted(xlat_head_map.keys(), key=lambda x: len(x),
+             sorted(xlat_head_map.keys(), key=len,
                     reverse=True)))

 head_final_re = re.compile(head_final_re_text + "$")
diff --git a/src/wiktextract/inflection.py b/src/wiktextract/inflection.py
index ec8a0037..2ef3ea51 100644
--- a/src/wiktextract/inflection.py
+++ b/src/wiktextract/inflection.py
@@ -262,7 +262,7 @@ def set_debug_cell_text(text):
     # Arabic, but that is being handled elsewhere now.
 ])

-class InflCell(object):
+class InflCell:
     """Cell in an inflection table."""
     __slots__ = (
         "text",
@@ -292,7 +292,7 @@ def __repr__(self):
         return str(self)


-class HdrSpan(object):
+class HdrSpan:
     """Saved information about a header cell/span during the parsing of a
     table."""
     __slots__ = (
@@ -569,7 +569,7 @@ def extract_cell_content(lang, word, col):
             col = col[:-1]
         else:
             break
-    
+
     # Check for another form of note definition
     if (len(col) > 2 and col[1] in (")", " ", ":") and
         col[0].isdigit() and
@@ -1200,7 +1200,7 @@ def parse_simple_table(wxr, tablecontext, word, lang, pos,
     for x in titles:
         assert isinstance(x, str)
-    
+
     # print("PARSE_SIMPLE_TABLE: TITLES:", titles)
     if debug_cell_text:
         print("ROWS:")
@@ -1249,7 +1249,7 @@ def parse_simple_table(wxr, tablecontext, word, lang, pos,
     # for row in rows:
     #     print("  ", row)
-    
+
    # Parse definitions for references (from table itself and from text
    # after it)
    def_ht = {}
@@ -1333,7 +1333,7 @@ def add_new_hdrspan(col, hdrspans, store_new_hdrspan,
            # later with "dummy-load-stored-hdrspans".
            if store_new_hdrspan:
                tablecontext.stored_hdrspans.append(hdrspan)
-    
+
            # Handle headers that are above left-side header
            # columns and are followed by personal pronouns in
            # remaining columns (basically headers that
@@ -1354,7 +1354,7 @@ def add_new_hdrspan(col, hdrspans, store_new_hdrspan,
                later_allowed = later_allowed | set(["dummy"])
                # dummy2 has different behavior than plain dummy
                # and does not belong here.
-    
+
                # print("col0_cats={} later_cats={} "
                #       "fol_by_nonempty={} col_idx={} end={} "
                #       "tagsets={}"
@@ -1524,7 +1524,7 @@ def handle_mixed_lines(alts):
            # First is base and the rest is IPA alternatives
            alts = list((alts[0], "", alts[i])
                        for i in range(1, len(alts)))
-    
+
        # Check for romanizations, forms first, romanizations under
        elif (len(alts) % 2 == 0 and
              not any("(" in x for x in alts) and
@@ -1668,12 +1668,12 @@ def handle_parens(form, roman, clitic, extra_tags):
            form = (form[:m.start()] + subst + form[m.end():]).strip()
        return form, roman, clitic
-    
+
    def merge_row_and_column_tags(form, some_has_covered_text):
        # Merge column tags and row tags. We give preference
        # to moods etc coming from rowtags (cf. austteigen/German/Verb
        # imperative forms).
-    
+
        # In certain cases, what a tag means depends on whether
        # it is a row or column header. Depending on the language,
        # we replace certain tags with others if they're in
@@ -1710,19 +1710,19 @@ def merge_row_and_column_tags(form, some_has_covered_text):
                            for tt in old_tags)):
                        continue
                    tags.add(t)
-    
+
                # Extract language-specific tags from the
                # form. This may also adjust the form.
                form, lang_tags = lang_specific_tags(lang, pos, form)
                tags.update(lang_tags)
-    
+
                # For non-finite verb forms, see if they have
                # a gender/class suffix
                if pos == "verb" and any(valid_tags[t] == "non-finite"
                                         for t in tags):
                    form, tt = parse_head_final_tags(wxr, lang, form)
                    tags.update(tt)
-    
+
                # Remove "personal" tag if have nth person; these
                # come up with e.g. reconhecer/Portuguese/Verb. But
                # not if we also have "pronoun"
@@ -1732,20 +1732,20 @@ def merge_row_and_column_tags(form, some_has_covered_text):
                         ["first-person", "second-person",
                          "third-person"])):
                    tags.remove("personal")
-    
+
                # If we have impersonal, remove person and number.
                # This happens with e.g. viajar/Portuguese/Verb
                if "impersonal" in tags:
                    tags = tags - set(["first-person", "second-person",
                                       "third-person",
                                       "singular", "plural"])
-    
+
                # Remove unnecessary "positive" tag from verb forms
                if pos == "verb" and "positive" in tags:
                    if "negative" in tags:
                        tags.remove("negative")
                    tags.remove("positive")
-    
+
                # Many Russian (and other Slavic) inflection tables
                # have animate/inanimate distinction that generates
                # separate entries for neuter/feminine, but the
@@ -1758,14 +1758,14 @@ def merge_row_and_column_tags(form, some_has_covered_text):
                        "masculine" not in tags and
                        "plural" not in tags):
                        tags.remove(t1)
-    
+
                # German adjective tables contain "(keiner)" etc
                # for mixed declension plural. When the adjective
                # disappears and it becomes just one word, remove
                # the "includes-article" tag. e.g. eiskalt/German
                if "includes-article" in tags and " " not in form:
                    tags.remove("includes-article")
-    
+
                # Handle ignored forms. We mark that the form was
                # provided. This is important information; some words
                # just do not have a certain form. However, there also
@@ -1784,7 +1784,7 @@ def merge_row_and_column_tags(form, some_has_covered_text):
                    form = "-"
                elif col_idx in has_covering_hdr:
                    some_has_covered_text = True
-    
+
                # Handle ambiguous object concord. If a header
                # gives the "dummy-object-concord"-tag to a word,
                # replace person, number and gender tags with
@@ -1798,7 +1798,7 @@ def merge_row_and_column_tags(form, some_has_covered_text):
                        if subtag in tags:
                            tags.remove(subtag)
                        tags.add(objtag)
-    
+
                # Remove the dummy mood tag that we sometimes
                # use to block adding other mood and related
                # tags
@@ -1813,7 +1813,7 @@ def merge_row_and_column_tags(form, some_has_covered_text):
                    "dummy-reset-stored-hdrspans",
                    "dummy-section-header",
                ])
-    
+
                # Perform language-specific tag replacements according
                # to rules in a table.
                lang_tag_mappings = get_lang_conf(lang,
@@ -1822,13 +1822,13 @@ def merge_row_and_column_tags(form, some_has_covered_text):
                    for pre, post in lang_tag_mappings.items():
                        if all(t in tags for t in pre):
                            tags = (tags - set(pre)) | set(post)
-    
+
                # Warn if there are entries with empty tags
                if not tags:
                    wxr.wtp.debug("inflection table: empty tags for {}"
                                  .format(form),
                                  sortid="inflection/1826")
-    
+
                # Warn if form looks like IPA
                ########## XXX ########
                # Because IPA is its own unicode block, we could also
@@ -1844,14 +1844,14 @@ def merge_row_and_column_tags(form, some_has_covered_text):
                              "form={} tags={}"
                              .format(form, tags),
                              sortid="inflection/1840")
-    
+
                # Note that this checks `form`, not `in tags`
                if form == "dummy-ignored-text-cell":
                    continue
-    
+
                if "dummy-remove-this-cell" in tags:
                    continue
-    
+
                # Add the form
                tags = list(sorted(tags))
                dt = {"form": form, "tags": tags, "source": source}
@@ -1866,7 +1866,7 @@ def merge_row_and_column_tags(form, some_has_covered_text):
                          "source": source}
                    ret.append(dt)
        return ret, form, some_has_covered_text
-    
+
    # First extract definitions from cells
    # See defs_ht for footnote defs stuff
    for row in rows:
@@ -2072,7 +2072,7 @@ def merge_row_and_column_tags(form, some_has_covered_text):
            if any("dummy-load-stored-hdrspans" in ts for ts in v):
                hdrspans.extend(tablecontext.stored_hdrspans)
-    
+
            if any("dummy-reset-stored-hdrspans" in ts for ts in v):
                tablecontext.stored_hdrspans = []
@@ -2081,7 +2081,7 @@ def merge_row_and_column_tags(form, some_has_covered_text):
                store_new_hdrspan = True
            else:
                store_new_hdrspan = False
-    
+
            new_coltags = list(x for x in new_coltags
                               if not any(t in noinherit_tags for t in x))
            # print("new_coltags={} previously_seen={} all_hdr_tags={}"
@@ -2091,13 +2091,13 @@ def merge_row_and_column_tags(form, some_has_covered_text):
                = add_new_hdrspan(col, hdrspans, store_new_hdrspan,
                                  col0_followed_by_nonempty, col0_hdrspan)
-    
+
            continue

        # These values are ignored, at least for now
        if re.match(r"^(# |\(see )", col):
            continue
-    
+
        if any("dummy-skip-this" in ts for ts in rowtags):
            continue  # Skip this cell
@@ -2117,7 +2117,7 @@ def merge_row_and_column_tags(form, some_has_covered_text):
            get_lang_conf(lang, "ignore_top_left_text_cell") == True
        ):
            continue  # Skip text at top left, as in Icelandic, Faroese
-    
+
        # if col0_hdrspan is not None:
        #     print("COL0 FOLLOWED NONHDR: {!r} by {!r}"
        #           .format(col0_hdrspan.text, col))
@@ -2135,16 +2135,16 @@ def merge_row_and_column_tags(form, some_has_covered_text):
        # newline.
        col = re.sub(r"[ \t\r]+", " ", col)
        # Split the cell text into alternatives
-    
+
        col, alts, split_extra_tags = \
            split_text_into_alts(col)
        # Some cells have mixed form content, like text and romanization,
        # or text and IPA. Handle these.
        alts = handle_mixed_lines(alts)
-    
+
        alts = list((x, combined_coltags) for x in alts)
-    
+
        # Generate forms from the alternatives
        # alts is a list of (tuple of forms, tuple of tags)
        for (form, base_roman, ipa), coltags in alts:
@@ -2180,7 +2180,7 @@ def merge_row_and_column_tags(form, some_has_covered_text):
                                                   word, base_roman)
                extra_tags.extend(hdr_tags)
-    
+
            # Do some additional cleanup on the cell.
            form = re.sub(r"^\s*,\s*", "", form)
            form = re.sub(r"\s*,\s*$", "", form)
@@ -2192,7 +2192,7 @@ def merge_row_and_column_tags(form, some_has_covered_text):
            # Look for parentheses that have semantic meaning
            form, et = find_semantic_parens(form)
            extra_tags.extend(et)
-    
+
            # Handle parentheses in the table element. We parse
            # tags anywhere and romanizations anywhere but beginning.
            roman = base_roman
@@ -2212,7 +2212,7 @@ def merge_row_and_column_tags(form, some_has_covered_text):
            if paren is not None:
                form, roman, clitic = handle_parens(form, roman,
                                                    clitic, extra_tags)
-    
+
            # Ignore certain forms that are not really forms,
            # unless they're really, really close to the article title
            if form in ("", "unchanged",
@@ -2243,7 +2243,7 @@ def merge_row_and_column_tags(form, some_has_covered_text):
                merge_row_and_column_tags(form, some_has_covered_text)
            ret.extend(merge_ret)
-    
+
        # End of row.
        rownum += 1
    # For certain languages, if the row was empty, reset
@@ -2307,7 +2307,7 @@ def merge_row_and_column_tags(form, some_has_covered_text):
                had_noun = False
                continue  # Skip the articles
-    
+
            dt = dt.copy()
            dt["tags"] = tags
            new_ret.append(dt)
@@ -2337,13 +2337,13 @@ def merge_row_and_column_tags(form, some_has_covered_text):
                    continue
                if key_tag not in tags:
                    skip_this = True
-    
+
            if skip_this:
                continue
            new_ret.append(cell_data)
-    
+
        ret = new_ret
-    
+
    # Post-process English inflection tables, addding "multiword-construction"
    # when the number of words has increased.
    if lang == "English" and pos == "verb":
@@ -2375,7 +2375,7 @@ def merge_row_and_column_tags(form, some_has_covered_text):
                ret = [dt] + [tn] + ret
            else:
                ret = [dt] + ret
-    
+
    return ret

def handle_generic_table(wxr, tablecontext, data,
@@ -2584,7 +2584,7 @@ def determine_header(wxr, tablecontext, lang, word, pos,
            is_title = True
    return is_title, hdr_expansion, target, celltext

-class TableContext(object):
+class TableContext:
    """Saved context used when parsing a table and its subtables."""
    __slot__ = (
        "stored_hdrspans",
@@ -2623,7 +2623,7 @@ def handle_wikitext_or_html_table(wxr, word, lang, pos,
    if not tablecontext:
        tablecontext = TableContext()
-    
+
    def handle_table1(wxr, tablecontext, word, lang, pos,
                      data, tree, titles, source, after, depth):
        """Helper function allowing the 'flattening' out of the table
@@ -2640,7 +2640,7 @@ def handle_table1(wxr, tablecontext, word, lang, pos,
        assert isinstance(after, str)
        assert isinstance(depth, int)
        # print("HANDLE_WIKITEXT_TABLE", titles)
-    
+
        col_gap_data = []  # Filling for columns with rowspan > 1
                           # col_gap_data contains None or InflCell
        vertical_still_left = []  # Number of remaining rows for which to fill
@@ -2652,7 +2652,7 @@ def handle_table1(wxr, tablecontext, word, lang, pos,
        rows = []
        sub_ret = []
-    
+
        for node in tree.children:
            if not isinstance(node, WikiNode):
                continue
@@ -2660,7 +2660,7 @@ def handle_table1(wxr, tablecontext, word, lang, pos,
                kind = node.sarg
            else:
                kind = node.kind
-    
+
            # print("  {}".format(node))
            if kind in (NodeKind.TABLE_CAPTION, "caption"):
                # print("  CAPTION:", node)
@@ -2671,7 +2671,7 @@ def handle_table1(wxr, tablecontext, word, lang, pos,
                # have more data. The hidden data duplicates these rows, so
                # we skip it and just process the hidden data.
                continue
-    
+
            # Parse a table row.
            row = []
            style = None
@@ -2693,7 +2693,7 @@ def handle_table1(wxr, tablecontext, word, lang, pos,
                                    "th", "td"):
                    print("  UNEXPECTED ROW CONTENT: {}".format(col))
                    continue
-    
+
                while (len(row) < len(vertical_still_left) and
                       vertical_still_left[len(row)] > 0):
                    # vertical_still_left is [...0, 0, 2...] for each column.
@@ -2705,7 +2705,7 @@ def handle_table1(wxr, tablecontext, word, lang, pos,
                    # and rowspan and colspan are just to generate the "fill-
                    vertical_still_left[len(row)] -= 1
                    row.append(col_gap_data[len(row)])
-    
+
                    # appending row is how "indexing" is
                    # done here; something is appended,
                    # like a filler-cell here or a "start"
@@ -2719,7 +2719,7 @@ def handle_table1(wxr, tablecontext, word, lang, pos,
                    # except when a new rowspan is needed,
                    # at the same time that
                    # vertical_still_left gets reassigned.
-    
+
                try:
                    rowspan = int(col.attrs.get("rowspan", "1"))  # 🡙
                    colspan = int(col.attrs.get("colspan", "1"))  # 🡘
@@ -2727,7 +2727,7 @@ def handle_table1(wxr, tablecontext, word, lang, pos,
                    rowspan = 1
                    colspan = 1
                # print("COL:", col)
-    
+
                # Process any nested tables recursively.
                tables, rest = recursively_extract(col,
                                                   lambda x: isinstance(x, WikiNode)
@@ -2735,11 +2735,11 @@ def handle_table1(wxr, tablecontext, word, lang, pos,
                                                   (x.kind == NodeKind.TABLE or
                                                    x.sarg == "table"))
-    
+
                # Clean the rest of the cell.
                celltext = clean_node(wxr, None, rest)
                # print("CLEANED:", celltext)
-    
+
                # Handle nested tables.
                for tbl in tables:
                    # Some nested tables (e.g., croí/Irish) have subtitles
@@ -2761,12 +2761,12 @@ def handle_table1(wxr, tablecontext, word, lang, pos,
                        titles = []
                        after = ""
                    sub_ret.extend(subtbl)
-    
+
                # This magic value is used as part of header detection
                cellstyle = (col.attrs.get("style", "") + "//" +
                             col.attrs.get("class", "") + "//" +
                             str(kind))
-    
+
                if not row:  # if first column in row
                    style = cellstyle
                target = None
@@ -2781,7 +2781,7 @@ def handle_table1(wxr, tablecontext, word, lang, pos,
                                     row, col, celltext, titletext,
                                     cols_headered,
                                     None, cellstyle)
-    
+
                if is_title:
                    # If this cell gets a "*" tag, make the whole column
                    # below it (toggling it in cols_headered = [F, F, T...])
@@ -2825,7 +2825,7 @@ def handle_table1(wxr, tablecontext, word, lang, pos,
                    # future, or None
                    vertical_still_left[len(row)] = rowspan - 1
                    # A counter for how many gaps🡙 are still left to be
-                    # filled (row.append or 
+                    # filled (row.append or
                    # row[col_gap_data[len(row)] =>
                    # rows), it is not reset to [], but decremented to 0
                    # each time a row gets something from col_gap_data.
@@ -2857,11 +2857,11 @@ def handle_table1(wxr, tablecontext, word, lang, pos,
        else:
            main_ret = [(rows, titles, after, depth)]
        return main_ret
-    
+
    new_rows = handle_table1(wxr, tablecontext, word, lang, pos,
                             data, tree, titles, source, after, 0)
-    
+
    # Now we have a table that has been parsed into rows and columns of
    # InflCell objects. Parse the inflection table from that format.
    if new_rows:
@@ -2878,13 +2878,13 @@ def handle_html_table(wxr, word, lang, pos, data, tree, titles, source,
    """A passer-on function for html-tables, XXX, remove these?"""
    handle_wikitext_or_html_table(wxr, word, lang, pos, data, tree,
                                  titles, source, after, tablecontext)
-    
+

def handle_wikitext_table(wxr, word, lang, pos,
                          data, tree, titles, source, after,
                          tablecontext=None):
    """A passer-on function for html-tables, XXX, remove these?"""
    handle_wikitext_or_html_table(wxr, word, lang, pos, data,
                                  tree, titles, source, after, tablecontext)
-    
+

def parse_inflection_section(wxr,
                             data,
@@ -3017,4 +3017,3 @@ def recurse(node, titles, navframe=False):
            f.write(section + "\n")
        text = wxr.wtp.node_to_wikitext(tree)
        f.write(text + "\n")
-
diff --git a/src/wiktextract/pronunciations.py b/src/wiktextract/pronunciations.py
index a600b76b..99cd4fa7 100644
--- a/src/wiktextract/pronunciations.py
+++ b/src/wiktextract/pronunciations.py
@@ -25,7 +25,7 @@
 pron_romanization_re = re.compile(
     "(?m)^(" +
     "|".join(re.escape(x) for x in
-             sorted(pron_romanizations.keys(), key=lambda x: len(x),
+             sorted(pron_romanizations.keys(), key=len,
                     reverse=True)) +
     ")([^\n]+)")

diff --git a/tests/test_clean.py b/tests/test_clean.py
index 7d4379c9..344633d2 100644
--- a/tests/test_clean.py
+++ b/tests/test_clean.py
@@ -21,7 +21,7 @@ def test_pos(self):
         poses = self.wxr.config.POS_TYPES
         self.assertTrue(isinstance(poses, set))
         for pos_type in ["noun", "verb", "pron", "adj", "adv", "num"]:
-            self.assertTrue(pos_type in poses)
+            self.assertIn(pos_type, poses)
         self.assertLess(len(poses), 50)

     def test_cv_plain(self):

From 7e4451f670f25bc5ebf63cc8c67d742cf92dabb0 Mon Sep 17 00:00:00 2001
From: xxyzz
Date: Fri, 13 Oct 2023 16:55:34 +0800
Subject: [PATCH 2/4] Don't extract thesaurus pages or analyze templates for
 French Wiktionary

French Wiktionary's thesaurus pages contain too many words that are not
synonyms and don't follow a common page layout, and word entry pages
already have a synonyms section. Disabling this feature removes many
error messages produced by the default English thesaurus extractor.

Pre-expanding templates just makes extracting French Wiktionary harder;
I haven't seen any template that should be pre-expanded.
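
For illustration only (an editor's sketch, not part of this patch): the new
per-edition switches are plain boolean attributes on WiktionaryConfig, and
load_edition_settings() lets an edition override them by shipping a
config.json next to its other data files. A minimal standalone version of
that override logic, where EditionConfig is a hypothetical stand-in class:

    import json
    from pathlib import Path

    class EditionConfig:
        def __init__(self, data_folder: Path) -> None:
            # Defaults, mirroring WiktionaryConfig.__init__ in this patch.
            self.analyze_templates = True
            self.extract_thesaurus_pages = True
            # data/fr/config.json ships {"analyze_templates": false,
            # "extract_thesaurus_pages": false}, so the French edition
            # turns both features off without any code changes.
            file_path = data_folder / "config.json"
            if file_path.exists():
                with file_path.open(encoding="utf-8") as f:
                    for key, value in json.load(f).items():
                        setattr(self, key, value)

    # Run from the repository root:
    # config = EditionConfig(Path("src/wiktextract/data/fr"))
    # assert config.extract_thesaurus_pages is False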
---
 languages/get_data.py                |  3 +-
 src/wiktextract/config.py            | 61 ++++++++--------------
 src/wiktextract/data/fr/config.json  |  4 ++
 src/wiktextract/extractor/fr/page.py |  6 ---
 src/wiktextract/page.py              |  3 +-
 src/wiktextract/wiktionary.py        |  6 ++-
 src/wiktextract/wiktwords.py         |  4 +-
 src/wiktextract/wxr_context.py       | 16 +++++---
 tests/test_fr_etymology.py           |  4 --
 tests/test_fr_form_line.py           |  4 --
 tests/test_fr_gloss.py               |  4 --
 tests/test_fr_inflection.py          |  4 --
 tests/test_fr_linkage.py             |  4 --
 tests/test_fr_page.py                |  4 --
 tests/test_fr_pronunciation.py       |  4 --
 tests/test_fr_translation.py         |  4 --
 16 files changed, 43 insertions(+), 92 deletions(-)
 create mode 100644 src/wiktextract/data/fr/config.json

diff --git a/languages/get_data.py b/languages/get_data.py
index 9de776c0..4730602e 100644
--- a/languages/get_data.py
+++ b/languages/get_data.py
@@ -83,7 +83,8 @@ def get_lang_data(lang_code: str, dump_file: str, db_path: Path | None) -> None:
     ) as fout:
         json.dump(data, fout, indent=2, ensure_ascii=False, sort_keys=True)
     wxr.wtp.close_db_conn()
-    close_thesaurus_db(wxr.thesaurus_db_path, wxr.thesaurus_db_conn)
+    if wxr.config.extract_thesaurus_pages:
+        close_thesaurus_db(wxr.thesaurus_db_path, wxr.thesaurus_db_conn)


 if __name__ == "__main__":
diff --git a/src/wiktextract/config.py b/src/wiktextract/config.py
index 77c30c2f..8d80276c 100644
--- a/src/wiktextract/config.py
+++ b/src/wiktextract/config.py
@@ -6,28 +6,15 @@
 import collections
 import json
 import sys
-from typing import TYPE_CHECKING, Callable, Optional
+from typing import Callable, Optional
+
+from wikitextprocessor.core import CollatedErrorReturnData

 if sys.version_info < (3, 10):
     from importlib_resources import files
 else:
     from importlib.resources import files

-if TYPE_CHECKING:
-    from wikitextprocessor.core import StatsData
-
-
-def int_dict():
-    return collections.defaultdict(int)
-
-
-def int_dict_dict():
-    return collections.defaultdict(int_dict)
-
-
-def list_dict():
-    return collections.defaultdict(list)
-

 class WiktionaryConfig:
     """This class holds configuration data for Wiktionary parsing."""
@@ -66,6 +53,8 @@ class WiktionaryConfig:
         "LANGUAGES_BY_NAME",
         "LANGUAGES_BY_CODE",
         "FORM_OF_TEMPLATES",
+        "analyze_templates",
+        "extract_thesaurus_pages",
     )

     def __init__(
@@ -130,38 +119,13 @@ def __init__(
             self.set_attr_from_json(
                 "FORM_OF_TEMPLATES", "form_of_templates.json"
             )
-        if dump_file_lang_code == "fr":
-            self.set_attr_from_json("FR_FORM_TABLES", "form_tables.json")
         if dump_file_lang_code == "de":
             self.set_attr_from_json("DE_FORM_TABLES", "form_templates.json")
+        self.analyze_templates = True  # find templates that need pre-expand
+        self.extract_thesaurus_pages = True
+        self.load_edition_settings()

-    def to_kwargs(self):
-        return {
-            "dump_file_lang_code": self.dump_file_lang_code,
-            "capture_language_codes": self.capture_language_codes,
-            "capture_translations": self.capture_translations,
-            "capture_pronunciation": self.capture_pronunciation,
-            "capture_linkages": self.capture_linkages,
-            "capture_compounds": self.capture_compounds,
-            "capture_redirects": self.capture_redirects,
-            "capture_examples": self.capture_examples,
-            "capture_etymologies": self.capture_etymologies,
-            "capture_inflections": self.capture_inflections,
-            "capture_descendants": self.capture_descendants,
-            "verbose": self.verbose,
-            "expand_tables": self.expand_tables,
-        }
-
-    def to_return(self) -> "StatsData":
-        return {
-            "num_pages": self.num_pages,
-            "language_counts": self.language_counts,
-            "pos_counts": self.pos_counts,
-            "section_counts": self.section_counts,
-        }
-
-    def merge_return(self, ret):
-        assert isinstance(ret, dict)
+    def merge_return(self, ret: CollatedErrorReturnData):
         if "num_pages" in ret:
             self.num_pages += ret["num_pages"]
         for k, v in ret["language_counts"].items():
@@ -271,3 +235,10 @@ def alias_info(name, new_code, kind, old_code, use_code, not_use_code):
             )
         else:
             self.LANGUAGES_BY_NAME[lang_name] = lang_code
+
+    def load_edition_settings(self):
+        file_path = self.data_folder / "config.json"
+        if file_path.exists():
+            with file_path.open(encoding="utf-8") as f:
+                for key, value in json.load(f).items():
+                    setattr(self, key, value)
diff --git a/src/wiktextract/data/fr/config.json b/src/wiktextract/data/fr/config.json
new file mode 100644
index 00000000..91a7ba44
--- /dev/null
+++ b/src/wiktextract/data/fr/config.json
@@ -0,0 +1,4 @@
+{
+  "analyze_templates": false,
+  "extract_thesaurus_pages": false
+}
diff --git a/src/wiktextract/extractor/fr/page.py b/src/wiktextract/extractor/fr/page.py
index 0f797e4a..73a2b1b3 100644
--- a/src/wiktextract/extractor/fr/page.py
+++ b/src/wiktextract/extractor/fr/page.py
@@ -152,12 +152,6 @@ def parse_page(
         page_text,
         pre_expand=True,
         additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
-        do_not_pre_expand={
-            "trad-début",  # don't expand translation start/end tempaltes
-            "trad-fin",
-            "(",  # similar to "trad-debut", pre-expand breaks node structre
-            ")",
-        },
     )

     page_data = []
diff --git a/src/wiktextract/page.py b/src/wiktextract/page.py
index 4c5cea9d..d7d43c4d 100644
--- a/src/wiktextract/page.py
+++ b/src/wiktextract/page.py
@@ -36,7 +36,8 @@ def parse_page(
     captured."""
     page_extractor_mod = import_extractor_module(wxr.wtp.lang_code, "page")
     page_data = page_extractor_mod.parse_page(wxr, page_title, page_text)
-    inject_linkages(wxr, page_data)
+    if wxr.config.extract_thesaurus_pages:
+        inject_linkages(wxr, page_data)
     if wxr.config.dump_file_lang_code == "en":
         process_categories(wxr, page_data)
     remove_duplicate_data(page_data)
diff --git a/src/wiktextract/wiktionary.py b/src/wiktextract/wiktionary.py
index 9a65a3d5..21988c90 100644
--- a/src/wiktextract/wiktionary.py
+++ b/src/wiktextract/wiktionary.py
@@ -119,6 +119,7 @@ def parse_wiktionary(
         override_folders,
         skip_extract_dump,
         save_pages_path,
+        not wxr.config.analyze_templates,
     )

     if not phase1_only:
@@ -178,7 +179,10 @@ def reprocess_wiktionary(

     # Extract thesaurus data. This iterates over thesaurus pages,
     # but is very fast.
-    if thesaurus_linkage_number(wxr.thesaurus_db_conn) == 0:
+    if (
+        wxr.config.extract_thesaurus_pages
+        and thesaurus_linkage_number(wxr.thesaurus_db_conn) == 0
+    ):
         extract_thesaurus_data(wxr, num_processes)

     emitted = set()
diff --git a/src/wiktextract/wiktwords.py b/src/wiktextract/wiktwords.py
index fc4b4740..4cdc5067 100755
--- a/src/wiktextract/wiktwords.py
+++ b/src/wiktextract/wiktwords.py
@@ -83,6 +83,7 @@ def process_single_page(
     # is disabled by default to speed up single page testing.
     if (
         args.use_thesaurus
+        and wxr.config.extract_thesaurus_pages
         and thesaurus_linkage_number(wxr.thesaurus_db_conn) == 0
     ):
         extract_thesaurus_data(wxr)
@@ -507,7 +508,8 @@ def main():
             json.dump(tree, f, indent=2, sort_keys=True)

     wxr.wtp.close_db_conn()
-    close_thesaurus_db(wxr.thesaurus_db_path, wxr.thesaurus_db_conn)
+    if wxr.config.extract_thesaurus_pages:
+        close_thesaurus_db(wxr.thesaurus_db_path, wxr.thesaurus_db_conn)

     if args.profile:
         pr.disable()
diff --git a/src/wiktextract/wxr_context.py b/src/wiktextract/wxr_context.py
index a6b395d0..794fc777 100644
--- a/src/wiktextract/wxr_context.py
+++ b/src/wiktextract/wxr_context.py
@@ -28,15 +28,20 @@ def __init__(self, wtp: Wtp, config: WiktionaryConfig):
         self.thesaurus_db_path = wtp.db_path.with_stem(
             f"{wtp.db_path.stem}_thesaurus"
         )
-        self.thesaurus_db_conn = init_thesaurus_db(self.thesaurus_db_path)
+        self.thesaurus_db_conn = (
+            init_thesaurus_db(self.thesaurus_db_path)
+            if config.extract_thesaurus_pages
+            else None
+        )

     def reconnect_databases(self, check_same_thread: bool = True) -> None:
         # `multiprocessing.pool.Pool.imap()` runs in another thread, if the db
         # connection is used to create iterable data for `imap`,
         # `check_same_thread` must be `False`.
-        self.thesaurus_db_conn = sqlite3.connect(
-            self.thesaurus_db_path, check_same_thread=check_same_thread
-        )
+        if self.config.extract_thesaurus_pages:
+            self.thesaurus_db_conn = sqlite3.connect(
+                self.thesaurus_db_path, check_same_thread=check_same_thread
+            )
         self.wtp.db_conn = sqlite3.connect(
             self.wtp.db_path, check_same_thread=check_same_thread
         )
@@ -44,7 +49,8 @@ def reconnect_databases(self, check_same_thread: bool = True) -> None:
     def remove_unpicklable_objects(self) -> None:
         # remove these variables before passing the `WiktextractContext` object
         # to worker processes
-        self.thesaurus_db_conn.close()
+        if self.config.extract_thesaurus_pages:
+            self.thesaurus_db_conn.close()
         self.thesaurus_db_conn = None
         self.wtp.db_conn.close()
         self.wtp.db_conn = None
diff --git a/tests/test_fr_etymology.py b/tests/test_fr_etymology.py
index 7275cd61..ee25cd3f 100644
--- a/tests/test_fr_etymology.py
+++ b/tests/test_fr_etymology.py
@@ -8,7 +8,6 @@
     extract_etymology,
     insert_etymology_data,
 )
-from wiktextract.thesaurus import close_thesaurus_db
 from wiktextract.wxr_context import WiktextractContext


@@ -20,9 +19,6 @@ def setUp(self) -> None:

     def tearDown(self) -> None:
         self.wxr.wtp.close_db_conn()
-        close_thesaurus_db(
-            self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
-        )

     def test_ebauche_etym(self):
         # https://fr.wiktionary.org/wiki/Hörsaal
diff --git a/tests/test_fr_form_line.py b/tests/test_fr_form_line.py
index 5c8ba1d6..f7bbcd72 100644
--- a/tests/test_fr_form_line.py
+++ b/tests/test_fr_form_line.py
@@ -9,7 +9,6 @@
     extract_form_line,
     process_zh_mot_template,
 )
-from wiktextract.thesaurus import close_thesaurus_db
 from wiktextract.wxr_context import WiktextractContext


@@ -21,9 +20,6 @@ def setUp(self) -> None:

     def tearDown(self) -> None:
         self.wxr.wtp.close_db_conn()
-        close_thesaurus_db(
-            self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
-        )

     @patch(
         "wiktextract.extractor.fr.pronunciation.clean_node",
diff --git a/tests/test_fr_gloss.py b/tests/test_fr_gloss.py
index 5f5d11d6..845bb34a 100644
--- a/tests/test_fr_gloss.py
+++ b/tests/test_fr_gloss.py
@@ -7,7 +7,6 @@
 from wiktextract.config import WiktionaryConfig
 from wiktextract.extractor.fr.gloss import extract_gloss
 from wiktextract.extractor.fr.page import process_pos_block
-from wiktextract.thesaurus import close_thesaurus_db
 from wiktextract.wxr_context import WiktextractContext


@@ -19,9 +18,6 @@ def setUp(self) -> None:

     def tearDown(self) -> None:
         self.wxr.wtp.close_db_conn()
-        close_thesaurus_db(
-            self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
-        )

     @patch(
         "wikitextprocessor.Wtp.get_page",
diff --git a/tests/test_fr_inflection.py b/tests/test_fr_inflection.py
index abb1042f..f793aea7 100644
--- a/tests/test_fr_inflection.py
+++ b/tests/test_fr_inflection.py
@@ -7,7 +7,6 @@

 from wiktextract.config import WiktionaryConfig
 from wiktextract.extractor.fr.inflection import extract_inflection
-from wiktextract.thesaurus import close_thesaurus_db
 from wiktextract.wxr_context import WiktextractContext


@@ -19,9 +18,6 @@ def setUp(self) -> None:

     def tearDown(self) -> None:
         self.wxr.wtp.close_db_conn()
-        close_thesaurus_db(
-            self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
-        )

     @patch(
         "wikitextprocessor.Wtp.node_to_wikitext",
diff --git a/tests/test_fr_linkage.py b/tests/test_fr_linkage.py
index 3b0fbb5f..6b8b2f70 100644
--- a/tests/test_fr_linkage.py
+++ b/tests/test_fr_linkage.py
@@ -5,7 +5,6 @@

 from wiktextract.config import WiktionaryConfig
 from wiktextract.extractor.fr.linkage import extract_linkage
-from wiktextract.thesaurus import close_thesaurus_db
 from wiktextract.wxr_context import WiktextractContext


@@ -17,9 +16,6 @@ def setUp(self) -> None:

     def tearDown(self) -> None:
         self.wxr.wtp.close_db_conn()
-        close_thesaurus_db(
-            self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
-        )

     def test_tags(self):
         page_data = [defaultdict(list)]
diff --git a/tests/test_fr_page.py b/tests/test_fr_page.py
index c481027b..e372d5e4 100644
--- a/tests/test_fr_page.py
+++ b/tests/test_fr_page.py
@@ -8,7 +8,6 @@

 from wiktextract.config import WiktionaryConfig
 from wiktextract.extractor.fr.page import parse_page
-from wiktextract.thesaurus import close_thesaurus_db
 from wiktextract.wxr_context import WiktextractContext


@@ -23,9 +22,6 @@ def setUp(self):

     def tearDown(self) -> None:
         self.wxr.wtp.close_db_conn()
-        close_thesaurus_db(
-            self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
-        )

     def test_fr_parse_page(self):
         self.wxr.wtp.add_page("Modèle:langue", 10, "Français")
diff --git a/tests/test_fr_pronunciation.py b/tests/test_fr_pronunciation.py
index f2b665ca..fc771cb9 100644
--- a/tests/test_fr_pronunciation.py
+++ b/tests/test_fr_pronunciation.py
@@ -6,7 +6,6 @@

 from wiktextract.config import WiktionaryConfig
 from wiktextract.extractor.fr.pronunciation import extract_pronunciation
-from wiktextract.thesaurus import close_thesaurus_db
 from wiktextract.wxr_context import WiktextractContext


@@ -18,9 +17,6 @@ def setUp(self) -> None:

     def tearDown(self) -> None:
         self.wxr.wtp.close_db_conn()
-        close_thesaurus_db(
-            self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
-        )

     def test_pron_list(self):
         page_data = [
diff --git a/tests/test_fr_translation.py b/tests/test_fr_translation.py
index b687018f..a161d7fa 100644
--- a/tests/test_fr_translation.py
+++ b/tests/test_fr_translation.py
@@ -5,7 +5,6 @@

 from wiktextract.config import WiktionaryConfig
 from wiktextract.extractor.fr.translation import extract_translation
-from wiktextract.thesaurus import close_thesaurus_db
 from wiktextract.wxr_context import WiktextractContext


@@ -17,9 +16,6 @@ def setUp(self) -> None:

     def tearDown(self) -> None:
         self.wxr.wtp.close_db_conn()
-        close_thesaurus_db(
-            self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
-        )

     def test_italic_tag(self):
         self.wxr.wtp.start_page("")
From b2b54d39cfc051b7e3213851d08932bba4f59d9e Mon Sep 17 00:00:00 2001
From: xxyzz
Date: Mon, 16 Oct 2023 15:53:51 +0800
Subject: [PATCH 3/4] Only add the italic node as a tag if it's between
 parentheses

---
 src/wiktextract/extractor/fr/gloss.py      | 28 ++++++++++++++++------
 src/wiktextract/extractor/fr/inflection.py |  5 +++-
 tests/test_fr_gloss.py                     | 19 +++++++++++++++
 3 files changed, 44 insertions(+), 8 deletions(-)

diff --git a/src/wiktextract/extractor/fr/gloss.py b/src/wiktextract/extractor/fr/gloss.py
index bcb03994..c3eb1f8c 100644
--- a/src/wiktextract/extractor/fr/gloss.py
+++ b/src/wiktextract/extractor/fr/gloss.py
@@ -43,15 +43,29 @@ def extract_gloss(
         )

         gloss_only_nodes = []
-        # extract italic tags
-        for node in gloss_nodes[gloss_start:]:
-            if isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
+        tag_indexes = set()
+        for index, node in enumerate(gloss_nodes[gloss_start:], gloss_start):
+            # if an italic node is between parentheses then it's a tag; also
+            # don't add the parenthesis strings to `gloss_only_nodes`
+            if (
+                isinstance(node, WikiNode)
+                and node.kind == NodeKind.ITALIC
+                and index > gloss_start
+                and isinstance(gloss_nodes[index - 1], str)
+                and gloss_nodes[index - 1].strip() == "("
+                and index + 1 < len(gloss_nodes)
+                and isinstance(gloss_nodes[index + 1], str)
+                and gloss_nodes[index + 1].strip() == ")"
+            ):
                 gloss_data["tags"].append(clean_node(wxr, None, node))
+                tag_indexes |= {index - 1, index, index + 1}
                 continue
-            elif isinstance(node, str) and node.strip() in ["(", ")"]:
-                # remove parentheses around italic node
-                continue
-            gloss_only_nodes.append(node)
+
+        gloss_only_nodes = [
+            node
+            for index, node in enumerate(gloss_nodes[gloss_start:], gloss_start)
+            if index not in tag_indexes
+        ]
         gloss_text = clean_node(wxr, gloss_data, gloss_only_nodes)
         gloss_data["glosses"] = [gloss_text]
         extract_examples(wxr, gloss_data, list_item_node)
diff --git a/src/wiktextract/extractor/fr/inflection.py b/src/wiktextract/extractor/fr/inflection.py
index c9ee14e3..d65f3d88 100644
--- a/src/wiktextract/extractor/fr/inflection.py
+++ b/src/wiktextract/extractor/fr/inflection.py
@@ -36,6 +36,7 @@ def extract_inflection(
         }
     )

+
 @dataclass
 class ColspanHeader:
     text: str
@@ -123,7 +124,9 @@ def process_inflection_table(
                     )
                 else:
                     column_headers.append(table_header_text)
-                column_cell_index += int(table_cell.attrs.get("colspan", 1))
+                column_cell_index += int(
+                    table_cell.attrs.get("colspan", 1)
+                )
             elif row_num > 0:
                 row_headers.append(table_header_text)
                 if "rowspan" in table_cell.attrs:
diff --git a/tests/test_fr_gloss.py b/tests/test_fr_gloss.py
index 845bb34a..95d601c0 100644
--- a/tests/test_fr_gloss.py
+++ b/tests/test_fr_gloss.py
@@ -196,3 +196,22 @@ def test_italic_tag(self):
             }
         ],
     )
+
+    def test_not_italic_tag(self):
+        # https://fr.wiktionary.org/wiki/bec-en-ciseaux
+        self.wxr.wtp.start_page("bec-en-ciseaux")
+        root = self.wxr.wtp.parse(
+            "# [[oiseau|Oiseau]] aquatique de taille moyenne du genre ''[[Rhynchops]]''."
+        )
+        page_data = [defaultdict(list)]
+        extract_gloss(self.wxr, page_data, root.children[0])
+        self.assertEqual(
+            page_data,
+            [
+                {
+                    "senses": [
+                        {"glosses": ["Oiseau aquatique de taille moyenne du genre Rhynchops."]}
+                    ]
+                }
+            ],
+        )

From d7d04bbd4ee06881a71f3a21be9984b4278aa092 Mon Sep 17 00:00:00 2001
From: xxyzz
Date: Mon, 16 Oct 2023 17:12:58 +0800
Subject: [PATCH 4/4] Preserve whitespace in gloss nodes

---
 src/wiktextract/extractor/fr/gloss.py | 31 ++++++++++++++++++---------
 tests/test_fr_gloss.py                | 20 +++++++++++++++++
 2 files changed, 41 insertions(+), 10 deletions(-)

diff --git a/src/wiktextract/extractor/fr/gloss.py b/src/wiktextract/extractor/fr/gloss.py
index c3eb1f8c..f93fd0cd 100644
--- a/src/wiktextract/extractor/fr/gloss.py
+++ b/src/wiktextract/extractor/fr/gloss.py
@@ -14,19 +14,30 @@ def extract_gloss(
     list_node: WikiNode,
 ) -> None:
     for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
-        gloss_nodes = list(list_item_node.invert_find_child(NodeKind.LIST))
+        gloss_nodes = list(
+            list_item_node.invert_find_child(
+                NodeKind.LIST, include_empty_str=True
+            )
+        )
+        # remove a leading whitespace-only string from the list item nodes
+        if (
+            len(gloss_nodes) > 0
+            and isinstance(gloss_nodes[0], str)
+            and len(gloss_nodes[0].strip()) == 0
+        ):
+            gloss_nodes = gloss_nodes[1:]
+
         gloss_data = defaultdict(list)
         gloss_start = 0
         # process modifier, theme tempaltes before gloss text
         # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Précisions_de_sens
-        if (
-            len(gloss_nodes) > 0
-            and isinstance(gloss_nodes[0], WikiNode)
-            and gloss_nodes[0].kind == NodeKind.TEMPLATE
-        ):
+        if len(gloss_nodes) > 0 and isinstance(gloss_nodes[0], TemplateNode):
             gloss_start = 1
             for index, gloss_node in enumerate(gloss_nodes[1:], 1):
-                if (
+                if isinstance(gloss_node, str) and len(gloss_node.strip()) == 0:
+                    # ignore empty string
+                    gloss_start = index + 1
+                elif (
                     not isinstance(gloss_node, WikiNode)
                     or gloss_node.kind != NodeKind.TEMPLATE
                     # template "variante de" is not a modifier
@@ -38,9 +49,9 @@ def extract_gloss(
                 else:
                     gloss_start = index + 1
             for tag_node in gloss_nodes[:gloss_start]:
-                gloss_data["tags"].append(
-                    clean_node(wxr, gloss_data, tag_node).strip("()")
-                )
+                tag = clean_node(wxr, gloss_data, tag_node).strip("() ")
+                if len(tag) > 0:
+                    gloss_data["tags"].append(tag)
diff --git a/tests/test_fr_gloss.py b/tests/test_fr_gloss.py
index 95d601c0..84eccd86 100644
--- a/tests/test_fr_gloss.py
+++ b/tests/test_fr_gloss.py
@@ -215,3 +215,23 @@ def test_not_italic_tag(self):
             }
         ],
     )
+
+    def test_preserve_space_between_tags(self):
+        # https://fr.wiktionary.org/wiki/becs-en-ciseaux
+        # the space between the italic node and the link node should be preserved
+        self.wxr.wtp.start_page("becs-en-ciseaux")
+        root = self.wxr.wtp.parse(
+            "# ''Pluriel de'' [[bec-en-ciseaux]]."
+        )
+        page_data = [defaultdict(list)]
+        extract_gloss(self.wxr, page_data, root.children[0])
+        self.assertEqual(
+            page_data,
+            [
+                {
+                    "senses": [
+                        {"glosses": ["Pluriel de bec-en-ciseaux."]}
+                    ]
+                }
+            ],
+        )
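
Editor's note, appended for illustration (not part of the patches): a short
sketch of the whitespace rule from PATCH 4/4 for a gloss like
"''Pluriel de'' [[bec-en-ciseaux]].", using placeholder strings where the
real code holds WikiNode objects:

    # With include_empty_str=True, the list item keeps its whitespace-only
    # string children, roughly:
    gloss_nodes = [" ", "<italic: Pluriel de>", " ", "<link: bec-en-ciseaux>", "."]

    # Only a leading all-whitespace string is dropped; the separator between
    # the italic node and the link survives, so the gloss renders as
    # "Pluriel de bec-en-ciseaux." rather than "Pluriel debec-en-ciseaux.".
    if (
        len(gloss_nodes) > 0
        and isinstance(gloss_nodes[0], str)
        and len(gloss_nodes[0].strip()) == 0
    ):
        gloss_nodes = gloss_nodes[1:]

    assert gloss_nodes == ["<italic: Pluriel de>", " ", "<link: bec-en-ciseaux>", "."]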