From eb2a63690a5f26f7dad92eabc215423792c841f3 Mon Sep 17 00:00:00 2001 From: Niels Thykier Date: Fri, 17 May 2024 08:21:10 +0000 Subject: [PATCH] Replace `data: str` with `candidates: Sequence[str]` Previously, when the spelling dictionaries were loaded, the correction line was just stored in memory as plain text. Throughout the code, callers would then have to deal with the `data` attribute and correctly `split()` + `strip()` it. With this change, the dictionary parsing code now encapsulates this problem. The auto-correction works from the assumption that there is only one candidate. This assumption is an invariant and seems to be properly maintained in the code. Therefore, we can just pick the first candidate word when doing a correction. In the code, the following name changes are performed: * `Misspelling.data` -> `Misspelling.candidates` * `fixword` -> `candidates` when used for multiple candidates (`fixword` remains for when it is a single correction) On performance: Performance-wise, this change moves computation from "checking" time to "startup" time. The performance cost does not appear to be noticeable in my baseline (#3419). However, keep in mind the corpus's weakness regarding the ratio of cased vs. non-cased corrections with multiple candidates. The all-lowercase typo is now slightly more expensive (it was passed through `fix_case` unchanged and fed directly into the `print` in the original code; in the new code, it will always need a `join`). There is still a predominance of lower-case-only corrections in general, so the unconditional `.join` alone is not sufficient to affect the performance noticeably. 
--- codespell_lib/_codespell.py | 31 ++++++++++++++++--------------- codespell_lib/_text_util.py | 10 ++++++---- codespell_lib/spellchecker.py | 12 ++++++++---- 3 files changed, 30 insertions(+), 23 deletions(-) diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index c36a83567d5..48d6d6fd6f6 100644 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -715,10 +715,10 @@ def ask_for_word_fix( misspelling: Misspelling, interactivity: int, colors: TermColors, -) -> Tuple[bool, str]: +) -> Tuple[bool, Sequence[str]]: wrongword = match.group() if interactivity <= 0: - return misspelling.fix, fix_case(wrongword, misspelling.data) + return misspelling.fix, fix_case(wrongword, misspelling.candidates) line_ui = ( f"{line[:match.start()]}" @@ -728,7 +728,8 @@ def ask_for_word_fix( if misspelling.fix and interactivity & 1: r = "" - fixword = fix_case(wrongword, misspelling.data) + candidates = fix_case(wrongword, misspelling.candidates) + fixword = candidates[0] while not r: print(f"{line_ui}\t{wrongword} ==> {fixword} (Y/n) ", end="", flush=True) r = sys.stdin.readline().strip().upper() @@ -746,12 +747,12 @@ def ask_for_word_fix( # we ask the user which word to use r = "" - opt = [w.strip() for w in misspelling.data.split(",")] + opt = misspelling.candidates while not r: print(f"{line_ui} Choose an option (blank for none): ", end="") - for i, o in enumerate(opt): - fixword = fix_case(wrongword, o) - print(f" {i}) {fixword}", end="") + cased_candidates = fix_case(wrongword, opt) + for i, candidates in enumerate(cased_candidates): + print(f" {i}) {candidates}", end="") print(": ", end="", flush=True) n = sys.stdin.readline().strip() @@ -766,9 +767,9 @@ def ask_for_word_fix( if r: misspelling.fix = True - misspelling.data = r + misspelling.candidates = (r,) - return misspelling.fix, fix_case(wrongword, misspelling.data) + return misspelling.fix, fix_case(wrongword, misspelling.candidates) def print_context( @@ -860,14 +861,14 @@ def 
parse_file( if lword not in misspellings: continue fix = misspellings[lword].fix - fixword = fix_case(word, misspellings[lword].data) + candidates = fix_case(word, misspellings[lword].candidates) if summary and fix: summary.update(lword) cfilename = f"{colors.FILE}{filename}{colors.DISABLE}" cwrongword = f"{colors.WWORD}{word}{colors.DISABLE}" - crightword = f"{colors.FWORD}{fixword}{colors.DISABLE}" + crightword = f"{colors.FWORD}{', '.join(candidates)}{colors.DISABLE}" reason = misspellings[lword].reason if reason: @@ -957,13 +958,13 @@ def parse_file( context_shown = False fix = misspellings[lword].fix - fixword = fix_case(word, misspellings[lword].data) + candidates = fix_case(word, misspellings[lword].candidates) if options.interactive and lword not in asked_for: if context is not None: context_shown = True print_context(lines, i, context) - fix, fixword = ask_for_word_fix( + fix, candidates = ask_for_word_fix( lines[i], match, misspellings[lword], @@ -980,7 +981,7 @@ def parse_file( if options.write_changes and fix: changed = True - lines[i] = re.sub(rf"\b{word}\b", fixword, lines[i]) + lines[i] = re.sub(rf"\b{word}\b", candidates[0], lines[i]) fixed_words.add(word) continue @@ -995,7 +996,7 @@ def parse_file( cfilename = f"{colors.FILE}{filename}{colors.DISABLE}" cline = f"{colors.FILE}{i + 1}{colors.DISABLE}" cwrongword = f"{colors.WWORD}{word}{colors.DISABLE}" - crightword = f"{colors.FWORD}{fixword}{colors.DISABLE}" + crightword = f"{colors.FWORD}{', '.join(candidates)}{colors.DISABLE}" reason = misspellings[lword].reason if reason: diff --git a/codespell_lib/_text_util.py b/codespell_lib/_text_util.py index 18a2ec89b40..c141db503d7 100644 --- a/codespell_lib/_text_util.py +++ b/codespell_lib/_text_util.py @@ -16,12 +16,14 @@ Copyright (C) 2011 ProFUSION embedded systems """ +from typing import Sequence -def fix_case(word: str, fixword: str) -> str: + +def fix_case(word: str, candidates: Sequence[str]) -> Sequence[str]: if word == word.capitalize(): - 
return ", ".join(w.strip().capitalize() for w in fixword.split(",")) + return tuple(c.capitalize() for c in candidates) if word == word.upper(): - return fixword.upper() + return tuple(c.upper() for c in candidates) # they are both lower case # or we don't have any idea - return fixword + return candidates diff --git a/codespell_lib/spellchecker.py b/codespell_lib/spellchecker.py index a03eeeba0e2..975cc173d53 100644 --- a/codespell_lib/spellchecker.py +++ b/codespell_lib/spellchecker.py @@ -15,7 +15,7 @@ Copyright (C) 2010-2011 Lucas De Marchi Copyright (C) 2011 ProFUSION embedded systems """ -from typing import Dict, Set +from typing import Dict, Set, Sequence # Pass all misspellings through this translation table to generate # alternative misspellings and fixes. @@ -23,8 +23,8 @@ class Misspelling: - def __init__(self, data: str, fix: bool, reason: str) -> None: - self.data = data + def __init__(self, candidates: Sequence[str], fix: bool, reason: str) -> None: + self.candidates = candidates self.fix = fix self.reason = reason @@ -44,7 +44,11 @@ def add_misspelling( fix = True reason = "" - misspellings[key] = Misspelling(data, fix, reason) + misspellings[key] = Misspelling( + tuple(c.strip() for c in data.split(",")), + fix, + reason, + ) def build_dict(