Replace data: str with candidates: Sequence[str]

When the spelling dictionaries are loaded, the correction line was previously just stored in memory as plain text. Throughout the code, callers then had to deal with the `data` attribute and correctly `split()` + `strip()` it. With this change, the dictionary parsing code now encapsulates this problem.

The auto-correction works from the assumption that there is only one candidate. This assumption is an invariant and seems to be properly maintained in the code. Therefore, we can just pick the first candidate word when doing a correction.

In the code, the following name changes are performed:

* `Misspelling.data` -> `Misspelling.candidates`
* `fixword` -> `candidates` when used for multiple candidates (`fixword` remains for when it is a correction)

On performance:

Performance-wise, this change moves computation from "checking" time to "startup" time. The performance cost does not appear to be noticeable in my baseline (codespell-project#3419). Though, keep the corpus weakness on the ratio of cased vs. non-cased corrections with multiple candidates in mind.

The all-lowercase typo is now slightly more expensive (in the original code, it was passed through `fix_case` and fed directly into the `print`; in the new code, it will always need a `join`). There is still an overweight of lower-case-only corrections in general, so the unconditional `.join` alone is not sufficient to affect the performance noticeably.
nthykier · May 25, 2024 · 14c86b1 · 14c86b1
1 parent 3a61c38
commit 14c86b1
Show file tree

Hide file tree

Showing 3 changed files with 30 additions and 23 deletions.
diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py
@@ -715,10 +715,10 @@ def ask_for_word_fix(
     misspelling: Misspelling,
     interactivity: int,
     colors: TermColors,
-) -> Tuple[bool, str]:
+) -> Tuple[bool, Sequence[str]]:
     wrongword = match.group()
     if interactivity <= 0:
-        return misspelling.fix, fix_case(wrongword, misspelling.data)
+        return misspelling.fix, fix_case(wrongword, misspelling.candidates)
 
     line_ui = (
         f"{line[:match.start()]}"
@@ -728,7 +728,8 @@ def ask_for_word_fix(
 
     if misspelling.fix and interactivity & 1:
         r = ""
-        fixword = fix_case(wrongword, misspelling.data)
+        candidates = fix_case(wrongword, misspelling.candidates)
+        fixword = candidates[0]
         while not r:
             print(f"{line_ui}\t{wrongword} ==> {fixword} (Y/n) ", end="", flush=True)
             r = sys.stdin.readline().strip().upper()
@@ -746,12 +747,12 @@ def ask_for_word_fix(
         # we ask the user which word to use
 
         r = ""
-        opt = [w.strip() for w in misspelling.data.split(",")]
+        opt = misspelling.candidates
         while not r:
             print(f"{line_ui} Choose an option (blank for none): ", end="")
-            for i, o in enumerate(opt):
-                fixword = fix_case(wrongword, o)
-                print(f" {i}) {fixword}", end="")
+            cased_candidates = fix_case(wrongword, opt)
+            for i, candidates in enumerate(cased_candidates):
+                print(f" {i}) {candidates}", end="")
             print(": ", end="", flush=True)
 
             n = sys.stdin.readline().strip()
@@ -766,9 +767,9 @@ def ask_for_word_fix(
 
         if r:
             misspelling.fix = True
-            misspelling.data = r
+            misspelling.candidates = (r,)
 
-    return misspelling.fix, fix_case(wrongword, misspelling.data)
+    return misspelling.fix, fix_case(wrongword, misspelling.candidates)
 
 
 def print_context(
@@ -860,14 +861,14 @@ def parse_file(
                 if lword not in misspellings:
                     continue
                 fix = misspellings[lword].fix
-                fixword = fix_case(word, misspellings[lword].data)
+                candidates = fix_case(word, misspellings[lword].candidates)
 
                 if summary and fix:
                     summary.update(lword)
 
                 cfilename = f"{colors.FILE}{filename}{colors.DISABLE}"
                 cwrongword = f"{colors.WWORD}{word}{colors.DISABLE}"
-                crightword = f"{colors.FWORD}{fixword}{colors.DISABLE}"
+                crightword = f"{colors.FWORD}{', '.join(candidates)}{colors.DISABLE}"
 
                 reason = misspellings[lword].reason
                 if reason:
@@ -957,13 +958,13 @@ def parse_file(
 
                 context_shown = False
                 fix = misspellings[lword].fix
-                fixword = fix_case(word, misspellings[lword].data)
+                candidates = fix_case(word, misspellings[lword].candidates)
 
                 if options.interactive and lword not in asked_for:
                     if context is not None:
                         context_shown = True
                         print_context(lines, i, context)
-                    fix, fixword = ask_for_word_fix(
+                    fix, candidates = ask_for_word_fix(
                         lines[i],
                         match,
                         misspellings[lword],
@@ -980,7 +981,7 @@ def parse_file(
 
                 if options.write_changes and fix:
                     changed = True
-                    lines[i] = re.sub(rf"\b{word}\b", fixword, lines[i])
+                    lines[i] = re.sub(rf"\b{word}\b", candidates[0], lines[i])
                     fixed_words.add(word)
                     continue
 
@@ -995,7 +996,7 @@ def parse_file(
                 cfilename = f"{colors.FILE}{filename}{colors.DISABLE}"
                 cline = f"{colors.FILE}{i + 1}{colors.DISABLE}"
                 cwrongword = f"{colors.WWORD}{word}{colors.DISABLE}"
-                crightword = f"{colors.FWORD}{fixword}{colors.DISABLE}"
+                crightword = f"{colors.FWORD}{', '.join(candidates)}{colors.DISABLE}"
 
                 reason = misspellings[lword].reason
                 if reason:

diff --git a/codespell_lib/_text_util.py b/codespell_lib/_text_util.py
@@ -16,12 +16,14 @@
 Copyright (C) 2011  ProFUSION embedded systems
 """
 
+from typing import Sequence
 
-def fix_case(word: str, fixword: str) -> str:
+
+def fix_case(word: str, candidates: Sequence[str]) -> Sequence[str]:
     if word == word.capitalize():
-        return ", ".join(w.strip().capitalize() for w in fixword.split(","))
+        return tuple(c.capitalize() for c in candidates)
     if word == word.upper():
-        return fixword.upper()
+        return tuple(c.upper() for c in candidates)
     # they are both lower case
     # or we don't have any idea
-    return fixword
+    return candidates
diff --git a/codespell_lib/spellchecker.py b/codespell_lib/spellchecker.py
@@ -15,16 +15,16 @@
 Copyright (C) 2010-2011  Lucas De Marchi <[email protected]>
 Copyright (C) 2011  ProFUSION embedded systems
 """
-from typing import Dict, Set
+from typing import Dict, Set, Sequence
 
 # Pass all misspellings through this translation table to generate
 # alternative misspellings and fixes.
 alt_chars = (("'", "’"),)  # noqa: RUF001
 
 
 class Misspelling:
-    def __init__(self, data: str, fix: bool, reason: str) -> None:
-        self.data = data
+    def __init__(self, candidates: Sequence[str], fix: bool, reason: str) -> None:
+        self.candidates = candidates
         self.fix = fix
         self.reason = reason
 
@@ -44,7 +44,11 @@ def add_misspelling(
         fix = True
         reason = ""
 
-    misspellings[key] = Misspelling(data, fix, reason)
+    misspellings[key] = Misspelling(
+        tuple(c.strip() for c in data.split(",")),
+        fix,
+        reason,
+    )
 
 
 def build_dict(