From eb2a63690a5f26f7dad92eabc215423792c841f3 Mon Sep 17 00:00:00 2001
From: Niels Thykier <niels@thykier.net>
Date: Fri, 17 May 2024 08:21:10 +0000
Subject: [PATCH] Replace `data: str` with `candidates: Sequence[str]`

Previously, when the spelling dictionaries were loaded, the correction
line was simply stored in memory as plain text. Throughout the code,
callers then had to deal with the `data` attribute and correctly
`split()` + `strip()` it. With this change, the dictionary parsing
code now encapsulates this problem.

The auto-correction works from the assumption that there is only one
candidate. This assumption is an invariant and seems to be properly
maintained in the code. Therefore, we can just pick the first
candidate word when doing a correction.

In the code, the following name changes are performed:

 * `Misspelling.data` -> `Misspelling.candidates`
 * `fixword` -> `candidates` when used for multiple candidates
   (`fixword` remains for when it is a correction)

On performance:

Performance-wise, this change moves computation from "checking" time
to "startup" time.  The performance cost does not appear to be
noticeable in my baseline (#3419). However, keep in mind the corpus
weakness regarding the ratio of cased vs. non-cased corrections with
multiple candidates.

The all-lowercase typo is now slightly more expensive (in the original
code, it was passed through `fix_case` and fed directly into the
`print`; in the new code, it will always need a `join`).  There
is still an overweight of lowercase-only corrections in general, so
the unconditional `.join` alone is not sufficient to affect the
performance noticeably.
---
 codespell_lib/_codespell.py   | 31 ++++++++++++++++---------------
 codespell_lib/_text_util.py   | 10 ++++++----
 codespell_lib/spellchecker.py | 12 ++++++++----
 3 files changed, 30 insertions(+), 23 deletions(-)

diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py
index c36a83567d5..48d6d6fd6f6 100644
--- a/codespell_lib/_codespell.py
+++ b/codespell_lib/_codespell.py
@@ -715,10 +715,10 @@ def ask_for_word_fix(
     misspelling: Misspelling,
     interactivity: int,
     colors: TermColors,
-) -> Tuple[bool, str]:
+) -> Tuple[bool, Sequence[str]]:
     wrongword = match.group()
     if interactivity <= 0:
-        return misspelling.fix, fix_case(wrongword, misspelling.data)
+        return misspelling.fix, fix_case(wrongword, misspelling.candidates)
 
     line_ui = (
         f"{line[:match.start()]}"
@@ -728,7 +728,8 @@ def ask_for_word_fix(
 
     if misspelling.fix and interactivity & 1:
         r = ""
-        fixword = fix_case(wrongword, misspelling.data)
+        candidates = fix_case(wrongword, misspelling.candidates)
+        fixword = candidates[0]
         while not r:
             print(f"{line_ui}\t{wrongword} ==> {fixword} (Y/n) ", end="", flush=True)
             r = sys.stdin.readline().strip().upper()
@@ -746,12 +747,12 @@ def ask_for_word_fix(
         # we ask the user which word to use
 
         r = ""
-        opt = [w.strip() for w in misspelling.data.split(",")]
+        opt = misspelling.candidates
         while not r:
             print(f"{line_ui} Choose an option (blank for none): ", end="")
-            for i, o in enumerate(opt):
-                fixword = fix_case(wrongword, o)
-                print(f" {i}) {fixword}", end="")
+            cased_candidates = fix_case(wrongword, opt)
+            for i, candidates in enumerate(cased_candidates):
+                print(f" {i}) {candidates}", end="")
             print(": ", end="", flush=True)
 
             n = sys.stdin.readline().strip()
@@ -766,9 +767,9 @@ def ask_for_word_fix(
 
         if r:
             misspelling.fix = True
-            misspelling.data = r
+            misspelling.candidates = (r,)
 
-    return misspelling.fix, fix_case(wrongword, misspelling.data)
+    return misspelling.fix, fix_case(wrongword, misspelling.candidates)
 
 
 def print_context(
@@ -860,14 +861,14 @@ def parse_file(
                 if lword not in misspellings:
                     continue
                 fix = misspellings[lword].fix
-                fixword = fix_case(word, misspellings[lword].data)
+                candidates = fix_case(word, misspellings[lword].candidates)
 
                 if summary and fix:
                     summary.update(lword)
 
                 cfilename = f"{colors.FILE}{filename}{colors.DISABLE}"
                 cwrongword = f"{colors.WWORD}{word}{colors.DISABLE}"
-                crightword = f"{colors.FWORD}{fixword}{colors.DISABLE}"
+                crightword = f"{colors.FWORD}{', '.join(candidates)}{colors.DISABLE}"
 
                 reason = misspellings[lword].reason
                 if reason:
@@ -957,13 +958,13 @@ def parse_file(
 
                 context_shown = False
                 fix = misspellings[lword].fix
-                fixword = fix_case(word, misspellings[lword].data)
+                candidates = fix_case(word, misspellings[lword].candidates)
 
                 if options.interactive and lword not in asked_for:
                     if context is not None:
                         context_shown = True
                         print_context(lines, i, context)
-                    fix, fixword = ask_for_word_fix(
+                    fix, candidates = ask_for_word_fix(
                         lines[i],
                         match,
                         misspellings[lword],
@@ -980,7 +981,7 @@ def parse_file(
 
                 if options.write_changes and fix:
                     changed = True
-                    lines[i] = re.sub(rf"\b{word}\b", fixword, lines[i])
+                    lines[i] = re.sub(rf"\b{word}\b", candidates[0], lines[i])
                     fixed_words.add(word)
                     continue
 
@@ -995,7 +996,7 @@ def parse_file(
                 cfilename = f"{colors.FILE}{filename}{colors.DISABLE}"
                 cline = f"{colors.FILE}{i + 1}{colors.DISABLE}"
                 cwrongword = f"{colors.WWORD}{word}{colors.DISABLE}"
-                crightword = f"{colors.FWORD}{fixword}{colors.DISABLE}"
+                crightword = f"{colors.FWORD}{', '.join(candidates)}{colors.DISABLE}"
 
                 reason = misspellings[lword].reason
                 if reason:
diff --git a/codespell_lib/_text_util.py b/codespell_lib/_text_util.py
index 18a2ec89b40..c141db503d7 100644
--- a/codespell_lib/_text_util.py
+++ b/codespell_lib/_text_util.py
@@ -16,12 +16,14 @@
 Copyright (C) 2011  ProFUSION embedded systems
 """
 
+from typing import Sequence
 
-def fix_case(word: str, fixword: str) -> str:
+
+def fix_case(word: str, candidates: Sequence[str]) -> Sequence[str]:
     if word == word.capitalize():
-        return ", ".join(w.strip().capitalize() for w in fixword.split(","))
+        return tuple(c.capitalize() for c in candidates)
     if word == word.upper():
-        return fixword.upper()
+        return tuple(c.upper() for c in candidates)
     # they are both lower case
     # or we don't have any idea
-    return fixword
+    return candidates
diff --git a/codespell_lib/spellchecker.py b/codespell_lib/spellchecker.py
index a03eeeba0e2..975cc173d53 100644
--- a/codespell_lib/spellchecker.py
+++ b/codespell_lib/spellchecker.py
@@ -15,7 +15,7 @@
 Copyright (C) 2010-2011  Lucas De Marchi <lucas.de.marchi@gmail.com>
 Copyright (C) 2011  ProFUSION embedded systems
 """
-from typing import Dict, Set
+from typing import Dict, Set, Sequence
 
 # Pass all misspellings through this translation table to generate
 # alternative misspellings and fixes.
@@ -23,8 +23,8 @@
 
 
 class Misspelling:
-    def __init__(self, data: str, fix: bool, reason: str) -> None:
-        self.data = data
+    def __init__(self, candidates: Sequence[str], fix: bool, reason: str) -> None:
+        self.candidates = candidates
         self.fix = fix
         self.reason = reason
 
@@ -44,7 +44,11 @@ def add_misspelling(
         fix = True
         reason = ""
 
-    misspellings[key] = Misspelling(data, fix, reason)
+    misspellings[key] = Misspelling(
+        tuple(c.strip() for c in data.split(",")),
+        fix,
+        reason,
+    )
 
 
 def build_dict(