Skip to content

Commit

Permalink
Replace data: str with candidates: Sequence[str]
Browse files Browse the repository at this point in the history
Previously, when the spelling dictionaries were loaded, the correction
line was simply stored in memory as plain text. Throughout the code,
callers then had to deal with the `data` attribute and correctly
`split()` + `strip()` it. With this change, the dictionary parsing
code now encapsulates this problem.

The auto-correction works from the assumption that there is only one
candidate. This assumption is an invariant and seems to be properly
maintained in the code. Therefore, we can just pick the first
candidate word when doing a correction.

In the code, the following name changes are performed:

 * `Misspelling.data` -> `Misspelling.candidates`
 * `fixword` -> `candidates` when used for multiple candidates
   (`fixword` remains for when it is a correction)

On performance:

Performance-wise, this change moves computation from "checking" time
to "startup" time.  The performance cost does not appear to be
noticeable in my baseline (codespell-project#3419). However, keep in mind the
corpus's weakness regarding the ratio of cased vs. non-cased
corrections with multiple candidates.

The all-lowercase typo is now slightly more expensive (in the original
code, it was passed through `fix_case` and fed directly into the
`print`; in the new code, it will always need a `join`).  There is
still a preponderance of lowercase-only corrections in general, so
the unconditional `.join` alone is not sufficient to affect the
performance noticeably.
  • Loading branch information
nthykier committed May 17, 2024
1 parent e226a71 commit ecad6d3
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 23 deletions.
31 changes: 16 additions & 15 deletions codespell_lib/_codespell.py
Original file line number Diff line number Diff line change
Expand Up @@ -715,10 +715,10 @@ def ask_for_word_fix(
misspelling: Misspelling,
interactivity: int,
colors: TermColors,
) -> Tuple[bool, str]:
) -> Tuple[bool, Sequence[str]]:
wrongword = match.group()
if interactivity <= 0:
return misspelling.fix, fix_case(wrongword, misspelling.data)
return misspelling.fix, fix_case(wrongword, misspelling.candidates)

line_ui = (
f"{line[:match.start()]}"
Expand All @@ -728,7 +728,8 @@ def ask_for_word_fix(

if misspelling.fix and interactivity & 1:
r = ""
fixword = fix_case(wrongword, misspelling.data)
candidates = fix_case(wrongword, misspelling.candidates)
fixword = candidates[0]
while not r:
print(f"{line_ui}\t{wrongword} ==> {fixword} (Y/n) ", end="", flush=True)
r = sys.stdin.readline().strip().upper()
Expand All @@ -746,12 +747,12 @@ def ask_for_word_fix(
# we ask the user which word to use

r = ""
opt = [w.strip() for w in misspelling.data.split(",")]
opt = misspelling.candidates
while not r:
print(f"{line_ui} Choose an option (blank for none): ", end="")
for i, o in enumerate(opt):
fixword = fix_case(wrongword, o)
print(f" {i}) {fixword}", end="")
cased_candidates = fix_case(wrongword, opt)
for i, candidates in enumerate(cased_candidates):
print(f" {i}) {candidates}", end="")
print(": ", end="", flush=True)

n = sys.stdin.readline().strip()
Expand All @@ -766,9 +767,9 @@ def ask_for_word_fix(

if r:
misspelling.fix = True
misspelling.data = r
misspelling.candidates = (r,)

return misspelling.fix, fix_case(wrongword, misspelling.data)
return misspelling.fix, fix_case(wrongword, misspelling.candidates)


def print_context(
Expand Down Expand Up @@ -860,14 +861,14 @@ def parse_file(
if lword not in misspellings:
continue
fix = misspellings[lword].fix
fixword = fix_case(word, misspellings[lword].data)
candidates = fix_case(word, misspellings[lword].candidates)

if summary and fix:
summary.update(lword)

cfilename = f"{colors.FILE}{filename}{colors.DISABLE}"
cwrongword = f"{colors.WWORD}{word}{colors.DISABLE}"
crightword = f"{colors.FWORD}{fixword}{colors.DISABLE}"
crightword = f"{colors.FWORD}{', '.join(candidates)}{colors.DISABLE}"

reason = misspellings[lword].reason
if reason:
Expand Down Expand Up @@ -957,13 +958,13 @@ def parse_file(

context_shown = False
fix = misspellings[lword].fix
fixword = fix_case(word, misspellings[lword].data)
candidates = fix_case(word, misspellings[lword].candidates)

if options.interactive and lword not in asked_for:
if context is not None:
context_shown = True
print_context(lines, i, context)
fix, fixword = ask_for_word_fix(
fix, candidates = ask_for_word_fix(
lines[i],
match,
misspellings[lword],
Expand All @@ -980,7 +981,7 @@ def parse_file(

if options.write_changes and fix:
changed = True
lines[i] = re.sub(rf"\b{word}\b", fixword, lines[i])
lines[i] = re.sub(rf"\b{word}\b", candidates[0], lines[i])
fixed_words.add(word)
continue

Expand All @@ -995,7 +996,7 @@ def parse_file(
cfilename = f"{colors.FILE}{filename}{colors.DISABLE}"
cline = f"{colors.FILE}{i + 1}{colors.DISABLE}"
cwrongword = f"{colors.WWORD}{word}{colors.DISABLE}"
crightword = f"{colors.FWORD}{fixword}{colors.DISABLE}"
crightword = f"{colors.FWORD}{', '.join(candidates)}{colors.DISABLE}"

reason = misspellings[lword].reason
if reason:
Expand Down
9 changes: 5 additions & 4 deletions codespell_lib/_text_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,14 @@
Copyright (C) 2010-2011 Lucas De Marchi <[email protected]>
Copyright (C) 2011 ProFUSION embedded systems
"""
from typing import Sequence


def fix_case(word: str, fixword: str) -> str:
def fix_case(word: str, candidates: Sequence[str]) -> Sequence[str]:
if word == word.capitalize():
return ", ".join(w.strip().capitalize() for w in fixword.split(","))
return tuple(c.capitalize() for c in candidates)
if word == word.upper():
return fixword.upper()
return tuple(c.upper() for c in candidates)
# they are both lower case
# or we don't have any idea
return fixword
return candidates
12 changes: 8 additions & 4 deletions codespell_lib/spellchecker.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,16 @@
Copyright (C) 2010-2011 Lucas De Marchi <[email protected]>
Copyright (C) 2011 ProFUSION embedded systems
"""
from typing import Dict, Set
from typing import Dict, Set, Sequence

# Pass all misspellings through this translation table to generate
# alternative misspellings and fixes.
alt_chars = (("'", "’"),) # noqa: RUF001


class Misspelling:
def __init__(self, data: str, fix: bool, reason: str) -> None:
self.data = data
def __init__(self, candidates: Sequence[str], fix: bool, reason: str) -> None:
self.candidates = candidates
self.fix = fix
self.reason = reason

Expand All @@ -44,7 +44,11 @@ def add_misspelling(
fix = True
reason = ""

misspellings[key] = Misspelling(data, fix, reason)
misspellings[key] = Misspelling(
tuple(c.strip() for c in data.split(",")),
fix,
reason,
)


def build_dict(
Expand Down

0 comments on commit ecad6d3

Please sign in to comment.