codespell-project · nthykier · May 17, 2024 · May 17, 2024 · May 17, 2024 · May 17, 2024
@@ -39,7 +39,7 @@
     Tuple,
 )
 
-from ._spellchecker import Misspelling, build_dict
+from ._spellchecker import DetectedMisspelling, LineTokenizer, Spellchecker
 from ._text_util import fix_case
 
 # autogenerated by setuptools_scm
@@ -55,7 +55,6 @@
     "(\\b(?:https?|[ts]?ftp|file|git|smb)://[^\\s]+(?=$|\\s)|"
     "\\b[\\w.%+-]+@[\\w.-]+\\b)"
 )
-inline_ignore_regex = re.compile(r"[^\w\s]\s?codespell:ignore\b(\s+(?P<words>[\w,]*))?")
 USAGE = """
 \t%prog [OPTIONS] [file1 file2 ... fileN]
 """
@@ -712,14 +711,16 @@
 
 def ask_for_word_fix(
     line: str,
-    match: Match[str],
-    misspelling: Misspelling,
+    issue: "DetectedMisspelling[re.Match[str]]",
     interactivity: int,
     colors: TermColors,
-) -> Tuple[bool, str]:
-    wrongword = match.group()
+) -> Tuple[bool, Sequence[str]]:
+    wrongword = issue.word
+    misspelling = issue.misspelling
     if interactivity <= 0:
-        return misspelling.fix, fix_case(wrongword, misspelling.data)
+        return misspelling.fix, fix_case(wrongword, misspelling.candidates)
+
+    match = issue.token
 
     line_ui = (
         f"{line[:match.start()]}"
@@ -729,7 +730,8 @@
 
     if misspelling.fix and interactivity & 1:
         r = ""
-        fixword = fix_case(wrongword, misspelling.data)
+        candidates = fix_case(wrongword, misspelling.candidates)
+        fixword = candidates[0]
         while not r:
             print(f"{line_ui}\t{wrongword} ==> {fixword} (Y/n) ", end="", flush=True)
             r = sys.stdin.readline().strip().upper()
@@ -747,12 +749,12 @@
         # we ask the user which word to use
 
         r = ""
-        opt = [w.strip() for w in misspelling.data.split(",")]
+        opt = misspelling.candidates
         while not r:
             print(f"{line_ui} Choose an option (blank for none): ", end="")
-            for i, o in enumerate(opt):
-                fixword = fix_case(wrongword, o)
-                print(f" {i}) {fixword}", end="")
+            cased_candidates = fix_case(wrongword, opt)
+            for i, candidates in enumerate(cased_candidates):
+                print(f" {i}) {candidates}", end="")
             print(": ", end="", flush=True)
 
             n = sys.stdin.readline().strip()
@@ -767,9 +769,9 @@
 
         if r:
             misspelling.fix = True
-            misspelling.data = r
+            misspelling.candidates = (r,)
 
-    return misspelling.fix, fix_case(wrongword, misspelling.data)
+    return misspelling.fix, fix_case(wrongword, misspelling.candidates)
 
 
 def print_context(
@@ -829,12 +831,57 @@
     return check_matches
 
 
+def line_tokenizer_factory(
+    uri_ignore_words: Set[str],
+    uri_regex: Pattern[str],
+    word_regex: Pattern[str],
+    ignore_word_regex: Optional[Pattern[str]],
+) -> "LineTokenizer[re.Match[str]]":
+    """Return a tokenizer yielding the word matches to check in one line.
+
+    ``uri_ignore_words`` holds the words to ignore inside URIs; the special
+    entry ``"*"`` means every URI is erased before words are extracted.
+    ``uri_regex`` locates URIs, ``word_regex`` extracts candidate words and
+    ``ignore_word_regex`` (optional) is forwarded to the extraction helpers.
+
+    URIs are erased with same-length padding, so erasing them does not shift
+    the offsets of the matches that follow.
+    """
+    # The wildcard test does not depend on the line, so evaluate it once here
+    # instead of on every call of the tokenizer.
+    ignore_all_uris = "*" in uri_ignore_words
+
+    def blank_out(uri_match: Match[str]) -> str:
+        # Pad with as many spaces as the URI had characters so that the
+        # offsets of later matches still index into the original line
+        # (ask_for_word_fix slices the line with match.start()/match.end()).
+        return " " * len(uri_match.group())
+
+    def line_tokenizer(line: str) -> Iterable[Match[str]]:
+        # If all URI spelling errors will be ignored, erase any URI before
+        # extracting words. Otherwise, apply ignores after extracting words.
+        # This ensures that if a URI ignore word occurs both inside a URI and
+        # outside, it will still be a spelling error.
+        if ignore_all_uris:
+            erased = uri_regex.sub(blank_out, line)
+            return extract_words_iter(erased, word_regex, ignore_word_regex)
+        return apply_uri_ignore_words(
+            extract_words_iter(line, word_regex, ignore_word_regex),
+            line,
+            word_regex,
+            ignore_word_regex,
+            uri_regex,
+            uri_ignore_words,
+        )
+
+    return line_tokenizer
+
+
 def parse_file(
     filename: str,
     colors: TermColors,
     summary: Optional[Summary],
-    misspellings: Dict[str, Misspelling],
-    ignore_words_cased: Set[str],
+    spellchecker: Spellchecker,
     exclude_lines: Set[str],
     file_opener: FileOpener,
     word_regex: Pattern[str],
@@ -855,22 +884,23 @@
     else:
         if options.check_filenames:
             for word in extract_words(filename, word_regex, ignore_word_regex):
-                if word in ignore_words_cased:
+                if word in spellchecker.ignore_words_cased:
                     continue
                 lword = word.lower()
-                if lword not in misspellings:
+                misspelling = spellchecker.check_lower_cased_word(lword)
+                if misspelling is None:
                     continue
-                fix = misspellings[lword].fix
-                fixword = fix_case(word, misspellings[lword].data)
+                fix = misspelling.fix
+                candidates = fix_case(word, misspelling.candidates)
 
                 if summary and fix:
                     summary.update(lword)
 
                 cfilename = f"{colors.FILE}{filename}{colors.DISABLE}"
                 cwrongword = f"{colors.WWORD}{word}{colors.DISABLE}"
-                crightword = f"{colors.FWORD}{fixword}{colors.DISABLE}"
+                crightword = f"{colors.FWORD}{', '.join(candidates)}{colors.DISABLE}"
 
-                reason = misspellings[lword].reason
+                reason = misspelling.reason
                 if reason:
                     if options.quiet_level & QuietLevels.DISABLED_FIXES:
                         continue
@@ -905,127 +935,90 @@
         except OSError:
             return bad_count
 
+    line_tokenizer = line_tokenizer_factory(
+        uri_ignore_words,
+        uri_regex,
+        word_regex,
+        ignore_word_regex,
+    )
+
     for i, line in enumerate(lines):
-        if line.rstrip() in exclude_lines:
+        line = line.rstrip()
+        if not line or line in exclude_lines:
             continue
 
-        extra_words_to_ignore = set()
-        match = inline_ignore_regex.search(line)
-        if match:
-            extra_words_to_ignore = set(
-                filter(None, (match.group("words") or "").split(","))
-            )
-            if not extra_words_to_ignore:
-                continue
-
         fixed_words = set()
         asked_for = set()
 
-        # If all URI spelling errors will be ignored, erase any URI before
-        # extracting words. Otherwise, apply ignores after extracting words.
-        # This ensures that if a URI ignore word occurs both inside a URI and
-        # outside, it will still be a spelling error.
-        if "*" in uri_ignore_words:
-            line = uri_regex.sub(" ", line)
-        check_matches = extract_words_iter(line, word_regex, ignore_word_regex)
-        if "*" not in uri_ignore_words:
-            check_matches = apply_uri_ignore_words(
-                check_matches,
-                line,
-                word_regex,
-                ignore_word_regex,
-                uri_regex,
-                uri_ignore_words,
-            )
-        for match in check_matches:
-            word = match.group()
-            if word in ignore_words_cased:
-                continue
-            lword = word.lower()
-            if lword in misspellings and lword not in extra_words_to_ignore:
-                # Sometimes we find a 'misspelling' which is actually a valid word
-                # preceded by a string escape sequence.  Ignore such cases as
-                # they're usually false alarms; see issue #17 among others.
-                char_before_idx = match.start() - 1
-                if (
-                    char_before_idx >= 0
-                    and line[char_before_idx] == "\\"
-                    # bell, backspace, formfeed, newline, carriage-return, tab, vtab.
-                    and word.startswith(("a", "b", "f", "n", "r", "t", "v"))
-                    and lword[1:] not in misspellings
-                ):
-                    continue
+        for issue in spellchecker.spellcheck_line(line, line_tokenizer):
+            misspelling = issue.misspelling
+            word = issue.word
+            lword = issue.lword
 
-                context_shown = False
-                fix = misspellings[lword].fix
-                fixword = fix_case(word, misspellings[lword].data)
-
-                if options.interactive and lword not in asked_for:
-                    if context is not None:
-                        context_shown = True
-                        print_context(lines, i, context)
-                    fix, fixword = ask_for_word_fix(
-                        lines[i],
-                        match,
-                        misspellings[lword],
-                        options.interactive,
-                        colors=colors,
-                    )
-                    asked_for.add(lword)
+            context_shown = False
+            fix = misspelling.fix
+            candidates = fix_case(word, misspelling.candidates)
 
-                if summary and fix:
-                    summary.update(lword)
+            if options.interactive and lword not in asked_for:
+                if context is not None:
+                    context_shown = True
+                    print_context(lines, i, context)
+                fix, candidates = ask_for_word_fix(
+                    lines[i],
+                    issue,
+                    options.interactive,
+                    colors=colors,
+                )
+                asked_for.add(lword)
 
-                if word in fixed_words:  # can skip because of re.sub below
-                    continue
+            if summary and fix:
+                summary.update(lword)
 
-                if options.write_changes and fix:
-                    changed = True
-                    lines[i] = re.sub(rf"\b{word}\b", fixword, lines[i])
-                    fixed_words.add(word)
-                    continue
+            if word in fixed_words:  # can skip because of re.sub below
+                continue
 
-                # otherwise warning was explicitly set by interactive mode
-                if (
-                    options.interactive & 2
-                    and not fix
-                    and not misspellings[lword].reason
-                ):
-                    continue
+            if options.write_changes and fix:
+                changed = True
+                lines[i] = re.sub(rf"\b{word}\b", candidates[0], lines[i])
+                fixed_words.add(word)
+                continue
 
-                cfilename = f"{colors.FILE}{filename}{colors.DISABLE}"
-                cline = f"{colors.FILE}{i + 1}{colors.DISABLE}"
-                cwrongword = f"{colors.WWORD}{word}{colors.DISABLE}"
-                crightword = f"{colors.FWORD}{fixword}{colors.DISABLE}"
+            # otherwise warning was explicitly set by interactive mode
+            if options.interactive & 2 and not fix and not misspelling.reason:
+                continue
 
-                reason = misspellings[lword].reason
-                if reason:
-                    if options.quiet_level & QuietLevels.DISABLED_FIXES:
-                        continue
-                    creason = f"  | {colors.FILE}{reason}{colors.DISABLE}"
-                else:
-                    if options.quiet_level & QuietLevels.NON_AUTOMATIC_FIXES:
-                        continue
-                    creason = ""
+            cfilename = f"{colors.FILE}{filename}{colors.DISABLE}"
+            cline = f"{colors.FILE}{i + 1}{colors.DISABLE}"
+            cwrongword = f"{colors.WWORD}{word}{colors.DISABLE}"
+            crightword = f"{colors.FWORD}{', '.join(candidates)}{colors.DISABLE}"
 
-                # If we get to this point (uncorrected error) we should change
-                # our bad_count and thus return value
-                bad_count += 1
+            reason = misspelling.reason
+            if reason:
+                if options.quiet_level & QuietLevels.DISABLED_FIXES:
+                    continue
+                creason = f"  | {colors.FILE}{reason}{colors.DISABLE}"
+            else:
+                if options.quiet_level & QuietLevels.NON_AUTOMATIC_FIXES:
+                    continue
+                creason = ""
 
-                if (not context_shown) and (context is not None):
-                    print_context(lines, i, context)
-                if filename != "-":
-                    print(
-                        f"{cfilename}:{cline}: {cwrongword} "
-                        f"==> {crightword}{creason}"
-                    )
-                elif options.stdin_single_line:
-                    print(f"{cline}: {cwrongword} ==> {crightword}{creason}")
-                else:
-                    print(
-                        f"{cline}: {line.strip()}\n\t{cwrongword} "
-                        f"==> {crightword}{creason}"
-                    )
+            # If we get to this point (uncorrected error) we should change
+            # our bad_count and thus return value
+            bad_count += 1
+
+            if (not context_shown) and (context is not None):
+                print_context(lines, i, context)
+            if filename != "-":
+                print(
+                    f"{cfilename}:{cline}: {cwrongword} " f"==> {crightword}{creason}"
+                )
+            elif options.stdin_single_line:
+                print(f"{cline}: {cwrongword} ==> {crightword}{creason}")
+            else:
+                print(
+                    f"{cline}: {line.strip()}\n\t{cwrongword} "
+                    f"==> {crightword}{creason}"
+                )
 
     if changed:
         if filename == "-":
@@ -1170,9 +1163,10 @@
                 parser.print_help()
                 return EX_USAGE
             use_dictionaries.append(dictionary)
-    misspellings: Dict[str, Misspelling] = {}
+    spellchecker = Spellchecker()
+    spellchecker.ignore_words_cased = ignore_words_cased
     for dictionary in use_dictionaries:
-        build_dict(dictionary, misspellings, ignore_words)
+        spellchecker.add_from_file(dictionary, ignore_words=ignore_words)
     colors = TermColors()
     if not options.colors:
         colors.disable()
@@ -1247,8 +1241,7 @@
                         fname,
                         colors,
                         summary,
-                        misspellings,
-                        ignore_words_cased,
+                        spellchecker,
                         exclude_lines,
                         file_opener,
                         word_regex,
@@ -1272,8 +1265,7 @@
                 filename,
                 colors,
                 summary,
-                misspellings,
-                ignore_words_cased,
+                spellchecker,
                 exclude_lines,
                 file_opener,
                 word_regex,