diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 64db267776..5598e6b11d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -58,7 +58,7 @@ repos: - -d - "{extends: relaxed, rules: {line-length: {max: 90}}}" - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.4.4 + rev: v0.4.5 hooks: - id: ruff - id: ruff-format @@ -68,7 +68,7 @@ repos: - id: prettier types_or: [yaml, markdown, html, css, scss, javascript, json] - repo: https://github.com/codespell-project/codespell - rev: v2.2.6 + rev: v2.3.0 hooks: - id: codespell args: [--toml, pyproject-codespell.precommit-toml] diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index 62a51b75b3..2ea9764e63 100644 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -39,6 +39,9 @@ Tuple, ) +from ._spellchecker import DetectedMisspelling, LineTokenizer, Spellchecker +from ._text_util import fix_case + # autogenerated by setuptools_scm from ._version import ( # type: ignore[import-not-found] __version__ as VERSION, # noqa: N812 @@ -52,10 +55,6 @@ "(\\b(?:https?|[ts]?ftp|file|git|smb)://[^\\s]+(?=$|\\s)|" "\\b[\\w.%+-]+@[\\w.-]+\\b)" ) -# Pass all misspellings through this translation table to generate -# alternative misspellings and fixes. -alt_chars = (("'", "’"),) # noqa: RUF001 -inline_ignore_regex = re.compile(r"[^\w\s]\s?codespell:ignore\b(\s+(?P<words>[\w,]*))?") USAGE = """ \t%prog [OPTIONS] [file1 file2 ... 
fileN] """ @@ -167,13 +166,6 @@ def match(self, filename: str) -> bool: return any(fnmatch.fnmatch(filename, p) for p in self.pattern_list) -class Misspelling: - def __init__(self, data: str, fix: bool, reason: str) -> None: - self.data = data - self.fix = fix - self.reason = reason - - class TermColors: def __init__(self) -> None: self.FILE = "\033[33m" @@ -703,48 +695,6 @@ def build_ignore_words( ) -def add_misspelling( - key: str, - data: str, - misspellings: Dict[str, Misspelling], -) -> None: - data = data.strip() - - if "," in data: - fix = False - data, reason = data.rsplit(",", 1) - reason = reason.lstrip() - else: - fix = True - reason = "" - - misspellings[key] = Misspelling(data, fix, reason) - - -def build_dict( - filename: str, - misspellings: Dict[str, Misspelling], - ignore_words: Set[str], -) -> None: - with open(filename, encoding="utf-8") as f: - translate_tables = [(x, str.maketrans(x, y)) for x, y in alt_chars] - for line in f: - [key, data] = line.split("->") - # TODO: For now, convert both to lower. - # Someday we can maybe add support for fixing caps. 
- key = key.lower() - data = data.lower() - if key not in ignore_words: - add_misspelling(key, data, misspellings) - # generate alternative misspellings/fixes - for x, table in translate_tables: - if x in key: - alt_key = key.translate(table) - alt_data = data.translate(table) - if alt_key not in ignore_words: - add_misspelling(alt_key, alt_data, misspellings) - - def is_hidden(filename: str, check_hidden: bool) -> bool: bfilename = os.path.basename(filename) @@ -759,26 +709,18 @@ def is_text_file(filename: str) -> bool: return b"\x00" not in s -def fix_case(word: str, fixword: str) -> str: - if word == word.capitalize(): - return ", ".join(w.strip().capitalize() for w in fixword.split(",")) - if word == word.upper(): - return fixword.upper() - # they are both lower case - # or we don't have any idea - return fixword - - def ask_for_word_fix( line: str, - match: Match[str], - misspelling: Misspelling, + issue: "DetectedMisspelling[re.Match[str]]", interactivity: int, colors: TermColors, -) -> Tuple[bool, str]: - wrongword = match.group() +) -> Tuple[bool, Sequence[str]]: + wrongword = issue.word + misspelling = issue.misspelling if interactivity <= 0: - return misspelling.fix, fix_case(wrongword, misspelling.data) + return misspelling.fix, fix_case(wrongword, misspelling.candidates) + + match = issue.token line_ui = ( f"{line[:match.start()]}" @@ -788,7 +730,8 @@ def ask_for_word_fix( if misspelling.fix and interactivity & 1: r = "" - fixword = fix_case(wrongword, misspelling.data) + candidates = fix_case(wrongword, misspelling.candidates) + fixword = candidates[0] while not r: print(f"{line_ui}\t{wrongword} ==> {fixword} (Y/n) ", end="", flush=True) r = sys.stdin.readline().strip().upper() @@ -806,12 +749,12 @@ def ask_for_word_fix( # we ask the user which word to use r = "" - opt = [w.strip() for w in misspelling.data.split(",")] + opt = misspelling.candidates while not r: print(f"{line_ui} Choose an option (blank for none): ", end="") - for i, o in 
enumerate(opt): - fixword = fix_case(wrongword, o) - print(f" {i}) {fixword}", end="") + cased_candidates = fix_case(wrongword, opt) + for i, candidates in enumerate(cased_candidates): + print(f" {i}) {candidates}", end="") print(": ", end="", flush=True) n = sys.stdin.readline().strip() @@ -826,9 +769,9 @@ def ask_for_word_fix( if r: misspelling.fix = True - misspelling.data = r + misspelling.candidates = (r,) - return misspelling.fix, fix_case(wrongword, misspelling.data) + return misspelling.fix, fix_case(wrongword, misspelling.candidates) def print_context( @@ -888,12 +831,39 @@ def apply_uri_ignore_words( return check_matches +def line_tokenizer_factory( + uri_ignore_words: Set[str], + uri_regex: Pattern[str], + word_regex: Pattern[str], + ignore_word_regex: Optional[Pattern[str]], +) -> "LineTokenizer[re.Match[str]]": + def line_tokenizer(line: str) -> Iterable[Match[str]]: + # If all URI spelling errors will be ignored, erase any URI before + # extracting words. Otherwise, apply ignores after extracting words. + # This ensures that if a URI ignore word occurs both inside a URI and + # outside, it will still be a spelling error. 
+ if "*" in uri_ignore_words: + line = uri_regex.sub(" ", line) + check_matches = extract_words_iter(line, word_regex, ignore_word_regex) + if "*" not in uri_ignore_words: + check_matches = apply_uri_ignore_words( + check_matches, + line, + word_regex, + ignore_word_regex, + uri_regex, + uri_ignore_words, + ) + return check_matches + + return line_tokenizer + + def parse_file( filename: str, colors: TermColors, summary: Optional[Summary], - misspellings: Dict[str, Misspelling], - ignore_words_cased: Set[str], + spellchecker: Spellchecker, exclude_lines: Set[str], file_opener: FileOpener, word_regex: Pattern[str], @@ -914,22 +884,23 @@ def parse_file( else: if options.check_filenames: for word in extract_words(filename, word_regex, ignore_word_regex): - if word in ignore_words_cased: + if word in spellchecker.ignore_words_cased: continue lword = word.lower() - if lword not in misspellings: + misspelling = spellchecker.check_lower_cased_word(lword) + if misspelling is None: continue - fix = misspellings[lword].fix - fixword = fix_case(word, misspellings[lword].data) + fix = misspelling.fix + candidates = fix_case(word, misspelling.candidates) if summary and fix: summary.update(lword) cfilename = f"{colors.FILE}{filename}{colors.DISABLE}" cwrongword = f"{colors.WWORD}{word}{colors.DISABLE}" - crightword = f"{colors.FWORD}{fixword}{colors.DISABLE}" + crightword = f"{colors.FWORD}{', '.join(candidates)}{colors.DISABLE}" - reason = misspellings[lword].reason + reason = misspelling.reason if reason: if options.quiet_level & QuietLevels.DISABLED_FIXES: continue @@ -964,127 +935,90 @@ def parse_file( except OSError: return bad_count + line_tokenizer = line_tokenizer_factory( + uri_ignore_words, + uri_regex, + word_regex, + ignore_word_regex, + ) + for i, line in enumerate(lines): - if line.rstrip() in exclude_lines: + line = line.rstrip() + if not line or line in exclude_lines: continue - extra_words_to_ignore = set() - match = inline_ignore_regex.search(line) - if match: - 
extra_words_to_ignore = set( - filter(None, (match.group("words") or "").split(",")) - ) - if not extra_words_to_ignore: - continue - fixed_words = set() asked_for = set() - # If all URI spelling errors will be ignored, erase any URI before - # extracting words. Otherwise, apply ignores after extracting words. - # This ensures that if a URI ignore word occurs both inside a URI and - # outside, it will still be a spelling error. - if "*" in uri_ignore_words: - line = uri_regex.sub(" ", line) - check_matches = extract_words_iter(line, word_regex, ignore_word_regex) - if "*" not in uri_ignore_words: - check_matches = apply_uri_ignore_words( - check_matches, - line, - word_regex, - ignore_word_regex, - uri_regex, - uri_ignore_words, - ) - for match in check_matches: - word = match.group() - if word in ignore_words_cased: - continue - lword = word.lower() - if lword in misspellings and lword not in extra_words_to_ignore: - # Sometimes we find a 'misspelling' which is actually a valid word - # preceded by a string escape sequence. Ignore such cases as - # they're usually false alarms; see issue #17 among others. - char_before_idx = match.start() - 1 - if ( - char_before_idx >= 0 - and line[char_before_idx] == "\\" - # bell, backspace, formfeed, newline, carriage-return, tab, vtab. 
- and word.startswith(("a", "b", "f", "n", "r", "t", "v")) - and lword[1:] not in misspellings - ): - continue + for issue in spellchecker.spellcheck_line(line, line_tokenizer): + misspelling = issue.misspelling + word = issue.word + lword = issue.lword - context_shown = False - fix = misspellings[lword].fix - fixword = fix_case(word, misspellings[lword].data) - - if options.interactive and lword not in asked_for: - if context is not None: - context_shown = True - print_context(lines, i, context) - fix, fixword = ask_for_word_fix( - lines[i], - match, - misspellings[lword], - options.interactive, - colors=colors, - ) - asked_for.add(lword) + context_shown = False + fix = misspelling.fix + candidates = fix_case(word, misspelling.candidates) - if summary and fix: - summary.update(lword) + if options.interactive and lword not in asked_for: + if context is not None: + context_shown = True + print_context(lines, i, context) + fix, candidates = ask_for_word_fix( + lines[i], + issue, + options.interactive, + colors=colors, + ) + asked_for.add(lword) - if word in fixed_words: # can skip because of re.sub below - continue + if summary and fix: + summary.update(lword) - if options.write_changes and fix: - changed = True - lines[i] = re.sub(rf"\b{word}\b", fixword, lines[i]) - fixed_words.add(word) - continue + if word in fixed_words: # can skip because of re.sub below + continue - # otherwise warning was explicitly set by interactive mode - if ( - options.interactive & 2 - and not fix - and not misspellings[lword].reason - ): - continue + if options.write_changes and fix: + changed = True + lines[i] = re.sub(rf"\b{word}\b", candidates[0], lines[i]) + fixed_words.add(word) + continue - cfilename = f"{colors.FILE}{filename}{colors.DISABLE}" - cline = f"{colors.FILE}{i + 1}{colors.DISABLE}" - cwrongword = f"{colors.WWORD}{word}{colors.DISABLE}" - crightword = f"{colors.FWORD}{fixword}{colors.DISABLE}" + # otherwise warning was explicitly set by interactive mode + if 
options.interactive & 2 and not fix and not misspelling.reason: + continue - reason = misspellings[lword].reason - if reason: - if options.quiet_level & QuietLevels.DISABLED_FIXES: - continue - creason = f" | {colors.FILE}{reason}{colors.DISABLE}" - else: - if options.quiet_level & QuietLevels.NON_AUTOMATIC_FIXES: - continue - creason = "" + cfilename = f"{colors.FILE}{filename}{colors.DISABLE}" + cline = f"{colors.FILE}{i + 1}{colors.DISABLE}" + cwrongword = f"{colors.WWORD}{word}{colors.DISABLE}" + crightword = f"{colors.FWORD}{', '.join(candidates)}{colors.DISABLE}" - # If we get to this point (uncorrected error) we should change - # our bad_count and thus return value - bad_count += 1 + reason = misspelling.reason + if reason: + if options.quiet_level & QuietLevels.DISABLED_FIXES: + continue + creason = f" | {colors.FILE}{reason}{colors.DISABLE}" + else: + if options.quiet_level & QuietLevels.NON_AUTOMATIC_FIXES: + continue + creason = "" - if (not context_shown) and (context is not None): - print_context(lines, i, context) - if filename != "-": - print( - f"{cfilename}:{cline}: {cwrongword} " - f"==> {crightword}{creason}" - ) - elif options.stdin_single_line: - print(f"{cline}: {cwrongword} ==> {crightword}{creason}") - else: - print( - f"{cline}: {line.strip()}\n\t{cwrongword} " - f"==> {crightword}{creason}" - ) + # If we get to this point (uncorrected error) we should change + # our bad_count and thus return value + bad_count += 1 + + if (not context_shown) and (context is not None): + print_context(lines, i, context) + if filename != "-": + print( + f"{cfilename}:{cline}: {cwrongword} " f"==> {crightword}{creason}" + ) + elif options.stdin_single_line: + print(f"{cline}: {cwrongword} ==> {crightword}{creason}") + else: + print( + f"{cline}: {line.strip()}\n\t{cwrongword} " + f"==> {crightword}{creason}" + ) if changed: if filename == "-": @@ -1229,9 +1163,10 @@ def main(*args: str) -> int: parser.print_help() return EX_USAGE 
use_dictionaries.append(dictionary) - misspellings: Dict[str, Misspelling] = {} + spellchecker = Spellchecker() + spellchecker.ignore_words_cased = ignore_words_cased for dictionary in use_dictionaries: - build_dict(dictionary, misspellings, ignore_words) + spellchecker.add_from_file(dictionary, ignore_words=ignore_words) colors = TermColors() if not options.colors: colors.disable() @@ -1306,8 +1241,7 @@ def main(*args: str) -> int: fname, colors, summary, - misspellings, - ignore_words_cased, + spellchecker, exclude_lines, file_opener, word_regex, @@ -1331,8 +1265,7 @@ def main(*args: str) -> int: filename, colors, summary, - misspellings, - ignore_words_cased, + spellchecker, exclude_lines, file_opener, word_regex, diff --git a/codespell_lib/_spellchecker.py b/codespell_lib/_spellchecker.py new file mode 100644 index 0000000000..ac43074798 --- /dev/null +++ b/codespell_lib/_spellchecker.py @@ -0,0 +1,298 @@ +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; version 2 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, see +# https://www.gnu.org/licenses/old-licenses/gpl-2.0.html. +""" +Copyright (C) 2010-2011 Lucas De Marchi +Copyright (C) 2011 ProFUSION embedded systems +""" + +import os +import re +from typing import ( + Container, + Dict, + FrozenSet, + Generic, + Iterable, + Optional, + Protocol, + Sequence, + TypeVar, +) + +# Pass all misspellings through this translation table to generate +# alternative misspellings and fixes. 
+alt_chars = (("'", "’"),) # noqa: RUF001 + +T_co = TypeVar("T_co", bound="Token", covariant=True) + + +supported_languages_en = ("en", "en_GB", "en_US", "en_CA", "en_AU") +supported_languages = supported_languages_en + +# Users might want to link this file into /usr/local/bin, so we resolve the +# symbolic link path to the real path if necessary. +_data_root = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data") +_builtin_dictionaries = ( + # name, desc, name, err in aspell, correction in aspell, \ + # err dictionary array, rep dictionary array + # The arrays must contain the names of aspell dictionaries + # The aspell tests here aren't the ideal state, but the None's are + # realistic for obscure words + ("clear", "for unambiguous errors", "", False, None, supported_languages_en, None), + ( + "rare", + "for rare (but valid) words that are likely to be errors", + "_rare", + None, + None, + None, + None, + ), + ( + "informal", + "for making informal words more formal", + "_informal", + True, + True, + supported_languages_en, + supported_languages_en, + ), + ( + "usage", + "for replacing phrasing with recommended terms", + "_usage", + None, + None, + None, + None, + ), + ( + "code", + "for words from code and/or mathematics that are likely to be typos in other contexts (such as uint)", # noqa: E501 + "_code", + None, + None, + None, + None, + ), + ( + "names", + "for valid proper names that might be typos", + "_names", + None, + None, + None, + None, + ), + ( + "en-GB_to_en-US", + "for corrections from en-GB to en-US", + "_en-GB_to_en-US", + True, + True, + ("en_GB",), + ("en_US",), + ), +) +_builtin_default = "clear,rare" + +_builtin_default_as_tuple = tuple(_builtin_default.split(",")) + +_codespell_ignore_tag = "codespell:ignore" +_inline_ignore_regex = re.compile( + rf"[^\w\s]\s?{_codespell_ignore_tag}\b(\s+(?P<words>[\w,]*))?" 
+) + + +class UnknownBuiltinDictionaryError(ValueError): + def __init__(self, name: str) -> None: + super().__init__(f"Unknown built-in dictionary: {name}") + + +class BuiltinDictionariesAlreadyLoadedError(TypeError): + def __init__(self) -> None: + super().__init__( + "load_builtin_dictionaries must not be called more than once", + ) + + +class LineTokenizer(Protocol[T_co]): + """Callable that splits a line into multiple tokens to be spellchecked + + Generally, a regex will do for simple cases. A probably too simple one is: + + >>> tokenizer = re.compile(r"[^ ]+").finditer + + For more complex cases, either use more complex regexes or custom tokenization + code. + """ + + def __call__(self, line: str) -> Iterable[T_co]: ... + + +class Token(Protocol): + """Describes a token + + This is a protocol to support `re.Match[str]` (which codespell uses) and any + other tokenization method that our API consumers might be using. + """ + + def group(self) -> str: ... + + def start(self) -> int: ... 
+ + +class Misspelling: + def __init__(self, candidates: Sequence[str], fix: bool, reason: str) -> None: + self.candidates = candidates + self.fix = fix + self.reason = reason + + +class DetectedMisspelling(Generic[T_co]): + def __init__( + self, + word: str, + lword: str, + misspelling: Misspelling, + token: T_co, + ) -> None: + self.word = word + self.lword = lword + self.misspelling = misspelling + self.token = token + + +class Spellchecker: + def __init__(self) -> None: + self._misspellings: Dict[str, Misspelling] = {} + self.ignore_words_cased: Container[str] = frozenset() + + def _parse_inline_ignore(self, line: str) -> Optional[FrozenSet[str]]: + if _codespell_ignore_tag not in line: + return frozenset() + inline_ignore_match = _inline_ignore_regex.search(line) + if inline_ignore_match: + words = frozenset( + filter(None, (inline_ignore_match.group("words") or "").split(",")) + ) + return words if words else None + return frozenset() + + def spellcheck_line( + self, + line: str, + tokenizer: LineTokenizer[T_co], + *, + respect_inline_ignore: bool = True, + ) -> Iterable[DetectedMisspelling[T_co]]: + """Tokenize and spellcheck a line + + Split the line into tokens using the provided tokenizer. See the doc + string of the `LineTokenizer` class for an example. + + :param line: The line to spellcheck. + :param tokenizer: A callable that will tokenize the line + :param respect_inline_ignore: Whether to check the line for + `codespell:ignore` instructions + :returns: An iterable of discovered typos. 
+ """ + misspellings = self._misspellings + ignore_words_cased = self.ignore_words_cased + + extra_words_to_ignore = ( + self._parse_inline_ignore(line) if respect_inline_ignore else frozenset() + ) + if extra_words_to_ignore is None: + return + + for token in tokenizer(line): + word = token.group() + if word in ignore_words_cased: + continue + lword = word.lower() + misspelling = misspellings.get(lword) + if misspelling is not None and lword not in extra_words_to_ignore: + # Sometimes we find a 'misspelling' which is actually a valid word + # preceded by a string escape sequence. Ignore such cases as + # they're usually false alarms; see issue #17 among others. + char_before_idx = token.start() - 1 + if ( + char_before_idx >= 0 + and line[char_before_idx] == "\\" + # bell, backspace, formfeed, newline, carriage-return, tab, vtab. + and word.startswith(("a", "b", "f", "n", "r", "t", "v")) + and lword[1:] not in misspellings + ): + continue + yield DetectedMisspelling(word, lword, misspelling, token) + + def check_lower_cased_word(self, word: str) -> Optional[Misspelling]: + """Check a given word against the loaded dictionaries + + :param word: The word to check. This should be all lower-case. + """ + return self._misspellings.get(word) + + def add_from_file( + self, + filename: str, + *, + ignore_words: Container[str] = frozenset(), + ) -> None: + """Parse a codespell dictionary + + :param filename: The codespell dictionary file to parse + :param ignore_words: Words to ignore from this dictionary. + """ + misspellings = self._misspellings + with open(filename, encoding="utf-8") as f: + translate_tables = [(x, str.maketrans(x, y)) for x, y in alt_chars] + for line in f: + [key, data] = line.split("->") + # TODO: For now, convert both to lower. + # Someday we can maybe add support for fixing caps. 
+ key = key.lower() + data = data.lower() + if key not in ignore_words: + _add_misspelling(key, data, misspellings) + # generate alternative misspellings/fixes + for x, table in translate_tables: + if x in key: + alt_key = key.translate(table) + alt_data = data.translate(table) + if alt_key not in ignore_words: + _add_misspelling(alt_key, alt_data, misspellings) + + +def _add_misspelling( + key: str, + data: str, + misspellings: Dict[str, Misspelling], +) -> None: + data = data.strip() + + if "," in data: + fix = False + data, reason = data.rsplit(",", 1) + reason = reason.lstrip() + else: + fix = True + reason = "" + + misspellings[key] = Misspelling( + tuple(c.strip() for c in data.split(",")), + fix, + reason, + ) diff --git a/codespell_lib/_text_util.py b/codespell_lib/_text_util.py new file mode 100644 index 0000000000..33e6d7e033 --- /dev/null +++ b/codespell_lib/_text_util.py @@ -0,0 +1,29 @@ +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; version 2 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, see +# https://www.gnu.org/licenses/old-licenses/gpl-2.0.html. +""" +Copyright (C) 2010-2011 Lucas De Marchi +Copyright (C) 2011 ProFUSION embedded systems +""" + +from typing import Sequence + + +def fix_case(word: str, candidates: Sequence[str]) -> Sequence[str]: + if word == word.capitalize(): + return tuple(c.capitalize() for c in candidates) + if word == word.upper(): + return tuple(c.upper() for c in candidates) + # they are both lower-case + # or we don't have any idea + return candidates