From e0e9bca665edf8e91a11e47bbcce9a71581c3126 Mon Sep 17 00:00:00 2001 From: Niels Thykier Date: Fri, 17 May 2024 13:41:23 +0000 Subject: [PATCH] Speed up `codespell:ignore` check by skipping the regex in most cases The changes to provide a public API had some performance-related costs of about 1% runtime. There is no trivial way to offset this any further without undermining the API we are building. However, we can pull performance-related shenanigans to compensate for the cost introduced. The codespell codebase unsurprisingly spends a vast majority of its runtime in various regex-related code such as `search` and `finditer`. The best way to optimize runtime spent in regexes is to not do a regex in the first place, since the regex engine has a rather steep overhead over regular string primitives (that is the cost of flexibility). If the regex rarely matches and there is a very easy static substring that can be used to rule out the match, then you can speed up the code by using `substring in string` as a conditional to skip the regex. This is assuming the regex is used enough for the performance to matter. An obvious choice here falls on the `codespell:ignore` regex, because it has a very distinctive substring in the form of `codespell:ignore`, which will rule out almost all lines that will not match. With this little trick, runtime goes from ~5.6s to ~4.9s on the corpus mentioned in #3419. --- codespell_lib/spellchecker.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/codespell_lib/spellchecker.py b/codespell_lib/spellchecker.py index 69006e850a0..f46d50d3cbb 100644 --- a/codespell_lib/spellchecker.py +++ b/codespell_lib/spellchecker.py @@ -109,7 +109,10 @@ _builtin_default_as_tuple = tuple(_builtin_default.split(",")) -_inline_ignore_regex = re.compile(r"[^\w\s]\s?codespell:ignore\b(\s+(?P[\w,]*))?") +_codespell_ignore_tag = "codespell:ignore" +_inline_ignore_regex = re.compile( + rf"[^\w\s]\s?{_codespell_ignore_tag}\b(\s+(?P[\w,]*))?" 
+) class UnknownBuiltinDictionaryError(ValueError): @@ -210,6 +213,8 @@ def __init__( self.load_builtin_dictionaries(builtin_dictionaries) def _parse_inline_ignore(self, line: str) -> Optional[FrozenSet[str]]: + if _codespell_ignore_tag not in line: + return frozenset() inline_ignore_match = _inline_ignore_regex.search(line) if inline_ignore_match: words = frozenset(