Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhancement: --jobs to process files in parallel #3261

Draft
wants to merge 6 commits into
base: main
Choose a base branch
from
211 changes: 152 additions & 59 deletions codespell_lib/_codespell.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,19 @@
import re
import sys
import textwrap
from typing import Any, Dict, List, Match, Optional, Pattern, Sequence, Set, Tuple
from multiprocessing import Pool
from typing import (
Any,
Dict,
Generator,
List,
Match,
Optional,
Pattern,
Sequence,
Set,
Tuple,
)

# autogenerated by setuptools_scm
from ._version import __version__ as VERSION # type: ignore # noqa: N812
Expand Down Expand Up @@ -455,6 +467,20 @@
"should match the to-be-excluded lines exactly",
)

# -J/--jobs: number of worker processes used to check files in parallel.
# Each accepted value is listed on its own help line, so every bullet
# (including the "0" one) must end with an explicit newline.
parser.add_argument(
    "-J",
    "--jobs",
    action="store",
    type=int,
    default=0,
    help="set number of jobs to parallelize processing - one "
    "subprocess per file:\n"
    "- 0: no parallelization (default)\n"
    "- positive integer: number of sub-processes to use\n"
    "- -1: use all available CPUs\n"
    "Interactive mode is not compatible with parallel processing",
)

parser.add_argument(
"-i",
"--interactive",
Expand Down Expand Up @@ -1021,12 +1047,58 @@
return bad_count


class _FileParser:
    """Callable that fixes every parse_file() argument except the filename.

    An instance stores the shared checking state once and can then be
    invoked with just a filename, which makes it usable as the function
    argument of ``map``-style helpers.
    """

    def __init__(
        self,
        colors: TermColors,
        summary: Optional[Summary],
        misspellings: Dict[str, Misspelling],
        exclude_lines: Set[str],
        file_opener: FileOpener,
        word_regex: Pattern[str],
        ignore_word_regex: Optional[Pattern[str]],
        uri_regex: Pattern[str],
        uri_ignore_words: Set[str],
        context: Optional[Tuple[int, int]],
        options: argparse.Namespace,
    ) -> None:
        """Remember the arguments that each parse_file() call will reuse."""
        # Stored as individual attributes, in parse_file() argument order.
        self.colors, self.summary = colors, summary
        self.misspellings, self.exclude_lines = misspellings, exclude_lines
        self.file_opener = file_opener
        self.word_regex, self.ignore_word_regex = word_regex, ignore_word_regex
        self.uri_regex, self.uri_ignore_words = uri_regex, uri_ignore_words
        self.context, self.options = context, options

    def __call__(self, filename: str) -> int:
        """Run parse_file() on *filename* and return its result."""
        # Everything after the filename, in the order parse_file() expects.
        shared_args = (
            self.colors,
            self.summary,
            self.misspellings,
            self.exclude_lines,
            self.file_opener,
            self.word_regex,
            self.ignore_word_regex,
            self.uri_regex,
            self.uri_ignore_words,
            self.context,
            self.options,
        )
        return parse_file(filename, *shared_args)


def _script_main() -> int:
    """Entry point for setuptools: forward the command line to main()."""
    # sys.argv[0] is the program name; only the remaining items are passed on.
    cli_args = sys.argv[1:]
    return main(*cli_args)


def main(*args: str) -> int:
def main(*args: str) -> int: # noqa: C901,PLR0915,PLR0911
"""Contains flow control"""
try:
options, parser, used_cfg_files = parse_options(args)
Expand Down Expand Up @@ -1138,6 +1210,25 @@
else:
summary = None

if options.jobs and options.interactive:
print(

Check warning on line 1214 in codespell_lib/_codespell.py

View check run for this annotation

Codecov / codecov/patch

codespell_lib/_codespell.py#L1214

Added line #L1214 was not covered by tests
"ERROR: do not enable parallelization in interactive mode",
file=sys.stderr,
)
# no point in calling parser.print_help() here - it would only bury the ERROR message
return EX_USAGE

Check warning on line 1219 in codespell_lib/_codespell.py

View check run for this annotation

Codecov / codecov/patch

codespell_lib/_codespell.py#L1219

Added line #L1219 was not covered by tests

jobs = options.jobs
if jobs == -1:
jobs = os.cpu_count()

Check warning on line 1223 in codespell_lib/_codespell.py

View check run for this annotation

Codecov / codecov/patch

codespell_lib/_codespell.py#L1223

Added line #L1223 was not covered by tests
elif jobs < -1:
print(

Check warning on line 1225 in codespell_lib/_codespell.py

View check run for this annotation

Codecov / codecov/patch

codespell_lib/_codespell.py#L1225

Added line #L1225 was not covered by tests
f"ERROR: invalid number of jobs: {jobs}",
file=sys.stderr,
)
parser.print_help()
return EX_USAGE

Check warning on line 1230 in codespell_lib/_codespell.py

View check run for this annotation

Codecov / codecov/patch

codespell_lib/_codespell.py#L1229-L1230

Added lines #L1229 - L1230 were not covered by tests

context = None
if options.context is not None:
if (options.before_context is not None) or (options.after_context is not None):
Expand Down Expand Up @@ -1176,66 +1267,68 @@
)
return EX_USAGE

bad_count = 0
for filename in sorted(options.files):
# ignore hidden files
if is_hidden(filename, options.check_hidden):
continue

if os.path.isdir(filename):
for root, dirs, files in os.walk(filename):
if glob_match.match(root): # skip (absolute) directories
dirs.clear()
continue
if is_hidden(root, options.check_hidden): # dir itself hidden
continue
for file_ in sorted(files):
# ignore hidden files in directories
if is_hidden(file_, options.check_hidden):
continue
if glob_match.match(file_): # skip files
def _find_files() -> Generator[str, None, None]:
"""Yields filename for the parsing"""
for filename in sorted(options.files):
# ignore hidden files
if is_hidden(filename, options.check_hidden):
continue

if os.path.isdir(filename):
for root, dirs, files in os.walk(filename):
if glob_match.match(root): # skip (absolute) directories
dirs.clear()
continue
fname = os.path.join(root, file_)
if glob_match.match(fname): # skip paths
if is_hidden(root, options.check_hidden): # dir itself hidden
continue
bad_count += parse_file(
fname,
colors,
summary,
misspellings,
exclude_lines,
file_opener,
word_regex,
ignore_word_regex,
uri_regex,
uri_ignore_words,
context,
options,
)
for file_ in sorted(files):
# ignore hidden files in directories
if is_hidden(file_, options.check_hidden):
continue
if glob_match.match(file_): # skip files
continue
fname = os.path.join(root, file_)
if glob_match.match(fname): # skip paths
continue
yield fname

# skip (relative) directories
dirs[:] = [
dir_
for dir_ in dirs
if not glob_match.match(dir_)
and not is_hidden(dir_, options.check_hidden)
]

elif not glob_match.match(filename): # skip files
yield filename

# callable that binds the shared arguments, so each job is passed only the filename
file_parser = _FileParser(
colors,
summary,
misspellings,
exclude_lines,
file_opener,
word_regex,
ignore_word_regex,
uri_regex,
uri_ignore_words,
context,
options,
)

# skip (relative) directories
dirs[:] = [
dir_
for dir_ in dirs
if not glob_match.match(dir_)
and not is_hidden(dir_, options.check_hidden)
]

elif not glob_match.match(filename): # skip files
bad_count += parse_file(
filename,
colors,
summary,
misspellings,
exclude_lines,
file_opener,
word_regex,
ignore_word_regex,
uri_regex,
uri_ignore_words,
context,
options,
)
if jobs:
# parse_file would be in subprocess(es)
with Pool(jobs) as pool:
results = pool.map(file_parser, _find_files())

Check warning on line 1324 in codespell_lib/_codespell.py

View check run for this annotation

Codecov / codecov/patch

codespell_lib/_codespell.py#L1324

Added line #L1324 was not covered by tests
for result in results:
if isinstance(result, Exception):
raise result
bad_count = sum(results)

Check warning on line 1328 in codespell_lib/_codespell.py

View check run for this annotation

Codecov / codecov/patch

codespell_lib/_codespell.py#L1327-L1328

Added lines #L1327 - L1328 were not covered by tests
else:
# serial
bad_count = sum(map(file_parser, _find_files()))

if summary:
print("\n-------8<-------\nSUMMARY:")
Expand Down
Loading