Skip to content

Commit

Permalink
Merge pull request #139 from GitGuardian/jeremy/-/exclude-patterns-wi…
Browse files Browse the repository at this point in the history
…th-regex

fix(filter): stop to list FS to exclude files to scan
  • Loading branch information
jeremyds authored Nov 18, 2021
2 parents ba71c6f + acb3ce6 commit b2e1a1b
Show file tree
Hide file tree
Showing 11 changed files with 92 additions and 37 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,7 @@ paths-ignore:
```
```sh
ggshield scan --exclude dir/subdir path -r dir
ggshield scan --exclude 'dir/subdir/**' path -r dir
```
## Ignoring a secret
Expand Down
2 changes: 1 addition & 1 deletion ggshield/ci.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,7 @@ def ci_cmd(ctx: click.Context) -> int: # pragma: no cover
commit_list=commit_list,
output_handler=ctx.obj["output_handler"],
verbose=config.verbose,
filter_set=ctx.obj["filter_set"],
exclusion_regexes=ctx.obj["exclusion_regexes"],
matches_ignore=config.matches_ignore,
all_policies=config.all_policies,
scan_id=" ".join(commit_list),
Expand Down
5 changes: 2 additions & 3 deletions ggshield/cmd.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#!/usr/bin/python3
import os
import sys
from pathlib import Path
from typing import Any, List, Optional, Type, cast

import click
Expand All @@ -10,7 +9,7 @@
from .config import CONTEXT_SETTINGS, Cache, Config, load_dot_env
from .dev_scan import path_cmd, range_cmd, repo_cmd
from .docker import docker_archive_cmd, docker_name_cmd
from .filter import path_filter_set
from .filter import init_exclusion_regexes
from .hook_cmd import precommit_cmd, prepush_cmd
from .ignore import ignore
from .install import install
Expand Down Expand Up @@ -129,7 +128,7 @@ def scan(
if not ignore_default_excludes and not ctx.obj["config"].ignore_default_excludes:
paths_ignore.update(IGNORED_DEFAULT_PATTERNS)

ctx.obj["filter_set"] = path_filter_set(Path(os.getcwd()), paths_ignore)
ctx.obj["exclusion_regexes"] = init_exclusion_regexes(paths_ignore)
config: Config = ctx.obj["config"]

if show_secrets is not None:
Expand Down
13 changes: 6 additions & 7 deletions ggshield/dev_scan.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import concurrent.futures
import os
import re
import tempfile
from contextlib import contextmanager
from pathlib import Path
from typing import Iterable, Iterator, List, Optional, Set

import click
Expand All @@ -13,7 +13,6 @@
from ggshield.text_utils import STYLE, format_text

from .config import CPU_COUNT, Cache, Config
from .filter import path_filter_set
from .git_shell import GIT_PATH, get_list_commit_SHA, is_git_dir, shell
from .path import get_files_from_paths
from .utils import REGEX_GIT_URL, SupportedScanMode, handle_exception
Expand Down Expand Up @@ -48,7 +47,7 @@ def scan_repo_path(
commit_list=get_list_commit_SHA("--all"),
output_handler=output_handler,
verbose=config.verbose,
filter_set=path_filter_set(Path(os.getcwd()), []),
exclusion_regexes=set(),
matches_ignore=config.matches_ignore,
all_policies=config.all_policies,
scan_id=scan_id,
Expand Down Expand Up @@ -130,7 +129,7 @@ def range_cmd(ctx: click.Context, commit_range: str) -> int: # pragma: no cover
commit_list=commit_list,
output_handler=ctx.obj["output_handler"],
verbose=config.verbose,
filter_set=ctx.obj["filter_set"],
exclusion_regexes=ctx.obj["exclusion_regexes"],
matches_ignore=config.matches_ignore,
all_policies=config.all_policies,
scan_id=commit_range,
Expand Down Expand Up @@ -159,7 +158,7 @@ def path_cmd(
try:
files = get_files_from_paths(
paths=paths,
paths_ignore=ctx.obj["filter_set"],
exclusion_regexes=ctx.obj["exclusion_regexes"],
recursive=recursive,
yes=yes,
verbose=config.verbose,
Expand Down Expand Up @@ -217,7 +216,7 @@ def scan_commit_range(
commit_list: List[str],
output_handler: OutputHandler,
verbose: bool,
filter_set: Set[str],
exclusion_regexes: Set[re.Pattern],
matches_ignore: Iterable[str],
all_policies: bool,
scan_id: str,
Expand All @@ -238,7 +237,7 @@ def scan_commit_range(
future_to_process = [
executor.submit(
scan_commit,
Commit(sha, filter_set),
Commit(sha, exclusion_regexes),
client,
cache,
verbose,
Expand Down
54 changes: 48 additions & 6 deletions ggshield/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,20 @@
import operator
import re
from collections import OrderedDict
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Set

import click
from pygitguardian.models import Match, PolicyBreak, ScanResult


REGEX_MATCH_HIDE = re.compile(r"[^+\-\s]")
REGEX_SPECIAL_CHARS = set(".^$+*?{}()[]\\|")
INVALID_PATTERNS_REGEX = re.compile(
r"(\*\*\*)" # the "***" sequence is not valid
r"|(\*\*[^/])" # a "**" sequence must be immediately followed by a "/"
r"|([^/]\*\*)" # a "**" sequence must be either at the start of the string or
# immediately preceded by a "/"
)

MAXIMUM_CENSOR_LENGTH = 60

Expand Down Expand Up @@ -118,19 +125,54 @@ def leak_dictionary_by_ignore_sha(
return sha_dict


def path_filter_set(top_dir: Path, paths_ignore: Iterable[str]) -> Set[str]:
def translate_user_pattern(pattern: str) -> str:
    """
    Translate the user pattern into a regex. This function assumes that the given
    pattern is valid (non-empty, see is_pattern_valid) and has been normalized
    beforehand.

    Translation rules:
      - regex special characters are escaped, so they match literally;
      - a pattern that does not end with "/" is anchored at the end of the path;
      - a pattern starting with "/" is anchored at the start of the path, any
        other pattern may start at the beginning of the path or right after a "/";
      - a trailing "**" matches everything below the preceding directory;
      - "**/" matches zero or more directory levels;
      - "*" matches one or more characters other than "/".

    :param pattern: the exclude pattern written by the user
    :return: the source of a regex meant to be used with ``re.search``
    :raises IndexError: if pattern is empty
    """

    # Escape each special character
    pattern = "".join(
        f"\\{char}" if char in REGEX_SPECIAL_CHARS else char for char in pattern
    )

    # Handle start/end of pattern
    if pattern[-1] != "/":
        pattern += "$"
    if pattern[0] == "/":
        pattern = "^" + pattern[1:]
    else:
        pattern = "(^|/)" + pattern

    # Replace * and ** sequences.
    # A trailing "**" has no "/" after it, so it must be handled before the
    # generic rules below; otherwise it would be read as two single "*" and
    # would never match files located in nested directories.
    pattern = re.sub(r"\\\*\\\*\$$", ".+$", pattern)
    pattern = re.sub(r"\\\*\\\*/", "([^/]+/)*", pattern)
    pattern = re.sub(r"\\\*", "([^/]+)", pattern)

    return pattern


def is_pattern_valid(pattern: str) -> bool:
    """
    Tell whether the given exclude pattern can be translated into a regex.

    An empty pattern is rejected, as is any pattern in which
    INVALID_PATTERNS_REGEX finds a match.
    """
    if not pattern:
        return False
    return INVALID_PATTERNS_REGEX.search(pattern) is None


def init_exclusion_regexes(paths_ignore: Iterable[str]) -> Set[re.Pattern]:
    """
    Compile the given exclude patterns into a set of regexes.

    Each pattern is validated with is_pattern_valid, then translated with
    translate_user_pattern and compiled. The file system is never listed:
    the returned regexes are meant to be matched against file paths later on
    (see is_filepath_excluded).

    :param paths_ignore: the exclude patterns to compile
    :return: the set of compiled regexes, one per distinct translated pattern
    :raises click.ClickException: if one of the patterns is not valid
    """
    res = set()
    for path in paths_ignore:
        if not is_pattern_valid(path):
            raise click.ClickException(f"{path} is not a valid exclude pattern.")
        res.add(re.compile(translate_user_pattern(path)))
    return res
def is_filepath_excluded(filepath: str, exclusion_regexes: Set[re.Pattern]) -> bool:
    """
    Tell whether filepath is matched by at least one of the exclusion regexes.

    The regexes are applied with ``search``, so a match anywhere in the path
    is enough. An empty set of regexes excludes nothing.
    """
    for regex in exclusion_regexes:
        if regex.search(filepath) is not None:
            return True
    return False


def censor_match(match: Match) -> str:
Expand Down
4 changes: 2 additions & 2 deletions ggshield/hook_cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def precommit_cmd(
)
try:
check_git_dir()
results = Commit(filter_set=ctx.obj["filter_set"]).scan(
results = Commit(exclusion_regexes=ctx.obj["exclusion_regexes"]).scan(
client=ctx.obj["client"],
cache=ctx.obj["cache"],
matches_ignore=config.matches_ignore,
Expand Down Expand Up @@ -135,7 +135,7 @@ def prepush_cmd(ctx: click.Context, prepush_args: List[str]) -> int: # pragma:
commit_list=commit_list,
output_handler=ctx.obj["output_handler"],
verbose=config.verbose,
filter_set=ctx.obj["filter_set"],
exclusion_regexes=ctx.obj["exclusion_regexes"],
matches_ignore=config.matches_ignore,
all_policies=config.all_policies,
scan_id=" ".join(commit_list),
Expand Down
16 changes: 10 additions & 6 deletions ggshield/path.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import re
from pathlib import Path
from typing import Iterable, List, Set, Union

Expand All @@ -7,6 +8,7 @@
from ggshield.git_shell import git_ls, is_git_dir

from .config import MAX_FILE_SIZE
from .filter import is_filepath_excluded
from .scan import File, Files


Expand All @@ -15,7 +17,7 @@

def get_files_from_paths(
paths: List[str],
paths_ignore: List[str],
exclusion_regexes: Set[re.Pattern],
recursive: bool,
yes: bool,
verbose: bool,
Expand All @@ -31,7 +33,9 @@ def get_files_from_paths(
:param verbose: Option that displays filepaths as they are scanned
:param ignore_git: Ignore that the folder is a git repository
"""
filepaths = get_filepaths(paths, paths_ignore, recursive, ignore_git=ignore_git)
filepaths = get_filepaths(
paths, exclusion_regexes, recursive, ignore_git=ignore_git
)
files = list(generate_files_from_paths(filepaths, verbose))

if verbose:
Expand All @@ -49,7 +53,7 @@ def get_files_from_paths(

def get_filepaths(
paths: Union[List, str],
paths_ignore: Iterable[str],
exclusion_regexes: Set[re.Pattern],
recursive: bool,
ignore_git: bool,
) -> Set[str]:
Expand Down Expand Up @@ -77,9 +81,9 @@ def get_filepaths(
else:
_targets = {str(target) for target in top_dir.rglob(r"*")}

_targets.difference_update(paths_ignore)

targets.update(_targets)
for file_path in _targets:
if not is_filepath_excluded(file_path, exclusion_regexes):
targets.add(file_path)
return targets


Expand Down
2 changes: 1 addition & 1 deletion ggshield/pre_receive_cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ def prereceive_cmd(ctx: click.Context, web: bool, prereceive_args: List[str]) ->
commit_list=commit_list,
output_handler=ctx.obj["output_handler"],
verbose=config.verbose,
filter_set=ctx.obj["filter_set"],
exclusion_regexes=ctx.obj["exclusion_regexes"],
matches_ignore=config.matches_ignore,
all_policies=config.all_policies,
scan_id=" ".join(commit_list),
Expand Down
11 changes: 6 additions & 5 deletions ggshield/scan/scannable.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import concurrent.futures
import os
import re
from typing import Any, Callable, Dict, Iterable, List, NamedTuple, Optional, Set

Expand All @@ -10,6 +9,7 @@

from ggshield.config import CPU_COUNT, MAX_FILE_SIZE, Cache
from ggshield.filter import (
is_filepath_excluded,
remove_ignored_from_result,
remove_results_from_banlisted_detectors,
)
Expand Down Expand Up @@ -173,11 +173,13 @@ class Commit(Files):
Commit represents a commit which is a list of commit files.
"""

def __init__(self, sha: Optional[str] = None, filter_set: Set[str] = set()):
def __init__(
self, sha: Optional[str] = None, exclusion_regexes: Set[re.Pattern] = set()
):
self.sha = sha
self._patch: Optional[str] = None
self._files = {}
self.filter_set = filter_set
self.exclusion_regexes = exclusion_regexes
self._info: Optional[CommitInformation] = None

@property
Expand Down Expand Up @@ -262,13 +264,12 @@ def get_files(self) -> Iterable[CommitFile]:
+this is a test patch\n
"""
list_diff = re.split(r"^diff --git ", self.patch, flags=re.MULTILINE)[1:]
work_dir = os.getcwd()

for diff in list_diff:
lines = diff.split("\n")

filename = self.get_filename(lines[0])
if os.path.join(work_dir, filename) in self.filter_set:
if is_filepath_excluded(filename, self.exclusion_regexes):
continue

filemode = self.get_filemode(lines[1])
Expand Down
4 changes: 2 additions & 2 deletions tests/scan/test_scannable.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import os
from collections import namedtuple

import pytest

from ggshield.config import MAX_FILE_SIZE
from ggshield.filter import init_exclusion_regexes
from ggshield.scan import Commit
from ggshield.utils import Filemode, SupportedScanMode
from tests.conftest import (
Expand Down Expand Up @@ -158,7 +158,7 @@ def test_patch_separation_ignore():
c = Commit()
c._patch = PATCH_SEPARATION
file_to_ignore = ".env"
c.filter_set = {os.path.join(os.getcwd(), file_to_ignore)}
c.exclusion_regexes = init_exclusion_regexes([file_to_ignore])
files = list(c.get_files())

assert len(files) == 3
Expand Down
16 changes: 13 additions & 3 deletions tests/test_prepush_cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
from click.testing import CliRunner

from ggshield.cmd import cli
from ggshield.utils import EMPTY_SHA, EMPTY_TREE
from ggshield.filter import init_exclusion_regexes
from ggshield.utils import EMPTY_SHA, EMPTY_TREE, IGNORED_DEFAULT_PATTERNS


class TestPrepush:
Expand Down Expand Up @@ -88,23 +89,32 @@ def test_prepush_pre_commit_framework(
get_list_mock.assert_called_once_with(
"--max-count=51 " + "b" * 40 + "..." + "a" * 40
)

scan_commit_range_mock.assert_called_once_with(
client=ANY,
cache=ANY,
commit_list=commit_list,
output_handler=ANY,
verbose=True,
filter_set=set(),
exclusion_regexes=ANY,
matches_ignore=ANY,
all_policies=False,
scan_id=ANY,
mode_header="pre_push",
banlisted_detectors=set(),
)
scan_commit_range_mock.assert_called_once()
assert "Commits to scan: 20" in result.output
assert result.exit_code == 0

expected_exclusion_regexes = init_exclusion_regexes(IGNORED_DEFAULT_PATTERNS)
expected_exclusion_patterns = [r.pattern for r in expected_exclusion_regexes]
result_exclusion_regexes = scan_commit_range_mock.call_args_list[0][1][
"exclusion_regexes"
]
result_exclusion_patterns = [r.pattern for r in result_exclusion_regexes]

assert sorted(result_exclusion_patterns) == sorted(expected_exclusion_patterns)

@patch("ggshield.hook_cmd.get_list_commit_SHA")
@patch("ggshield.hook_cmd.scan_commit_range")
@patch("ggshield.hook_cmd.check_git_dir")
Expand Down

0 comments on commit b2e1a1b

Please sign in to comment.