Skip to content

Commit

Permalink
initial regex check
Browse files Browse the repository at this point in the history
  • Loading branch information
doug-szeto-slalom committed Jun 4, 2024
1 parent 6c080f1 commit 05064dd
Show file tree
Hide file tree
Showing 8 changed files with 336 additions and 70 deletions.
22 changes: 21 additions & 1 deletion secureli/actions/scan.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from secureli.modules.observability.observability_services.logging import LoggingService
from secureli.modules.core.core_services.scanner import HooksScannerService
from secureli.modules.pii_scanner.pii_scanner import PiiScannerService
from secureli.modules.custom_scanner.custom_scanner import CustomScannerService
from secureli.modules.shared.models.scan import ScanMode, ScanResult
from secureli.settings import Settings
from secureli.modules.shared import utilities
Expand All @@ -38,11 +39,13 @@ def __init__(
action_deps: action.ActionDependencies,
hooks_scanner: HooksScannerService,
pii_scanner: PiiScannerService,
custom_scanner: CustomScannerService,
git_repo: GitRepo,
):
super().__init__(action_deps)
self.hooks_scanner = hooks_scanner
self.pii_scanner = pii_scanner
self.custom_scanner = custom_scanner
self.git_repo = git_repo

def publish_results(
Expand Down Expand Up @@ -114,17 +117,28 @@ def scan_repo(

# Execute PII scan (unless `specific_test` is provided, in which case it will be for a hook below)
pii_scan_result: ScanResult | None = None
custom_regex_patterns = self._get_custom_scan_patterns(folder_path=folder_path)
custom_scan_result: ScanResult | None = None
if not specific_test:
pii_scan_result = self.pii_scanner.scan_repo(
folder_path, scan_mode, files=files
)

custom_scan_result = self.custom_scanner.scan_repo(
folder_path=folder_path,
scan_mode=scan_mode,
files=files,
custom_regex_patterns=custom_regex_patterns,
)

# Execute hooks
hooks_scan_result = self.hooks_scanner.scan_repo(
folder_path, scan_mode, specific_test, files=files
)

scan_result = utilities.merge_scan_results([pii_scan_result, hooks_scan_result])
scan_result = utilities.merge_scan_results(
[pii_scan_result, hooks_scan_result, custom_scan_result]
)

details = scan_result.output or "Unknown output during scan"
self.action_deps.echo.print(details)
Expand All @@ -137,6 +151,7 @@ def scan_repo(
individual_failure_count = utilities.convert_failures_to_failure_count(
scan_result.failures
)
temp = "BLAGH.py"

log_data = (
self.action_deps.logging.success(LogAction.scan)
Expand Down Expand Up @@ -207,3 +222,8 @@ def _get_commited_files(self, scan_mode: ScanMode) -> list[Path]:
return [Path(file) for file in committed_files]
except:
return None

def _get_custom_scan_patterns(self, folder_path: Path) -> list[str]:
    """
    Loads the custom scan patterns saved in the settings for the given folder
    :param folder_path: The folder whose settings are loaded
    :return: The saved custom regex patterns, or an empty list when none are configured
    """
    settings = self.action_deps.settings.load(folder_path)
    scan_patterns = settings.scan_patterns
    # The scan_patterns section is optional in the settings; treat a missing
    # section (or a missing list inside it) as "no custom patterns"
    if scan_patterns is None:
        return []
    return scan_patterns.custom_scan_patterns or []
26 changes: 17 additions & 9 deletions secureli/actions/update.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,19 +70,23 @@ def _validate_regex(self, pattern: str) -> bool:
re.compile(pattern)
return True
except:
self.action_deps.echo.warning(f'Invalid regex pattern detected: "{pattern}". Excluding pattern.\n')
self.action_deps.echo.warning(
f'Invalid regex pattern detected: "{pattern}". Excluding pattern.\n'
)
return False

def _validate_pattern(self, pattern: str, patterns: List[str]) -> bool:
    """
    Checks the pattern is a valid Regex and is not already present in the patterns list
    :param pattern: A string to be checked
    :param patterns: A reference list to check for duplicate values
    :return: True when the pattern is new and compiles as a regex, otherwise False
    """
    if pattern in patterns:
        # Warn exactly once per duplicate, then reject it
        self.action_deps.echo.warning(
            f'Duplicate scan pattern detected: "{pattern}". Excluding pattern.'
        )
        return False

    return self._validate_regex(pattern)

def add_pattern(self, folder_path, patterns: List[str]):
Expand All @@ -92,23 +96,27 @@ def add_pattern(self, folder_path, patterns: List[str]):
:param patterns: A user provided list of regex patterns to be saved
"""

#Algorithm Notes:
#for each pattern
# Algorithm Notes:
# for each pattern
# Check pattern is a valid regex
# if invalid, print warning and filter out pattern
# Check pattern is not present in custom_scan_patterns list
# if present, print warning and do not add duplicate
# Prevent repeated flags from being added twice
#add new patterns to custom_scan_patterns list
#save updated custom_scan_patterns list to secureli yaml file
# add new patterns to custom_scan_patterns list
# save updated custom_scan_patterns list to secureli yaml file

saved_patterns = []
settings = self.action_deps.settings.load(folder_path)
if settings.scan_patterns is not None:
saved_patterns = settings.scan_patterns.custom_scan_patterns

# Use a set comprehension to prevent flag duplicates
new_patterns = { pattern for pattern in patterns if self._validate_pattern(pattern, saved_patterns)}
new_patterns = {
pattern
for pattern in patterns
if self._validate_pattern(pattern, saved_patterns)
}
saved_patterns.extend(new_patterns)

if len(saved_patterns) > 0:
Expand Down
6 changes: 6 additions & 0 deletions secureli/container.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from secureli.modules.core.core_services.scanner import HooksScannerService
from secureli.modules.core.core_services.updater import UpdaterService
from secureli.modules.pii_scanner.pii_scanner import PiiScannerService
from secureli.modules.custom_scanner.custom_scanner import CustomScannerService
from secureli.modules.secureli_ignore import SecureliIgnoreService
from secureli.settings import Settings

Expand Down Expand Up @@ -143,6 +144,10 @@ class Container(containers.DeclarativeContainer):
echo=echo,
)

custom_scanner_service = providers.Factory(
CustomScannerService, repo_files=repo_files_repository, echo=echo
)

updater_service = providers.Factory(
UpdaterService,
pre_commit=pre_commit_abstraction,
Expand Down Expand Up @@ -183,6 +188,7 @@ class Container(containers.DeclarativeContainer):
action_deps=action_deps,
hooks_scanner=hooks_scanner_service,
pii_scanner=pii_scanner_service,
custom_scanner=custom_scanner_service,
git_repo=git_repo,
)

Expand Down
13 changes: 7 additions & 6 deletions secureli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,13 +184,13 @@ def update(
),
] = Path("."),
new_patterns: Annotated[
Optional[List[str]],
Optional[List[str]],
Option(
"--new-pattern",
"-n",
help="Add a new Regex to the custom scan pattern list"
)
] = None
"--new-pattern",
"-n",
help="Add a new Regex to the custom scan pattern list",
),
] = None,
):
"""
Update linters, configuration, custom scan patterns and all else needed to maintain a secure repository.
Expand All @@ -201,5 +201,6 @@ def update(
SecureliConfig.FOLDER_PATH = Path(directory)
container.update_action().update_hooks(Path(directory), latest)


if __name__ == "__main__":
app()
215 changes: 215 additions & 0 deletions secureli/modules/custom_scanner/custom_scanner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
from secureli.modules.shared.consts.pii import (
Format,
RESULT_FORMAT,
SECURELI_GITHUB,
)
import os
import re
from typing import Optional
from pathlib import Path
import pydantic

import secureli.modules.shared.models.scan as scan
from secureli.modules.shared.abstractions.echo import EchoAbstraction
from secureli.repositories.repo_files import RepoFilesRepository


class CustomScanResult(pydantic.BaseModel):
    """
    One match of a custom RegEx pattern: the line it was found on and the pattern that hit
    """

    # Line number of the matching line, as counted by the scanner
    line_num: int
    # Text of the regex pattern that matched the line
    regex_pattern: str


class CustomScannerService:
    """
    Scans the repo for lines matching user-defined (custom) RegEx patterns
    """

    # The settings file stores the patterns themselves, so it would always match
    _SETTINGS_FILE_NAME = ".secureli.yaml"

    def __init__(
        self,
        repo_files: RepoFilesRepository,
        echo: EchoAbstraction,
    ):
        self.repo_files = repo_files
        self.echo = echo

    def scan_repo(
        self,
        folder_path: Path,
        scan_mode: scan.ScanMode,
        custom_regex_patterns: list[str],
        files: Optional[list[str]] = None,
    ) -> scan.ScanResult:
        """
        Scans the repo for lines matching the custom RegEx patterns
        :param folder_path: The folder path to initialize the repo for
        :param scan_mode: Whether to scan the staged files (i.e., the files about to be
        committed) or the entire repository
        :param custom_regex_patterns: The regex patterns to search each line for
        :param files: A specified list of files to scan
        :return: A ScanResult object with details of whether the scan succeeded and, if not, details of the failures
        """
        file_paths = self._get_files_list(
            folder_path=folder_path, scan_mode=scan_mode, files=files
        )
        compiled_patterns = self._compile_patterns(custom_regex_patterns)
        custom_regex_found: dict[str, list[CustomScanResult]] = {}

        for file_path in file_paths:
            file_name = str(file_path)
            try:
                with open(file_path) as file:
                    # enumerate restarts the count for every file, so a read
                    # error in one file can never skew the next file's numbers
                    for line_num, line in enumerate(file, start=1):
                        for pattern_text, pattern in compiled_patterns:
                            if pattern.search(line):
                                custom_regex_found.setdefault(file_name, []).append(
                                    CustomScanResult(
                                        line_num=line_num,
                                        regex_pattern=pattern_text,
                                    )
                                )
            except Exception as e:
                # Best effort: an unreadable file (binary, missing, ...) is
                # reported and skipped rather than aborting the whole scan
                self.echo.print(f"Error scanning for custom RegEx {file_name}: {e}")

        success = not custom_regex_found
        scan_failures = self._generate_scan_failures(set(custom_regex_found))
        output = self._generate_scan_output(custom_regex_found, success)

        return scan.ScanResult(
            successful=success,
            output=output,
            failures=scan_failures,
        )

    def _compile_patterns(
        self, custom_regex_patterns: Optional[list[str]]
    ) -> list[tuple[str, re.Pattern]]:
        """
        Compiles each custom pattern once, skipping (and reporting) invalid ones
        :param custom_regex_patterns: The raw regex pattern strings, may be None or empty
        :return: Pairs of (original pattern text, compiled pattern)
        """
        compiled: list[tuple[str, re.Pattern]] = []
        for pattern_text in custom_regex_patterns or []:
            try:
                compiled.append((pattern_text, re.compile(pattern_text)))
            except (re.error, TypeError) as e:
                self.echo.print(
                    f'Invalid custom RegEx pattern "{pattern_text}" skipped: {e}'
                )
        return compiled

    def _file_excluded(self, filename) -> bool:
        """
        Indicates whether a file should be excluded from the scan
        :param filename: The name of the file to check
        :return: Always False; no exclusion rules are implemented yet
        """
        return False

    def _get_files_list(
        self,
        folder_path: Path,
        scan_mode: scan.ScanMode,
        files: Optional[list[str]] = None,
    ) -> list[Path]:
        """
        Gets the list of files to scan based on ScanMode and, if applicable, files provided in arguments
        Note: Files cannot be specified for the `all-files` ScanMode. Also, if a provided file is not staged,
        it will not be scanned
        :param folder_path: The folder path to initialize the repo for
        :param scan_mode: Whether to scan the staged files (i.e., the files about to be
        committed) or the entire repository
        :param files: A specified list of files to scan
        :return: List of file names to be scanned
        """
        file_paths: list = []

        if scan_mode == scan.ScanMode.STAGED_ONLY:
            file_paths = self.repo_files.list_staged_files(folder_path)
            if files:
                # Normalise both sides to Path so a str never fails to equal
                # the Path that names the same file
                staged = {Path(staged_file) for staged_file in file_paths}
                file_paths = [file for file in files if Path(file) in staged]

        if scan_mode == scan.ScanMode.ALL_FILES:
            file_paths = self.repo_files.list_repo_files(folder_path)

        # must exclude the .secureli.yaml file since it stores the regex patterns that are being checked
        return [
            file
            for file in file_paths
            if Path(file).name != self._SETTINGS_FILE_NAME
        ]

    def _generate_scan_failures(
        self, custom_regex_found_files: set[str]
    ) -> list[scan.ScanFailure]:
        """
        Generates a list of ScanFailures for each file in which custom RegEx was found
        :param custom_regex_found_files: The set of files in which custom RegEx was found
        :return: List of ScanFailures
        """
        return [
            scan.ScanFailure(id="custom_regex_scan", file=file, repo=SECURELI_GITHUB)
            for file in custom_regex_found_files
        ]

    def _generate_initial_output(self, success: bool) -> str:
        """
        Generates the initial output of the custom RegEx scan, indicating whether the scan passed or failed
        :param success: Whether the scan passed
        :return: A string that will be used at the beginning of the output result
        """
        CHECK_STR = "check for custom RegEx"
        MAX_RESULT_LENGTH = (
            82  # this aims to align with the results output by pre-commit hooks
        )

        if success:
            result = self._format_string("Passed", [Format.GREEN_BG]) + " "
            final_msg = ""
        else:
            result = self._format_string("Failed", [Format.RED_BG]) + "\n"
            final_msg = "\n" + self._format_string(
                "Custom RegEx found!", [Format.BOLD_WEIGHT, Format.RED_TXT]
            )

        # A negative count simply yields no dots
        length_of_dots = MAX_RESULT_LENGTH - len(CHECK_STR) - len(result)

        return f"{CHECK_STR}{'.' * length_of_dots}{result}{final_msg}"

    def _generate_scan_output(
        self, custom_regex_found: dict[str, list[CustomScanResult]], success: bool
    ) -> str:
        """
        Generates the scan output of the custom RegEx scan, listing every line where a pattern matched
        :param custom_regex_found: The breakdown of what custom RegEx was found, and where
        :param success: Whether the scan passed
        :return: The final output result
        """
        parts = [self._generate_initial_output(success)]
        for file, results in custom_regex_found.items():
            parts.append(
                "\n"
                + self._format_string(
                    f"File: {file}", [Format.BOLD_WEIGHT, Format.PURPLE_TXT]
                )
            )
            parts.extend(
                f"\n Line {result.line_num} | Pattern Matched: {result.regex_pattern}"
                for result in results
            )
        parts.append("\n")
        return "".join(parts)

    def _format_string(self, str: str, formats: list[Format]) -> str:
        """
        Applies formatting to a string
        :param str: The string to format
        :param formats: The formatting to apply to the string
        :return: The formatted string
        """
        start = "".join(f"{RESULT_FORMAT[fmt]}" for fmt in formats)
        end = f"{RESULT_FORMAT[Format.DEFAULT]}{RESULT_FORMAT[Format.REG_WEIGHT]}"

        return f"{start}{str}{end}"
Loading

0 comments on commit 05064dd

Please sign in to comment.