Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding support for filtering secrets using deep learning models #873

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
40 changes: 40 additions & 0 deletions detect_secrets/core/usage/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,23 @@ def add_filter_options(parent: argparse.ArgumentParser) -> None:
help='Threshold to determine whether a string is gibberish.',
)

if filters.classifier.is_feature_enabled():
parser.add_argument(
'--huggingface-model',
type=str,
help='HuggingFace model path for classifying secrets.',
)
parser.add_argument(
'--threshold',
type=float,
help='Threshold to determine whether a string is a secret.',
)
parser.add_argument(
'--huggingface-token',
type=str,
help='Huggingface API token for downloading models.',
)

_add_custom_filters(parser)
_add_disable_flag(parser)

Expand Down Expand Up @@ -168,6 +185,29 @@ def parse_args(args: argparse.Namespace) -> None:

filters.gibberish.initialize(**kwargs)

if filters.classifier.is_feature_ready(args):
kwargs = {}
if args.huggingface_model:
kwargs['huggingface_model'] = args.huggingface_model

if args.threshold:
kwargs['threshold'] = args.threshold

if args.huggingface_token:
kwargs['huggingface_token'] = args.huggingface_token

import torch

if torch.cuda.is_available():
args.num_cores = [3]
else:
args.num_cores = [1]

import torch.multiprocessing as mp
mp.set_start_method('spawn', force=True)

filters.classifier.initialize(**kwargs)

if not args.no_verify:
get_settings().filters[
'detect_secrets.filters.common.is_ignored_due_to_verification_policies'
Expand Down
1 change: 1 addition & 0 deletions detect_secrets/filters/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from . import allowlist # noqa: F401
from . import classifier # noqa: F401
from . import gibberish # noqa: F401
from . import heuristic # noqa: F401
from . import regex # noqa: F401
Expand Down
122 changes: 122 additions & 0 deletions detect_secrets/filters/classifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
import logging
import string
from argparse import Namespace
from functools import lru_cache
from typing import Any
from typing import Dict
from typing import Optional
from typing import Union

from ..core.plugins import Plugin
from ..plugins.private_key import PrivateKeyDetector
from ..settings import get_settings

Pipeline = Any


logger = logging.getLogger(__name__)


def is_feature_enabled() -> bool:
    """
    Report whether the optional dependencies of this filter (``torch`` and
    ``transformers``) can be imported.

    Nothing is written to stdout here: scan results are printed to stdout as
    JSON, so any extra output (such as library versions) would corrupt them.

    :returns: True if both packages import cleanly, False otherwise.
    """
    try:
        import torch  # noqa: F401
        import transformers  # noqa: F401
    except Exception as e:
        # Best-effort probe: a missing or broken installation just means the
        # feature is unavailable, but keep the reason discoverable.
        logging.getLogger(__name__).debug(
            'HuggingFace classifier filter is unavailable: %s', e,
        )
        return False

    return True


def is_feature_ready(args: Namespace) -> bool:
    """
    Check that every classifier option was supplied on the command line.

    :returns: True only if `huggingface_model`, `threshold` and
        `huggingface_token` are all present on `args` and not None. Any
        error while inspecting `args` yields False.
    """
    required_options = ('huggingface_model', 'threshold', 'huggingface_token')

    try:
        options = vars(args)
        return all(options[name] is not None for name in required_options)
    except Exception:
        return False


def initialize(
    huggingface_model: Optional[str] = None,
    threshold: float = 0.8,
    huggingface_token: Optional[str] = None,
) -> None:
    """
    Load the classifier model and register this filter in the settings.

    :param huggingface_model: HuggingFace model path used to classify secrets.
    :param threshold: score threshold stored in the filter configuration; it
        is compared against the model's score in `should_exclude_secret`.
    :param huggingface_token: HuggingFace API token passed along when the
        model is loaded.

    :raises: ValueError if no model is provided.
    """
    if not huggingface_model:
        # Without a model there is nothing to load, and the filter would later
        # fail when it looks up its 'model' setting.
        raise ValueError(
            'A HuggingFace model is required to initialize the classifier filter.',
        )

    # Load the model before registering the filter, so that a model which
    # cannot be loaded never leaves a half-configured filter behind.
    get_model(huggingface_model, huggingface_token)

    config: Dict[str, Union[float, str, Optional[str]]] = {
        'threshold': threshold,
        'model': huggingface_model,
        'huggingface_token': huggingface_token,
    }

    get_settings().filters[f'{__name__}.should_exclude_secret'] = config


def should_exclude_secret(secret: str, plugin: Optional[Plugin] = None) -> bool:
    """
    Decide whether `secret` should be filtered out, based on the configured
    HuggingFace text-classification model.

    :param plugin: optional, so that tests can call this directly. On complete
        runs, the dependency injection system supplies the proper value.

    :raises: AssertionError if no usable model pipeline is available.
    """
    # Findings reported by the private key detector are never excluded here.
    if isinstance(plugin, PrivateKeyDetector):
        return False

    # Strings consisting solely of hex digits and dashes are never excluded.
    allowed_chars = string.hexdigits + '-'
    if set(secret).issubset(allowed_chars):
        return False

    config = get_settings().filters[f'{__name__}.should_exclude_secret']
    model_name = config['model']
    token = config['huggingface_token']
    threshold = config['threshold']

    classifier = get_model(model_name, token)
    if not classifier:
        raise AssertionError('Attempting to use uninitialized HuggingFace model.')

    prediction: Dict[str, Union[str, float]] = classifier(secret)[0]

    # Exclude only when the top prediction is LABEL_1 with a high enough score.
    if prediction['label'] != 'LABEL_1':
        return False

    return prediction['score'] >= threshold


@lru_cache(maxsize=1)
def get_model(model_name: str, huggingface_token: Optional[str]) -> 'Pipeline':
    """
    Build the text-classification pipeline for `model_name`.

    The result is memoized; only the most recently requested
    (model, token) pair is kept.

    :param model_name: HuggingFace model path to load.
    :param huggingface_token: HuggingFace API token passed to
        `from_pretrained` for both the model and the tokenizer.
    :returns: a `transformers` text-classification pipeline, placed on the
        current CUDA device when one is available.
    """
    # Imported lazily: these dependencies are optional for the project.
    import torch
    from transformers import AutoModelForSequenceClassification
    from transformers import AutoTokenizer
    from transformers import pipeline

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        token=huggingface_token,
    )
    # NOTE(review): presumably shared so that worker processes can reuse the
    # weights instead of copying them -- confirm against the scanning code.
    model = model.share_memory()

    tokenizer = AutoTokenizer.from_pretrained(model_name, token=huggingface_token)

    # Both devices use the objects loaded above, so the model is only loaded
    # once and the token is passed the same way in either case.
    pipeline_kwargs: Dict[str, Any] = {
        'model': model,
        'tokenizer': tokenizer,
    }

    if torch.cuda.is_available():
        logger.info('CUDA is available. Using GPU for Bert model.')
        pipeline_kwargs['device'] = torch.cuda.current_device()
    else:
        logger.info('CUDA is not available. Using CPU for Bert model.')

    return pipeline('text-classification', **pipeline_kwargs)
7 changes: 7 additions & 0 deletions detect_secrets/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,9 @@ def handle_scan_action(args: argparse.Namespace) -> None:
for secret in scan_for_allowlisted_secrets_in_file(filename):
secrets[secret.filename].add(secret)

# clear stdout buffer
sys.stdout.flush()

print(json.dumps(baseline.format_for_output(secrets), indent=2))
return

Expand All @@ -86,6 +89,9 @@ def handle_scan_action(args: argparse.Namespace) -> None:

baseline.save_to_file(secrets, args.baseline_filename)
else:
# clear stdout buffer
sys.stdout.flush()

print(json.dumps(baseline.format_for_output(secrets, is_slim_mode=args.slim), indent=2))


Expand Down Expand Up @@ -135,6 +141,7 @@ def handle_audit_action(args: argparse.Namespace) -> None:
class_to_print = audit.report.SecretClassToPrint.REAL_SECRET
elif args.only_false:
class_to_print = audit.report.SecretClassToPrint.FALSE_POSITIVE

print(
json.dumps(
audit.report.generate_report(args.filename[0], class_to_print),
Expand Down
1 change: 1 addition & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ PyYAML==6.0.1
requests==2.32.3
responses==0.25.3
six==1.16.0
transformers==4.34.0
toml==0.10.2
tox==4.15.0
tox-pip-extensions==1.6.0
Expand Down
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ commands =
# a case that doesn't enter the `for` loop. -_-"
coverage report --show-missing --include=tests/* --fail-under 99
coverage report --show-missing --include=testing/* --fail-under 100
coverage report --show-missing --skip-covered --include=detect_secrets/* --fail-under 95
coverage report --show-missing --skip-covered --include=detect_secrets/* --fail-under 92
pre-commit run --all-files

[testenv:mypy]
Expand Down
Loading