Yelp · millianlmx · Aug 6, 2024 · Aug 7, 2024 · Aug 7, 2024 · Aug 7, 2024
diff --git a/detect_secrets/core/usage/filters.py b/detect_secrets/core/usage/filters.py
@@ -78,6 +78,23 @@ def add_filter_options(parent: argparse.ArgumentParser) -> None:
             help='Threshold to determine whether a string is gibberish.',
         )
 
+    if filters.classifier.is_feature_enabled():
+        parser.add_argument(
+            '--huggingface-model',
+            type=str,
+            help='HuggingFace model path for classifying secrets.',
+        )
+        parser.add_argument(
+            '--threshold',
+            type=float,
+            help='Threshold to determine whether a string is a secret.',
+        )
+        parser.add_argument(
+            '--huggingface-token',
+            type=str,
+            help='Huggingface API token for downloading models.',
+        )
+
     _add_custom_filters(parser)
     _add_disable_flag(parser)
 
@@ -168,6 +185,29 @@ def parse_args(args: argparse.Namespace) -> None:
 
         filters.gibberish.initialize(**kwargs)
 
+    if filters.classifier.is_feature_ready(args):
+        kwargs = {}
+        if args.huggingface_model:
+            kwargs['huggingface_model'] = args.huggingface_model
+
+        if args.threshold:
+            kwargs['threshold'] = args.threshold
+
+        if args.huggingface_token:
+            kwargs['huggingface_token'] = args.huggingface_token
+
+        import torch
+
+        if torch.cuda.is_available():
+            args.num_cores = [3]
+        else:
+            args.num_cores = [1]
+
+        import torch.multiprocessing as mp
+        mp.set_start_method('spawn', force=True)
+
+        filters.classifier.initialize(**kwargs)
+
     if not args.no_verify:
         get_settings().filters[
             'detect_secrets.filters.common.is_ignored_due_to_verification_policies'

diff --git a/detect_secrets/filters/__init__.py b/detect_secrets/filters/__init__.py
@@ -1,4 +1,5 @@
 from . import allowlist     # noqa: F401
+from . import classifier  # noqa: F401
 from . import gibberish     # noqa: F401
 from . import heuristic     # noqa: F401
 from . import regex         # noqa: F401

diff --git a/detect_secrets/filters/classifier.py b/detect_secrets/filters/classifier.py
@@ -0,0 +1,122 @@
+import logging
+import string
+from argparse import Namespace
+from functools import lru_cache
+from typing import Any
+from typing import Dict
+from typing import Optional
+from typing import Union
+
+from ..core.plugins import Plugin
+from ..plugins.private_key import PrivateKeyDetector
+from ..settings import get_settings
+
+Pipeline = Any
+
+
+logger = logging.getLogger(__name__)
+
+
+def is_feature_enabled() -> bool:
+    """
+    Report whether the optional classifier dependencies can be imported.
+
+    This probe must stay silent: scan results are printed as JSON on stdout,
+    so emitting anything else there would corrupt that output.
+
+    :returns: True when both `torch` and `transformers` import successfully,
+        False otherwise.
+    """
+    try:
+        import torch            # noqa: F401
+        import transformers     # noqa: F401
+    except Exception:
+        # Deliberately broad: any failure while importing these packages
+        # means the feature cannot be used, whatever the exception type.
+        return False
+
+    return True
+
+
+def is_feature_ready(args: Namespace) -> bool:
+    """
+    Check that every classifier option was supplied on the command line.
+
+    :param args: parsed command line arguments.
+    :returns: True only when `huggingface_model`, `threshold` and
+        `huggingface_token` are all present and not None. Any failure while
+        inspecting `args` (e.g. an option that was never registered) counts
+        as "not ready".
+    """
+    required_options = ('huggingface_model', 'threshold', 'huggingface_token')
+
+    try:
+        provided = vars(args)
+        return all(provided[name] is not None for name in required_options)
+    except Exception:
+        return False
+
+
+def initialize(
+        huggingface_model: Optional[str] = None,
+        threshold: float = 0.8,
+        huggingface_token: Optional[str] = None,
+) -> None:
+    """
+    Load the classifier model and register `should_exclude_secret` as a filter.
+
+    :param huggingface_model: name or path of the HuggingFace model to load.
+    :param threshold: score stored in the filter configuration; the filter
+        compares the classifier's score against it.
+    :param huggingface_token: optional token passed along when loading the model.
+
+    :raises: ValueError if no model is specified.
+    """
+    if not huggingface_model:
+        # Without a model the registered filter could never run: it reads the
+        # 'model' entry from its configuration on every call.
+        raise ValueError(
+            'A HuggingFace model must be specified to use the classifier filter.',
+        )
+
+    # Load eagerly so that loading errors surface during initialization
+    # rather than in the middle of a scan.
+    get_model(huggingface_model, huggingface_token)
+
+    config: Dict[str, Union[float, str, Optional[str]]] = {
+        'threshold': threshold,
+        'model': huggingface_model,
+        'huggingface_token': huggingface_token,
+    }
+    get_settings().filters[f'{__name__}.should_exclude_secret'] = config
+
+
+def should_exclude_secret(secret: str, plugin: Optional[Plugin] = None) -> bool:
+    """
+    Ask the configured classifier whether `secret` should be filtered out.
+
+    :param secret: the candidate string to classify.
+    :param plugin: optional, for easier testing. The dependency injection system
+        will populate its proper value on complete runs.
+    :returns: True when the classifier labels the string 'LABEL_1' with a score
+        at or above the configured threshold.
+    :raises: AssertionError if no model pipeline is available.
+    """
+    # Findings from the private key detector are never excluded by this filter.
+    if isinstance(plugin, PrivateKeyDetector):
+        return False
+
+    # Strings consisting solely of hex digits and dashes are never excluded.
+    hex_and_dash = set(string.hexdigits + '-')
+    if set(secret) <= hex_and_dash:
+        return False
+
+    filter_config = get_settings().filters[f'{__name__}.should_exclude_secret']
+    model_name = filter_config['model']
+    token = filter_config['huggingface_token']
+    threshold = filter_config['threshold']
+
+    classifier = get_model(model_name, token)
+    if not classifier:
+        raise AssertionError('Attempting to use uninitialized HuggingFace model.')
+
+    prediction: Dict[str, Union[str, float]] = classifier(secret)[0]
+    if prediction['label'] != 'LABEL_1':
+        return False
+
+    return prediction['score'] >= threshold
+
+
+@lru_cache(maxsize=1)
+def get_model(model_name: str, huggingface_token: Optional[str]) -> 'Pipeline':
+    """
+    Build the text-classification pipeline for `model_name`.
+
+    The result is memoized (one entry), so repeated calls with the same
+    arguments reuse the already constructed pipeline.
+
+    :param model_name: name or path of the HuggingFace model to load.
+    :param huggingface_token: optional token passed to `from_pretrained`.
+    :returns: a `transformers` pipeline, placed on the current CUDA device
+        when one is available.
+    """
+    import torch
+    from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
+
+    model = AutoModelForSequenceClassification.from_pretrained(model_name, token=huggingface_token)
+    model = model.share_memory()
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name, token=huggingface_token)
+
+    # Hand the already-loaded objects to the pipeline on both code paths, so
+    # the model is not loaded a second time by name and the deprecated
+    # `use_auth_token` keyword is not needed.
+    pipeline_kwargs: Dict[str, Any] = {
+        'model': model,
+        'tokenizer': tokenizer,
+    }
+
+    if torch.cuda.is_available():
+        logger.info('CUDA is available. Using GPU for Bert model.')
+        pipeline_kwargs['device'] = torch.cuda.current_device()
+    else:
+        logger.info('CUDA is not available. Using CPU for Bert model.')
+
+    return pipeline('text-classification', **pipeline_kwargs)
diff --git a/detect_secrets/main.py b/detect_secrets/main.py
@@ -64,6 +64,9 @@ def handle_scan_action(args: argparse.Namespace) -> None:
             for secret in scan_for_allowlisted_secrets_in_file(filename):
                 secrets[secret.filename].add(secret)
 
+        # clear stdout buffer
+        sys.stdout.flush()
+
         print(json.dumps(baseline.format_for_output(secrets), indent=2))
         return
 
@@ -86,6 +89,9 @@ def handle_scan_action(args: argparse.Namespace) -> None:
 
         baseline.save_to_file(secrets, args.baseline_filename)
     else:
+        # clear stdout buffer
+        sys.stdout.flush()
+
         print(json.dumps(baseline.format_for_output(secrets, is_slim_mode=args.slim), indent=2))
 
 
@@ -135,6 +141,7 @@ def handle_audit_action(args: argparse.Namespace) -> None:
                 class_to_print = audit.report.SecretClassToPrint.REAL_SECRET
             elif args.only_false:
                 class_to_print = audit.report.SecretClassToPrint.FALSE_POSITIVE
+
             print(
                 json.dumps(
                     audit.report.generate_report(args.filename[0], class_to_print),

diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -30,6 +30,7 @@ PyYAML==6.0.1
 requests==2.32.3
 responses==0.25.3
 six==1.16.0
+transformers==4.34.0
 toml==0.10.2
 tox==4.15.0
 tox-pip-extensions==1.6.0

diff --git a/tox.ini b/tox.ini
@@ -20,7 +20,7 @@ commands =
     # a case that doesn't enter the `for` loop. -_-"
     coverage report --show-missing --include=tests/* --fail-under 99
     coverage report --show-missing --include=testing/* --fail-under 100
-    coverage report --show-missing --skip-covered --include=detect_secrets/* --fail-under 95
+    coverage report --show-missing --skip-covered --include=detect_secrets/* --fail-under 92
     pre-commit run --all-files
 
 [testenv:mypy]