Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove ML_VALIDATION #658

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions credsweeper/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
# Directory of credsweeper sources MUST be placed before imports to avoid circular import error
APP_PATH = Path(__file__).resolve().parent

from credsweeper.common.constants import KeyValidationOption, Severity, ThresholdPreset, DiffRowType
from credsweeper.common.constants import Severity, ThresholdPreset, DiffRowType
from credsweeper.config import Config
from credsweeper.credentials import Candidate, CredentialManager, CandidateKey
from credsweeper.deep_scanner.deep_scanner import DeepScanner
Expand Down Expand Up @@ -167,7 +167,8 @@ def _use_ml_validation(self) -> bool:
logger.info("Skip ML validation because no candidates were found")
return False
for i in self.credential_manager.candidates:
if i.use_ml:
# None value means ml should be processed for the candidate
if i.ml_probability is None:
# an explicit loop with early return is used instead of any() or all() to speed up the check
return True
logger.info("Skip ML validation because no candidates support it")
Expand Down Expand Up @@ -353,7 +354,7 @@ def post_processing(self) -> None:
for group_key, group_candidates in cred_groups.items():
# Analyze with ML if any candidate in the group requires ML
for candidate in group_candidates:
if candidate.use_ml:
if candidate.ml_probability is None:
ml_cred_groups.append((group_key, group_candidates))
break
else:
Expand All @@ -366,13 +367,11 @@ def post_processing(self) -> None:
is_cred, probability = self.ml_validator.validate_groups(ml_cred_groups, self.ml_batch_size)
for i, (_, group_candidates) in enumerate(ml_cred_groups):
for candidate in group_candidates:
if candidate.use_ml:
if candidate.ml_probability is None:
if is_cred[i]:
candidate.ml_validation = KeyValidationOption.VALIDATED_KEY
candidate.ml_probability = probability[i]
new_cred_list.append(candidate)
else:
candidate.ml_validation = KeyValidationOption.NOT_AVAILABLE
new_cred_list.append(candidate)
else:
logger.info("Skipping ML validation due not applicable")
Expand Down Expand Up @@ -435,7 +434,8 @@ def export_results(self, change_type: Optional[DiffRowType] = None) -> None:
for line_data in credential.line_data_list:
# bright rule name and path or info
print(Style.BRIGHT + credential.rule_name +
f" {line_data.info or line_data.path}:{line_data.line_num}" + Style.RESET_ALL)
f" {line_data.info or line_data.path}:{line_data.line_num} {credential.ml_probability}" +
Style.RESET_ALL)
print(line_data.get_colored_line(hashed=self.hashed, subtext=self.subtext))

if is_exported is False:
Expand Down
8 changes: 0 additions & 8 deletions credsweeper/common/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,14 +97,6 @@ class Chars(Enum):
ENTROPY_LIMIT_BASE3x = 3


class KeyValidationOption(Enum):
"""API validation state"""
INVALID_KEY = 0
VALIDATED_KEY = 1
UNDECIDED = 2
NOT_AVAILABLE = 3


class GroupType(Enum):
"""Group type - used in Group constructor for load predefined set of filters"""
KEYWORD = "keyword"
Expand Down
23 changes: 9 additions & 14 deletions credsweeper/credentials/candidate.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import copy
import re
from json.encoder import py_encode_basestring_ascii
from typing import Any, Dict, List, Optional
from typing import Any, Dict, List, Optional, Union

from credsweeper.common.constants import KeyValidationOption, Severity, Confidence
from credsweeper.common.constants import Severity, Confidence
from credsweeper.config import Config
from credsweeper.credentials.line_data import LineData

Expand All @@ -20,8 +20,7 @@ class Candidate:
severity: critical/high/medium/low
confidence: strong/moderate/weak
config: user configs
validations: List of Validation objects that can check this credential using external API
use_ml: Should ML work on this credential or not. If not prediction based on regular expression and filter only
use_ml: Whether the candidate should be validated with ML. If not - ml_probability is set to "NA"
"""

def __init__(self,
Expand All @@ -37,18 +36,16 @@ def __init__(self,
self.rule_name = rule_name
self.severity = severity
self.config = config
self.use_ml = use_ml
# None - ML is applicable but not processed yet; "NA" - ML is not applicable; float - the ml decision
# Note: -1.0 is also a possible output of some activation functions in the ML model, so negative values are avoided as a marker
self.ml_probability: Union[None, str, float] = None if use_ml else "NA"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about using None, not "NA", for ml_probability?
I think "NA" seems better to convey the meaning.

self.confidence = confidence
self.ml_validation = KeyValidationOption.NOT_AVAILABLE
self.ml_probability: Optional[float] = None

def compare(self, other: 'Candidate') -> bool:
"""Comparison method - checks only result of final cred"""
if self.rule_name == other.rule_name \
and self.severity == other.severity \
and self.confidence == other.confidence \
and self.use_ml == other.use_ml \
and self.ml_validation == other.ml_validation \
and self.ml_probability == other.ml_probability \
and len(self.line_data_list) == len(other.line_data_list):
for i, j in zip(self.line_data_list, other.line_data_list):
Expand Down Expand Up @@ -78,8 +75,8 @@ def to_str(self, subtext: bool = False, hashed: bool = False) -> str:
return f"rule: {self.rule_name}" \
f" | severity: {self.severity.value}" \
f" | confidence: {self.confidence.value}" \
f" | line_data_list: [{', '.join([x.to_str(subtext, hashed) for x in self.line_data_list])}]" \
f" | ml_validation: {self.ml_validation.name}"
f" | ml_probability: {self.ml_probability}" \
f" | line_data_list: [{', '.join([x.to_str(subtext, hashed) for x in self.line_data_list])}]"

def __str__(self):
return self.to_str()
Expand All @@ -95,13 +92,11 @@ def to_json(self, hashed: bool, subtext: bool) -> Dict:

"""
full_output = {
"ml_validation": self.ml_validation.name,
"patterns": [pattern.pattern for pattern in self.patterns],
"ml_probability": self.ml_probability,
"rule": self.rule_name,
"severity": self.severity.value,
"confidence": self.confidence.value,
"use_ml": self.use_ml,
"ml_probability": self.ml_probability,
# put the array to end to make json more readable
"line_data_list": [line_data.to_json(hashed, subtext) for line_data in self.line_data_list],
}
Expand Down
1 change: 0 additions & 1 deletion credsweeper/secret/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,6 @@
"rule",
"severity",
"confidence",
"ml_validation",
"ml_probability",
"line_data_list"
]
Expand Down
8 changes: 8 additions & 0 deletions docs/source/credsweeper.deep_scanner.rst
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,14 @@ credsweeper.deep\_scanner.lang\_scanner module
:undoc-members:
:show-inheritance:

credsweeper.deep\_scanner.mxfile\_scanner module
------------------------------------------------

.. automodule:: credsweeper.deep_scanner.mxfile_scanner
:members:
:undoc-members:
:show-inheritance:

credsweeper.deep\_scanner.pdf\_scanner module
---------------------------------------------

Expand Down
6 changes: 0 additions & 6 deletions docs/source/develop.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,6 @@ Tests

To run all tests:

.. code-block:: bash
python -m pytest -s tests/
To run only tests independent from external api:

.. code-block:: bash
python -m pytest -s tests/
Expand Down
36 changes: 19 additions & 17 deletions docs/source/guide.rst
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ Get output as JSON file:

.. code-block:: bash

python -m credsweeper --ml_validation --path tests/samples/password --save-json output.json
python -m credsweeper --path tests/samples/password.gradle --save-json output.json

To check JSON file run:

Expand All @@ -97,10 +97,10 @@ To check JSON file run:

[
{
"ml_validation": "VALIDATED_KEY",
"ml_probability": 0.99755,
"rule": "Password",
"severity": "medium",
"confidence": "moderate",
"ml_probability": 0.9857242107391357,
"line_data_list": [
{
"line": "password = \"cackle!\"",
Expand All @@ -111,9 +111,11 @@ To check JSON file run:
"value_start": 12,
"value_end": 19,
"variable": "password",
"variable_start": 0,
"variable_end": 8,
"entropy_validation":
{
"iterator": "BASE64_CHARS",
"iterator": "BASE64STDPAD_CHARS",
"entropy": 2.120589933192232,
"valid": false
}
Expand All @@ -126,12 +128,12 @@ Get CLI output only:

.. code-block:: bash

python -m credsweeper --path tests/samples/password
python -m credsweeper --path tests/samples/password.gradle


.. code-block:: ruby
.. code-block:: text

rule: Password / severity: medium / line_data_list: [line : 'password = "cackle!"' / line_num : 1 / path : tests/samples/password / entropy_validation: False] / ml_validation: VALIDATED_KEY
rule: Password | severity: medium | confidence: moderate | ml_probability: 0.9857242107391357 | line_data_list: [line: 'password = "cackle!"' | line_num: 1 | path: tests/samples/password.gradle | value: 'cackle!' | entropy_validation: BASE64STDPAD_CHARS 2.120590 False]


Exclude outputs using CLI:
Expand All @@ -143,7 +145,7 @@ Space-like characters at left and right will be ignored.

.. code-block:: bash

$ python -m credsweeper --path tests/samples/password --denylist list.txt
$ python -m credsweeper --path tests/samples/password.gradle --denylist list.txt
Detected Credentials: 0
Time Elapsed: 0.07523202896118164s
$ cat list.txt
Expand All @@ -169,7 +171,7 @@ Then specify your config in CLI:

.. code-block:: bash

$ python -m credsweeper --path tests/samples/password --config my_cfg.json
$ python -m credsweeper --path tests/samples/password.gradle --config my_cfg.json
Detected Credentials: 0
Time Elapsed: 0.07152628898620605s

Expand All @@ -190,9 +192,9 @@ Minimal example for scanning line list:
for r in results:
print(r)

.. code-block:: bash
.. code-block:: text

rule: Password / severity: medium / line_data_list: [line: 'password='in_line_2'' / line_num: 2 / path: / value: 'in_line_2' / entropy_validation: False] / ml_validation: NOT_AVAILABLE
rule: Password | severity: medium | confidence: moderate | ml_probability: 0.9857242107391357 | line_data_list: [line: 'password = "cackle!"' | line_num: 1 | path: | value: 'cackle!' | entropy_validation: BASE64STDPAD_CHARS 2.120590 False]

Minimal example for scanning bytes:

Expand All @@ -201,16 +203,16 @@ Minimal example for scanning bytes:
from credsweeper import CredSweeper, ByteContentProvider


to_scan = b"line one\npassword='in_line_2'"
to_scan = b"line one\npassword='cackle!'"
cred_sweeper = CredSweeper()
provider = ByteContentProvider(to_scan)
results = cred_sweeper.file_scan(provider)
for r in results:
print(r)

.. code-block:: bash
.. code-block:: text

rule: Password / severity: medium / line_data_list: [line: 'password='in_line_2'' / line_num: 2 / path: / value: 'in_line_2' / entropy_validation: False] / ml_validation: NOT_AVAILABLE
rule: Password | severity: medium | confidence: moderate | ml_probability: 0.9857242107391357 | line_data_list: [line: 'password = "cackle!"' | line_num: 2 | path: | value: 'cackle!' | entropy_validation: BASE64STDPAD_CHARS 2.120590 False]


Minimal example for the ML validation:
Expand All @@ -220,7 +222,7 @@ Minimal example for the ML validation:
from credsweeper import CredSweeper, StringContentProvider, MlValidator, ThresholdPreset


to_scan = ["line one", "secret='fgELsRdFA'", "secret='template'"]
to_scan = ["line one", "password='cackle!'", "secret='template'"]
cred_sweeper = CredSweeper()
provider = StringContentProvider(to_scan)

Expand All @@ -237,9 +239,9 @@ Minimal example for the ML validation:

Note that `"secret='template'"` is not reported due to failing check by the `MlValidator`.

.. code-block:: bash
.. code-block:: text

rule: Secret / severity: medium / line_data_list: [line: 'secret='fgELsRdFA'' / line_num: 2 / path: / value: 'fgELsRdFA' / entropy_validation: False] / ml_validation: NOT_AVAILABLE
rule: Password | severity: medium | confidence: moderate | ml_probability: 0.9857242107391357 | line_data_list: [line: 'password = "cackle!"' | line_num: 2 | path: | value: 'cackle!' | entropy_validation: BASE64STDPAD_CHARS 2.120590 False]

Configurations
--------------
Expand Down
2 changes: 1 addition & 1 deletion docs/source/overall_architecture.rst
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ Each Rule_ is dedicated to detect a specific type of credential, imported from `
- filter_type
- The type of the Filter_ group you want to apply. Filter_ groups implemented are as follows: `GeneralKeyword <credsweeper.filters.group.html#module-credsweeper.filters.group.general_keyword>`_, `GeneralPattern <credsweeper.filters.group.html#module-credsweeper.filters.group.general_pattern>`_, `PasswordKeyword <credsweeper.filters.group.html#module-credsweeper.filters.group.password_keyword>`_, and `UrlCredentials <credsweeper.filters.group.html#module-credsweeper.filters.group.url_credentials_group>`_.
- use_ml
- The attribute to set whether to perform ML validation. If true, ML validation will be performed.
- The attribute to set whether to perform ML validation. If true, ML validation will be performed. If false, ml_probability will be set to "NA" in the report.

Filter
------
Expand Down
Loading
Loading