Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove ML_VALIDATION #658

Merged
merged 9 commits into from
Jan 16, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions credsweeper/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
# Directory of credsweeper sources MUST be placed before imports to avoid circular import error
APP_PATH = Path(__file__).resolve().parent

from credsweeper.common.constants import KeyValidationOption, Severity, ThresholdPreset, DiffRowType
from credsweeper.common.constants import Severity, ThresholdPreset, DiffRowType
from credsweeper.config import Config
from credsweeper.credentials import Candidate, CredentialManager, CandidateKey
from credsweeper.deep_scanner.deep_scanner import DeepScanner
Expand Down Expand Up @@ -368,11 +368,9 @@ def post_processing(self) -> None:
for candidate in group_candidates:
if candidate.use_ml:
if is_cred[i]:
candidate.ml_validation = KeyValidationOption.VALIDATED_KEY
candidate.ml_probability = probability[i]
new_cred_list.append(candidate)
else:
candidate.ml_validation = KeyValidationOption.NOT_AVAILABLE
new_cred_list.append(candidate)
else:
logger.info("Skipping ML validation due not applicable")
Expand Down Expand Up @@ -435,7 +433,8 @@ def export_results(self, change_type: Optional[DiffRowType] = None) -> None:
for line_data in credential.line_data_list:
# bright rule name and path or info
print(Style.BRIGHT + credential.rule_name +
f" {line_data.info or line_data.path}:{line_data.line_num}" + Style.RESET_ALL)
f" {line_data.info or line_data.path}:{line_data.line_num} {credential.ml_probability}" +
Style.RESET_ALL)
print(line_data.get_colored_line(hashed=self.hashed, subtext=self.subtext))

if is_exported is False:
Expand Down
8 changes: 0 additions & 8 deletions credsweeper/common/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,14 +97,6 @@ class Chars(Enum):
ENTROPY_LIMIT_BASE3x = 3


class KeyValidationOption(Enum):
"""API validation state"""
INVALID_KEY = 0
VALIDATED_KEY = 1
UNDECIDED = 2
NOT_AVAILABLE = 3


class GroupType(Enum):
"""Group type - used in Group constructor for load predefined set of filters"""
KEYWORD = "keyword"
Expand Down
16 changes: 7 additions & 9 deletions credsweeper/credentials/candidate.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from json.encoder import py_encode_basestring_ascii
from typing import Any, Dict, List, Optional

from credsweeper.common.constants import KeyValidationOption, Severity, Confidence
from credsweeper.common.constants import Severity, Confidence
from credsweeper.config import Config
from credsweeper.credentials.line_data import LineData

Expand All @@ -20,8 +20,7 @@ class Candidate:
severity: critical/high/medium/low
confidence: strong/moderate/weak
config: user configs
validations: List of Validation objects that can check this credential using external API
use_ml: Should ML work on this credential or not. If not prediction based on regular expression and filter only
use_ml: Whether the candidate should be validated with ML. If not, ml_probability stays None
"""

def __init__(self,
Expand All @@ -39,7 +38,8 @@ def __init__(self,
self.config = config
self.use_ml = use_ml
self.confidence = confidence
self.ml_validation = KeyValidationOption.NOT_AVAILABLE
# None - ML is not applicable or not processed yet; float - the ML decision above ml_threshold
# Note: -1.0 is a possible output of some activation functions in the ML model, so negative values are avoided as a marker
self.ml_probability: Optional[float] = None

def compare(self, other: 'Candidate') -> bool:
Expand All @@ -48,7 +48,6 @@ def compare(self, other: 'Candidate') -> bool:
and self.severity == other.severity \
and self.confidence == other.confidence \
and self.use_ml == other.use_ml \
and self.ml_validation == other.ml_validation \
and self.ml_probability == other.ml_probability \
and len(self.line_data_list) == len(other.line_data_list):
for i, j in zip(self.line_data_list, other.line_data_list):
Expand Down Expand Up @@ -78,8 +77,8 @@ def to_str(self, subtext: bool = False, hashed: bool = False) -> str:
return f"rule: {self.rule_name}" \
f" | severity: {self.severity.value}" \
f" | confidence: {self.confidence.value}" \
f" | line_data_list: [{', '.join([x.to_str(subtext, hashed) for x in self.line_data_list])}]" \
f" | ml_validation: {self.ml_validation.name}"
f" | ml_probability: {self.ml_probability}" \
f" | line_data_list: [{', '.join([x.to_str(subtext, hashed) for x in self.line_data_list])}]"

def __str__(self):
return self.to_str()
Expand All @@ -95,13 +94,12 @@ def to_json(self, hashed: bool, subtext: bool) -> Dict:

"""
full_output = {
"ml_validation": self.ml_validation.name,
"patterns": [pattern.pattern for pattern in self.patterns],
"ml_probability": self.ml_probability,
"rule": self.rule_name,
"severity": self.severity.value,
"confidence": self.confidence.value,
"use_ml": self.use_ml,
"ml_probability": self.ml_probability,
# put the array to end to make json more readable
"line_data_list": [line_data.to_json(hashed, subtext) for line_data in self.line_data_list],
}
Expand Down
1 change: 0 additions & 1 deletion credsweeper/secret/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,6 @@
"rule",
"severity",
"confidence",
"ml_validation",
"ml_probability",
"line_data_list"
]
Expand Down
8 changes: 8 additions & 0 deletions docs/source/credsweeper.deep_scanner.rst
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,14 @@ credsweeper.deep\_scanner.lang\_scanner module
:undoc-members:
:show-inheritance:

credsweeper.deep\_scanner.mxfile\_scanner module
------------------------------------------------

.. automodule:: credsweeper.deep_scanner.mxfile_scanner
:members:
:undoc-members:
:show-inheritance:

credsweeper.deep\_scanner.pdf\_scanner module
---------------------------------------------

Expand Down
6 changes: 0 additions & 6 deletions docs/source/develop.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,6 @@ Tests

To run all tests:

.. code-block:: bash

python -m pytest -s tests/

To run only tests independent from external api:

.. code-block:: bash

python -m pytest -s tests/
Expand Down
36 changes: 19 additions & 17 deletions docs/source/guide.rst
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ Get output as JSON file:

.. code-block:: bash

python -m credsweeper --ml_validation --path tests/samples/password --save-json output.json
python -m credsweeper --path tests/samples/password.gradle --save-json output.json

To check JSON file run:

Expand All @@ -97,10 +97,10 @@ To check JSON file run:

[
{
"ml_validation": "VALIDATED_KEY",
"ml_probability": 0.99755,
"rule": "Password",
"severity": "medium",
"confidence": "moderate",
"ml_probability": 0.9857242107391357,
"line_data_list": [
{
"line": "password = \"cackle!\"",
Expand All @@ -111,9 +111,11 @@ To check JSON file run:
"value_start": 12,
"value_end": 19,
"variable": "password",
"variable_start": 0,
"variable_end": 8,
"entropy_validation":
{
"iterator": "BASE64_CHARS",
"iterator": "BASE64STDPAD_CHARS",
"entropy": 2.120589933192232,
"valid": false
}
Expand All @@ -126,12 +128,12 @@ Get CLI output only:

.. code-block:: bash

python -m credsweeper --path tests/samples/password
python -m credsweeper --path tests/samples/password.gradle


.. code-block:: ruby
.. code-block:: text

rule: Password / severity: medium / line_data_list: [line : 'password = "cackle!"' / line_num : 1 / path : tests/samples/password / entropy_validation: False] / ml_validation: VALIDATED_KEY
rule: Password | severity: medium | confidence: moderate | ml_probability: 0.9857242107391357 | line_data_list: [line: 'password = "cackle!"' | line_num: 1 | path: tests/samples/password.gradle | value: 'cackle!' | entropy_validation: BASE64STDPAD_CHARS 2.120590 False]


Exclude outputs using CLI:
Expand All @@ -143,7 +145,7 @@ Space-like characters at left and right will be ignored.

.. code-block:: bash

$ python -m credsweeper --path tests/samples/password --denylist list.txt
$ python -m credsweeper --path tests/samples/password.gradle --denylist list.txt
Detected Credentials: 0
Time Elapsed: 0.07523202896118164s
$ cat list.txt
Expand All @@ -169,7 +171,7 @@ Then specify your config in CLI:

.. code-block:: bash

$ python -m credsweeper --path tests/samples/password --config my_cfg.json
$ python -m credsweeper --path tests/samples/password.gradle --config my_cfg.json
Detected Credentials: 0
Time Elapsed: 0.07152628898620605s

Expand All @@ -190,9 +192,9 @@ Minimal example for scanning line list:
for r in results:
print(r)

.. code-block:: bash
.. code-block:: text

rule: Password / severity: medium / line_data_list: [line: 'password='in_line_2'' / line_num: 2 / path: / value: 'in_line_2' / entropy_validation: False] / ml_validation: NOT_AVAILABLE
rule: Password | severity: medium | confidence: moderate | ml_probability: 0.9857242107391357 | line_data_list: [line: 'password = "cackle!"' | line_num: 1 | path: | value: 'cackle!' | entropy_validation: BASE64STDPAD_CHARS 2.120590 False]

Minimal example for scanning bytes:

Expand All @@ -201,16 +203,16 @@ Minimal example for scanning bytes:
from credsweeper import CredSweeper, ByteContentProvider


to_scan = b"line one\npassword='in_line_2'"
to_scan = b"line one\npassword='cackle!'"
cred_sweeper = CredSweeper()
provider = ByteContentProvider(to_scan)
results = cred_sweeper.file_scan(provider)
for r in results:
print(r)

.. code-block:: bash
.. code-block:: text

rule: Password / severity: medium / line_data_list: [line: 'password='in_line_2'' / line_num: 2 / path: / value: 'in_line_2' / entropy_validation: False] / ml_validation: NOT_AVAILABLE
rule: Password | severity: medium | confidence: moderate | ml_probability: 0.9857242107391357 | line_data_list: [line: 'password='cackle!'' | line_num: 2 | path: | value: 'cackle!' | entropy_validation: BASE64STDPAD_CHARS 2.120590 False]


Minimal example for the ML validation:
Expand All @@ -220,7 +222,7 @@ Minimal example for the ML validation:
from credsweeper import CredSweeper, StringContentProvider, MlValidator, ThresholdPreset


to_scan = ["line one", "secret='fgELsRdFA'", "secret='template'"]
to_scan = ["line one", "password='cackle!'", "secret='template'"]
cred_sweeper = CredSweeper()
provider = StringContentProvider(to_scan)

Expand All @@ -237,9 +239,9 @@ Minimal example for the ML validation:

Note that `"secret='template'"` is not reported due to failing check by the `MlValidator`.

.. code-block:: bash
.. code-block:: text

rule: Secret / severity: medium / line_data_list: [line: 'secret='fgELsRdFA'' / line_num: 2 / path: / value: 'fgELsRdFA' / entropy_validation: False] / ml_validation: NOT_AVAILABLE
rule: Password | severity: medium | confidence: moderate | ml_probability: 0.9857242107391357 | line_data_list: [line: 'password='cackle!'' | line_num: 2 | path: | value: 'cackle!' | entropy_validation: BASE64STDPAD_CHARS 2.120590 False]

Configurations
--------------
Expand Down
2 changes: 1 addition & 1 deletion docs/source/overall_architecture.rst
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ Each Rule_ is dedicated to detect a specific type of credential, imported from `
- filter_type
- The type of the Filter_ group you want to apply. Filter_ groups implemented are as follows: `GeneralKeyword <credsweeper.filters.group.html#module-credsweeper.filters.group.general_keyword>`_, `GeneralPattern <credsweeper.filters.group.html#module-credsweeper.filters.group.general_pattern>`_, `PasswordKeyword <credsweeper.filters.group.html#module-credsweeper.filters.group.password_keyword>`_, and `UrlCredentials <credsweeper.filters.group.html#module-credsweeper.filters.group.url_credentials_group>`_.
- use_ml
- The attribute to set whether to perform ML validation. If true, ML validation will be performed.
- The attribute to set whether to perform ML validation. If true, ML validation will be performed. If false, ml_probability will be set to None in the report.

Filter
------
Expand Down
Loading
Loading