Samsung · babenek · Jan 15, 2025 · Jan 15, 2025 · Jan 15, 2025 · Jan 15, 2025
@@ -10,7 +10,7 @@
 # Directory of credsweeper sources MUST be placed before imports to avoid circular import error
 APP_PATH = Path(__file__).resolve().parent
 
-from credsweeper.common.constants import KeyValidationOption, Severity, ThresholdPreset, DiffRowType
+from credsweeper.common.constants import Severity, ThresholdPreset, DiffRowType
 from credsweeper.config import Config
 from credsweeper.credentials import Candidate, CredentialManager, CandidateKey
 from credsweeper.deep_scanner.deep_scanner import DeepScanner
@@ -167,7 +167,8 @@ def _use_ml_validation(self) -> bool:
             logger.info("Skip ML validation because no candidates were found")
             return False
         for i in self.credential_manager.candidates:
-            if i.use_ml:
+            # None value means ml should be processed for the candidate
+            if i.ml_probability is None:
                 # any() or all() is not used to speedup
                 return True
         logger.info("Skip ML validation because no candidates support it")
@@ -353,7 +354,7 @@ def post_processing(self) -> None:
             for group_key, group_candidates in cred_groups.items():
                 # Analyze with ML if any candidate in group require ML
                 for candidate in group_candidates:
-                    if candidate.use_ml:
+                    if candidate.ml_probability is None:
                         ml_cred_groups.append((group_key, group_candidates))
                         break
                 else:
@@ -366,13 +367,11 @@ def post_processing(self) -> None:
                 is_cred, probability = self.ml_validator.validate_groups(ml_cred_groups, self.ml_batch_size)
                 for i, (_, group_candidates) in enumerate(ml_cred_groups):
                     for candidate in group_candidates:
-                        if candidate.use_ml:
+                        if candidate.ml_probability is None:
                             if is_cred[i]:
-                                candidate.ml_validation = KeyValidationOption.VALIDATED_KEY
                                 candidate.ml_probability = probability[i]
                                 new_cred_list.append(candidate)
                         else:
-                            candidate.ml_validation = KeyValidationOption.NOT_AVAILABLE
                             new_cred_list.append(candidate)
             else:
                 logger.info("Skipping ML validation due not applicable")
@@ -435,7 +434,8 @@ def export_results(self, change_type: Optional[DiffRowType] = None) -> None:
                 for line_data in credential.line_data_list:
                     # bright rule name and path or info
                     print(Style.BRIGHT + credential.rule_name +
-                          f" {line_data.info or line_data.path}:{line_data.line_num}" + Style.RESET_ALL)
+                          f" {line_data.info or line_data.path}:{line_data.line_num} {credential.ml_probability}" +
+                          Style.RESET_ALL)
                     print(line_data.get_colored_line(hashed=self.hashed, subtext=self.subtext))
 
         if is_exported is False:

@@ -97,14 +97,6 @@ class Chars(Enum):
 ENTROPY_LIMIT_BASE3x = 3
 
 
-class KeyValidationOption(Enum):
-    """API validation state"""
-    INVALID_KEY = 0
-    VALIDATED_KEY = 1
-    UNDECIDED = 2
-    NOT_AVAILABLE = 3
-
-
 class GroupType(Enum):
     """Group type - used in Group constructor for load predefined set of filters"""
     KEYWORD = "keyword"

@@ -1,9 +1,9 @@
 import copy
 import re
 from json.encoder import py_encode_basestring_ascii
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Union
 
-from credsweeper.common.constants import KeyValidationOption, Severity, Confidence
+from credsweeper.common.constants import Severity, Confidence
 from credsweeper.config import Config
 from credsweeper.credentials.line_data import LineData
 
@@ -20,8 +20,7 @@ class Candidate:
         severity: critical/high/medium/low
         confidence: strong/moderate/weak
         config: user configs
-        validations: List of Validation objects that can check this credential using external API
-        use_ml: Should ML work on this credential or not. If not prediction based on regular expression and filter only
+        use_ml: Whether the candidate should be validated with ML. If not, ml_probability is set to "NA"
     """
 
     def __init__(self,
@@ -37,18 +36,16 @@ def __init__(self,
         self.rule_name = rule_name
         self.severity = severity
         self.config = config
-        self.use_ml = use_ml
+        # None - ML is applicable but not processed yet; "NA" - ML is not applicable; float - the ml decision
+        # Note: some ML model activation functions can output -1.0, so a negative sentinel value is avoided
+        self.ml_probability: Union[None, str, float] = None if use_ml else "NA"
         self.confidence = confidence
-        self.ml_validation = KeyValidationOption.NOT_AVAILABLE
-        self.ml_probability: Optional[float] = None
 
     def compare(self, other: 'Candidate') -> bool:
         """Comparison method - checks only result of final cred"""
         if self.rule_name == other.rule_name \
                 and self.severity == other.severity \
                 and self.confidence == other.confidence \
-                and self.use_ml == other.use_ml \
-                and self.ml_validation == other.ml_validation \
                 and self.ml_probability == other.ml_probability \
                 and len(self.line_data_list) == len(other.line_data_list):
             for i, j in zip(self.line_data_list, other.line_data_list):
@@ -78,8 +75,8 @@ def to_str(self, subtext: bool = False, hashed: bool = False) -> str:
         return f"rule: {self.rule_name}" \
                f" | severity: {self.severity.value}" \
                f" | confidence: {self.confidence.value}" \
-               f" | line_data_list: [{', '.join([x.to_str(subtext, hashed) for x in self.line_data_list])}]" \
-               f" | ml_validation: {self.ml_validation.name}"
+               f" | ml_probability: {self.ml_probability}" \
+               f" | line_data_list: [{', '.join([x.to_str(subtext, hashed) for x in self.line_data_list])}]"
 
     def __str__(self):
         return self.to_str()
@@ -95,13 +92,11 @@ def to_json(self, hashed: bool, subtext: bool) -> Dict:
 
         """
         full_output = {
-            "ml_validation": self.ml_validation.name,
             "patterns": [pattern.pattern for pattern in self.patterns],
-            "ml_probability": self.ml_probability,
             "rule": self.rule_name,
             "severity": self.severity.value,
             "confidence": self.confidence.value,
-            "use_ml": self.use_ml,
+            "ml_probability": self.ml_probability,
             # put the array to end to make json more readable
             "line_data_list": [line_data.to_json(hashed, subtext) for line_data in self.line_data_list],
         }

@@ -164,7 +164,6 @@
         "rule",
         "severity",
         "confidence",
-        "ml_validation",
         "ml_probability",
         "line_data_list"
     ]

@@ -92,6 +92,14 @@ credsweeper.deep\_scanner.lang\_scanner module
    :undoc-members:
    :show-inheritance:
 
+credsweeper.deep\_scanner.mxfile\_scanner module
+------------------------------------------------
+
+.. automodule:: credsweeper.deep_scanner.mxfile_scanner
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 credsweeper.deep\_scanner.pdf\_scanner module
 ---------------------------------------------
 

@@ -6,12 +6,6 @@ Tests
 
 To run all tests:
 
-.. code-block:: bash
-
-    python -m pytest -s tests/
-
-To run only tests independent from external api:
-
 .. code-block:: bash
 
     python -m pytest -s tests/

@@ -84,7 +84,7 @@ Get output as JSON file:
 
 .. code-block:: bash
 
-    python -m credsweeper --ml_validation --path tests/samples/password --save-json output.json
+    python -m credsweeper --path tests/samples/password.gradle --save-json output.json
 
 To check JSON file run:
 
@@ -97,10 +97,10 @@ To check JSON file run:
 
     [
         {
-            "ml_validation": "VALIDATED_KEY",
-            "ml_probability": 0.99755,
             "rule": "Password",
             "severity": "medium",
+            "confidence": "moderate",
+            "ml_probability": 0.9857242107391357,
             "line_data_list": [
                 {
                     "line": "password = \"cackle!\"",
@@ -111,9 +111,11 @@ To check JSON file run:
                     "value_start": 12,
                     "value_end": 19,
                     "variable": "password",
+                    "variable_start": 0,
+                    "variable_end": 8,
                     "entropy_validation":
                     {
-                        "iterator": "BASE64_CHARS",
+                        "iterator": "BASE64STDPAD_CHARS",
                         "entropy": 2.120589933192232,
                         "valid": false
                     }
@@ -126,12 +128,12 @@ Get CLI output only:
 
 .. code-block:: bash
 
-    python -m credsweeper --path tests/samples/password
+    python -m credsweeper --path tests/samples/password.gradle
 
 
-.. code-block:: ruby
+.. code-block:: text
 
-    rule: Password / severity: medium / line_data_list: [line : 'password = "cackle!"' / line_num : 1 / path : tests/samples/password / entropy_validation: False] / ml_validation: VALIDATED_KEY
+    rule: Password | severity: medium | confidence: moderate | ml_probability: 0.9857242107391357 | line_data_list: [line: 'password = "cackle!"' | line_num: 1 | path: tests/samples/password.gradle | value: 'cackle!' | entropy_validation: BASE64STDPAD_CHARS 2.120590 False]
 
 
 Exclude outputs using CLI:
@@ -143,7 +145,7 @@ Space-like characters at left and right will be ignored.
 
 .. code-block:: bash
 
-    $ python -m credsweeper --path tests/samples/password --denylist list.txt
+    $ python -m credsweeper --path tests/samples/password.gradle --denylist list.txt
     Detected Credentials: 0
     Time Elapsed: 0.07523202896118164s
     $ cat list.txt
@@ -169,7 +171,7 @@ Then specify your config in CLI:
 
 .. code-block:: bash
 
-    $ python -m credsweeper --path tests/samples/password --config my_cfg.json
+    $ python -m credsweeper --path tests/samples/password.gradle --config my_cfg.json
     Detected Credentials: 0
     Time Elapsed: 0.07152628898620605s
 
@@ -190,9 +192,9 @@ Minimal example for scanning line list:
     for r in results:
         print(r)
 
-.. code-block:: bash
+.. code-block:: text
 
-    rule: Password / severity: medium / line_data_list: [line: 'password='in_line_2'' / line_num: 2 / path:  / value: 'in_line_2' / entropy_validation: False] / ml_validation: NOT_AVAILABLE
+    rule: Password | severity: medium | confidence: moderate | ml_probability: 0.9857242107391357 | line_data_list: [line: 'password = "cackle!"' | line_num: 1 | path:  | value: 'cackle!' | entropy_validation: BASE64STDPAD_CHARS 2.120590 False]
 
 Minimal example for scanning bytes:
 
@@ -201,16 +203,16 @@ Minimal example for scanning bytes:
     from credsweeper import CredSweeper, ByteContentProvider
 
 
-    to_scan = b"line one\npassword='in_line_2'"
+    to_scan = b"line one\npassword='cackle!'"
     cred_sweeper = CredSweeper()
     provider = ByteContentProvider(to_scan)
     results = cred_sweeper.file_scan(provider)
     for r in results:
         print(r)
 
-.. code-block:: bash
+.. code-block:: text
 
-    rule: Password / severity: medium / line_data_list: [line: 'password='in_line_2'' / line_num: 2 / path:  / value: 'in_line_2' / entropy_validation: False] / ml_validation: NOT_AVAILABLE
+    rule: Password | severity: medium | confidence: moderate | ml_probability: 0.9857242107391357 | line_data_list: [line: 'password='cackle!'' | line_num: 2 | path:  | value: 'cackle!' | entropy_validation: BASE64STDPAD_CHARS 2.120590 False]
 
 
 Minimal example for the ML validation:
@@ -220,7 +222,7 @@ Minimal example for the ML validation:
     from credsweeper import CredSweeper, StringContentProvider, MlValidator, ThresholdPreset
 
 
-    to_scan = ["line one", "secret='fgELsRdFA'", "secret='template'"]
+    to_scan = ["line one", "password='cackle!'", "secret='template'"]
     cred_sweeper = CredSweeper()
     provider = StringContentProvider(to_scan)
 
@@ -237,9 +239,9 @@ Minimal example for the ML validation:
 
 Note that `"secret='template'"` is not reported due to failing check by the `MlValidator`.
 
-.. code-block:: bash
+.. code-block:: text
 
-    rule: Secret / severity: medium / line_data_list: [line: 'secret='fgELsRdFA'' / line_num: 2 / path:  / value: 'fgELsRdFA' / entropy_validation: False] / ml_validation: NOT_AVAILABLE
+    rule: Password | severity: medium | confidence: moderate | ml_probability: 0.9857242107391357 | line_data_list: [line: 'password='cackle!'' | line_num: 2 | path:  | value: 'cackle!' | entropy_validation: BASE64STDPAD_CHARS 2.120590 False]
 
 Configurations
 --------------

@@ -139,7 +139,7 @@ Each Rule_ is dedicated to detect a specific type of credential, imported from `
 - filter_type
    - The type of the Filter_ group you want to apply. Filter_ groups implemented are as follows: `GeneralKeyword <credsweeper.filters.group.html#module-credsweeper.filters.group.general_keyword>`_, `GeneralPattern <credsweeper.filters.group.html#module-credsweeper.filters.group.general_pattern>`_, `PasswordKeyword <credsweeper.filters.group.html#module-credsweeper.filters.group.password_keyword>`_, and `UrlCredentials <credsweeper.filters.group.html#module-credsweeper.filters.group.url_credentials_group>`_.
 - use_ml
-   - The attribute to set whether to perform ML validation. If true, ML validation will be performed.
+   - The attribute to set whether to perform ML validation. If true, ML validation will be performed. If false, ml_probability will be set to "NA" in the report.
 
 Filter
 ------