Escaped pattern discovering (#629)

* softreset * BM fix * BM fix * doc fail * patternfix * test fixed * rollback BM ref * sample fix * ValueNotPartEncodedCheck optimization * rollback debug * style * Apply suggestions from code review Co-authored-by: xDizzix <[email protected]> --------- Co-authored-by: xDizzix <[email protected]>
Samsung · Dec 12, 2024 · 7838888 · 7838888
1 parent 84a5ed8
commit 7838888
Show file tree

Hide file tree

Showing 27 changed files with 721 additions and 355 deletions.
diff --git a/.ci/benchmark.txt b/.ci/benchmark.txt
diff --git a/credsweeper/filters/group/weird_base64_token.py b/credsweeper/filters/group/weird_base64_token.py
@@ -1,7 +1,8 @@
 from credsweeper.common.constants import GroupType
 from credsweeper.config import Config
 from credsweeper.filters import ValueCoupleKeywordCheck, ValueNotPartEncodedCheck, \
-    ValueBase64DataCheck, ValueEntropyBase64Check, ValuePatternCheck, ValueNumberCheck, ValueTokenBase64Check
+    ValueBase64DataCheck, ValueEntropyBase64Check, ValuePatternCheck, ValueNumberCheck, ValueTokenBase64Check, \
+    ValueBase64PartCheck
 from credsweeper.filters.group import Group
 
 
@@ -17,5 +18,6 @@ def __init__(self, config: Config) -> None:
             ValueTokenBase64Check(),
             ValueEntropyBase64Check(),
             ValuePatternCheck(config),
-            ValueNotPartEncodedCheck()
+            ValueNotPartEncodedCheck(),
+            ValueBase64PartCheck(),
         ]
diff --git a/credsweeper/filters/value_atlassian_token_check.py b/credsweeper/filters/value_atlassian_token_check.py
@@ -63,8 +63,8 @@ def check_atlassian_struct(value: str) -> bool:
         # there is limit for big integer value: math.log10(1<<64) = 19.265919722494797
         if 0 < delimiter_pos <= 20:
             val = decoded[:delimiter_pos].decode(LATIN_1)
-            # at least 3 digits in the token
-            if 100 < int(val):
+            # at least 4 digits in the token
+            if 1000 <= int(val):
                 # test for ascii and Shannon entropy - there should be random data
                 data = decoded[delimiter_pos + 1:]
                 return Util.is_ascii_entropy_validate(data)

diff --git a/credsweeper/filters/value_base64_part_check.py b/credsweeper/filters/value_base64_part_check.py
@@ -1,11 +1,13 @@
 import contextlib
+import re
 import statistics
 
 from credsweeper.common.constants import Chars
 from credsweeper.config import Config
 from credsweeper.credentials import LineData
 from credsweeper.file_handler.analysis_target import AnalysisTarget
 from credsweeper.filters import Filter
+from credsweeper.filters.value_entropy_base64_check import ValueEntropyBase64Check
 from credsweeper.utils import Util
 
 
@@ -14,6 +16,9 @@ class ValueBase64PartCheck(Filter):
     Check that candidate is NOT a part of base64 long line
     """
 
+    base64_pattern = re.compile(r"^(\\{1,8}[0abfnrtv]|[0-9A-Za-z+/=]){1,4000}")
+    base64_set = set(Chars.BASE64_CHARS.value)
+
     def __init__(self, config: Config = None) -> None:
         pass
 
@@ -30,26 +35,66 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
         """
 
         with contextlib.suppress(Exception):
-            if line_data.value_start and line_data.line[line_data.value_start - 1] in ('/', '+'):
-                if '-' in line_data.value or '_' in line_data.value:
-                    # the value contains url-safe chars, so '/' is a delimiter
+            line = line_data.line
+            len_line = len(line)
+            value = line_data.value
+            len_value = len(value)
+            if 0 == line_data.value_start and len_line >= 2 * len_value \
+                    or 0 < line_data.value_start and line[line_data.value_start - 1] in ('/', '+', '\\', '%') \
+                    or 0 < line_data.value_end < len_line and line[line_data.value_end] in ('/', '+', '\\', '%'):
+
+                if '-' in value or '_' in value:
+                    # the value contains url-safe chars, so '/' or '+' is a delimiter
                     return False
-                value_entropy = Util.get_shannon_entropy(line_data.value, Chars.BASE64STD_CHARS.value)
-                left_start = line_data.value_start - len(line_data.value)
+
+                left_start = line_data.value_start - len_value
                 if 0 > left_start:
                     left_start = 0
-                left_entropy = Util.get_shannon_entropy(line_data.line[left_start:line_data.value_start],
-                                                        Chars.BASE64STD_CHARS.value)
-                right_end = line_data.value_end + len(line_data.value)
-                if len(line_data.line) < right_end:
-                    right_end = len(line_data.line)
-                right_entropy = Util.get_shannon_entropy(line_data.line[line_data.value_end:right_end],
-                                                         Chars.BASE64STD_CHARS.value)
-                data = [value_entropy, left_entropy, right_entropy]
+                right_end = line_data.value_end + len_value
+                if len_line < right_end:
+                    right_end = len_line
+
+                hunk_size = right_end - left_start
+
+                if hunk_size == 3 * len_value:
+                    # simple analysis for maximal data size
+                    if self.base64_pattern.match(line[left_start:right_end]):
+                        # obvious case: all characters are base64 standard
+                        return True
+                elif right_end - left_start >= 2 * len_value:
+                    # simple analysis for data too large to yield sensible insights
+                    part_set = set(line[left_start:right_end])
+                    if not part_set.difference(self.base64_set):
+                        # obvious case: all characters are base64 standard
+                        return True
+
+                left_part = line[left_start:line_data.value_start]
+                len_left = len(left_part)
+                right_part = line[line_data.value_end:right_end]
+                len_right = len(right_part)
+
+                min_entropy_value = ValueEntropyBase64Check.get_min_data_entropy(len_value)
+                value_entropy = Util.get_shannon_entropy(value, Chars.BASE64STD_CHARS.value)
+
+                if ValueEntropyBase64Check.min_length < len_left:
+                    left_entropy = Util.get_shannon_entropy(left_part, Chars.BASE64STD_CHARS.value)
+                    if len_left < len_value:
+                        left_entropy *= len_value / len_left
+                else:
+                    left_entropy = min_entropy_value
+
+                if ValueEntropyBase64Check.min_length < len_right:
+                    right_entropy = Util.get_shannon_entropy(right_part, Chars.BASE64STD_CHARS.value)
+                    if len_right < len_value:
+                        left_entropy *= len_right / len_left
+                else:
+                    right_entropy = min_entropy_value
+
+                data = [left_entropy, value_entropy, right_entropy, min_entropy_value]
                 avg = statistics.mean(data)
                 stdev = statistics.stdev(data, avg)
                 avg_min = avg - 1.1 * stdev
-                if avg_min < left_entropy and avg_min < right_entropy:
+                if avg_min <= left_entropy and avg_min <= right_entropy:
                     # high entropy of bound parts looks like a part of base64 long line
                     return True
 

diff --git a/credsweeper/filters/value_entropy_base64_check.py b/credsweeper/filters/value_entropy_base64_check.py
@@ -11,6 +11,9 @@
 class ValueEntropyBase64Check(Filter):
     """Check that candidate have Shanon Entropy > 3 (for HEX_CHARS or BASE36_CHARS) or > 4.5 (for BASE64_CHARS)."""
 
+    # If the value size is less than this value the entropy evaluation gives an imprecise result
+    min_length = 12
+
     def __init__(self, config: Config = None) -> None:
         pass
 
@@ -25,7 +28,7 @@ def get_min_data_entropy(x: int) -> float:
             y = 4.1
         elif 32 == x:
             y = 4.4
-        elif 12 <= x < 35:
+        elif ValueEntropyBase64Check.min_length <= x < 35:
             # logarithm base 2 - slow, but precise. Approximation does not exceed stdev
             y = 0.77 * math.log2(x) + 0.62
         elif 35 <= x < 60:

diff --git a/credsweeper/filters/value_not_part_encoded_check.py b/credsweeper/filters/value_not_part_encoded_check.py
@@ -1,6 +1,7 @@
 import re
 from typing import Optional
 
+from credsweeper.common import static_keyword_checklist
 from credsweeper.config import Config
 from credsweeper.credentials import LineData
 from credsweeper.file_handler.analysis_target import AnalysisTarget
@@ -29,15 +30,14 @@ def check_line_target_fit(line_data: LineData, target: AnalysisTarget) -> bool:
 
     @staticmethod
     def check_val(line: str, pattern: re.Pattern) -> Optional[bool]:
-        """Verifies whether the line looks like a pattern"""
-        match_obj = pattern.match(line)
-        if match_obj:
+        """Verifies whether the line looks like a base64 pattern"""
+        if match_obj := pattern.match(line):
             val = match_obj.group("val")
             # not a path-like
-            if not val.startswith('/'):
-                return True
-            # padding sign
-            if '=' == val[-1]:
+            if not val.startswith('/') \
+                    or not static_keyword_checklist.check_morphemes(val.lower(), 2) \
+                    or '=' == val[-1]:
+                # padding char is a marker too
                 return True
         return None
 

diff --git a/credsweeper/filters/value_token_base64_check.py b/credsweeper/filters/value_token_base64_check.py
@@ -26,7 +26,7 @@ def get_min_strength(x: int) -> float:
         elif x < 40:
             y = ((0.0000405 * x - 0.004117) * x + 0.141) * x - 0.65
         else:
-            y = 1
+            y = 0.9999
         return y
 
     def run(self, line_data: LineData, target: AnalysisTarget) -> bool: