Skip to content

Commit

Permalink
Escaped pattern discovering (#629)
Browse files Browse the repository at this point in the history
* softreset

* BM fix

* BM fix

* doc fail

* patternfix

* test fixed

* rollback BM ref

* sample fix

* ValueNotPartEncodedCheck optimization

* rollback debug

* style

* Apply suggestions from code review

Co-authored-by: xDizzix <[email protected]>

---------

Co-authored-by: xDizzix <[email protected]>
  • Loading branch information
babenek and xDizzix authored Dec 12, 2024
1 parent 84a5ed8 commit 7838888
Show file tree
Hide file tree
Showing 27 changed files with 721 additions and 355 deletions.
144 changes: 72 additions & 72 deletions .ci/benchmark.txt

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions credsweeper/filters/group/weird_base64_token.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from credsweeper.common.constants import GroupType
from credsweeper.config import Config
from credsweeper.filters import ValueCoupleKeywordCheck, ValueNotPartEncodedCheck, \
ValueBase64DataCheck, ValueEntropyBase64Check, ValuePatternCheck, ValueNumberCheck, ValueTokenBase64Check
ValueBase64DataCheck, ValueEntropyBase64Check, ValuePatternCheck, ValueNumberCheck, ValueTokenBase64Check, \
ValueBase64PartCheck
from credsweeper.filters.group import Group


Expand All @@ -17,5 +18,6 @@ def __init__(self, config: Config) -> None:
ValueTokenBase64Check(),
ValueEntropyBase64Check(),
ValuePatternCheck(config),
ValueNotPartEncodedCheck()
ValueNotPartEncodedCheck(),
ValueBase64PartCheck(),
]
4 changes: 2 additions & 2 deletions credsweeper/filters/value_atlassian_token_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,8 @@ def check_atlassian_struct(value: str) -> bool:
# there is limit for big integer value: math.log10(1<<64) = 19.265919722494797
if 0 < delimiter_pos <= 20:
val = decoded[:delimiter_pos].decode(LATIN_1)
# at least 3 digits in the token
if 100 < int(val):
# at least 4 digits in the token
if 1000 <= int(val):
# test for ascii and Shannon entropy - there should be random data
data = decoded[delimiter_pos + 1:]
return Util.is_ascii_entropy_validate(data)
Expand Down
73 changes: 59 additions & 14 deletions credsweeper/filters/value_base64_part_check.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import contextlib
import re
import statistics

from credsweeper.common.constants import Chars
from credsweeper.config import Config
from credsweeper.credentials import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters import Filter
from credsweeper.filters.value_entropy_base64_check import ValueEntropyBase64Check
from credsweeper.utils import Util


Expand All @@ -14,6 +16,9 @@ class ValueBase64PartCheck(Filter):
Check that candidate is NOT a part of base64 long line
"""

base64_pattern = re.compile(r"^(\\{1,8}[0abfnrtv]|[0-9A-Za-z+/=]){1,4000}")
base64_set = set(Chars.BASE64_CHARS.value)

def __init__(self, config: Config = None) -> None:
pass

Expand All @@ -30,26 +35,66 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
"""

with contextlib.suppress(Exception):
if line_data.value_start and line_data.line[line_data.value_start - 1] in ('/', '+'):
if '-' in line_data.value or '_' in line_data.value:
# the value contains url-safe chars, so '/' is a delimiter
line = line_data.line
len_line = len(line)
value = line_data.value
len_value = len(value)
if 0 == line_data.value_start and len_line >= 2 * len_value \
or 0 < line_data.value_start and line[line_data.value_start - 1] in ('/', '+', '\\', '%') \
or 0 < line_data.value_end < len_line and line[line_data.value_end] in ('/', '+', '\\', '%'):

if '-' in value or '_' in value:
# the value contains url-safe chars, so '/' or '+' is a delimiter
return False
value_entropy = Util.get_shannon_entropy(line_data.value, Chars.BASE64STD_CHARS.value)
left_start = line_data.value_start - len(line_data.value)

left_start = line_data.value_start - len_value
if 0 > left_start:
left_start = 0
left_entropy = Util.get_shannon_entropy(line_data.line[left_start:line_data.value_start],
Chars.BASE64STD_CHARS.value)
right_end = line_data.value_end + len(line_data.value)
if len(line_data.line) < right_end:
right_end = len(line_data.line)
right_entropy = Util.get_shannon_entropy(line_data.line[line_data.value_end:right_end],
Chars.BASE64STD_CHARS.value)
data = [value_entropy, left_entropy, right_entropy]
right_end = line_data.value_end + len_value
if len_line < right_end:
right_end = len_line

hunk_size = right_end - left_start

if hunk_size == 3 * len_value:
# simple analysis for maximal data size
if self.base64_pattern.match(line[left_start:right_end]):
# obvious case: all characters are base64 standard
return True
elif right_end - left_start >= 2 * len_value:
# simple analysis for data too large to yield sensible insights
part_set = set(line[left_start:right_end])
if not part_set.difference(self.base64_set):
# obvious case: all characters are base64 standard
return True

left_part = line[left_start:line_data.value_start]
len_left = len(left_part)
right_part = line[line_data.value_end:right_end]
len_right = len(right_part)

min_entropy_value = ValueEntropyBase64Check.get_min_data_entropy(len_value)
value_entropy = Util.get_shannon_entropy(value, Chars.BASE64STD_CHARS.value)

if ValueEntropyBase64Check.min_length < len_left:
left_entropy = Util.get_shannon_entropy(left_part, Chars.BASE64STD_CHARS.value)
if len_left < len_value:
left_entropy *= len_value / len_left
else:
left_entropy = min_entropy_value

if ValueEntropyBase64Check.min_length < len_right:
right_entropy = Util.get_shannon_entropy(right_part, Chars.BASE64STD_CHARS.value)
if len_right < len_value:
left_entropy *= len_right / len_left
else:
right_entropy = min_entropy_value

data = [left_entropy, value_entropy, right_entropy, min_entropy_value]
avg = statistics.mean(data)
stdev = statistics.stdev(data, avg)
avg_min = avg - 1.1 * stdev
if avg_min < left_entropy and avg_min < right_entropy:
if avg_min <= left_entropy and avg_min <= right_entropy:
# high entropy of bound parts looks like a part of base64 long line
return True

Expand Down
5 changes: 4 additions & 1 deletion credsweeper/filters/value_entropy_base64_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
class ValueEntropyBase64Check(Filter):
"""Check that the candidate has Shannon entropy > 3 (for HEX_CHARS or BASE36_CHARS) or > 4.5 (for BASE64_CHARS)."""

# If the value size is less than this value the entropy evaluation gives an imprecise result
min_length = 12

def __init__(self, config: Config = None) -> None:
pass

Expand All @@ -25,7 +28,7 @@ def get_min_data_entropy(x: int) -> float:
y = 4.1
elif 32 == x:
y = 4.4
elif 12 <= x < 35:
elif ValueEntropyBase64Check.min_length <= x < 35:
# logarithm base 2 - slow, but precise. Approximation does not exceed stdev
y = 0.77 * math.log2(x) + 0.62
elif 35 <= x < 60:
Expand Down
14 changes: 7 additions & 7 deletions credsweeper/filters/value_not_part_encoded_check.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import re
from typing import Optional

from credsweeper.common import static_keyword_checklist
from credsweeper.config import Config
from credsweeper.credentials import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
Expand Down Expand Up @@ -29,15 +30,14 @@ def check_line_target_fit(line_data: LineData, target: AnalysisTarget) -> bool:

@staticmethod
def check_val(line: str, pattern: re.Pattern) -> Optional[bool]:
"""Verifies whether the line looks like a pattern"""
match_obj = pattern.match(line)
if match_obj:
"""Verifies whether the line looks like a base64 pattern"""
if match_obj := pattern.match(line):
val = match_obj.group("val")
# not a path-like
if not val.startswith('/'):
return True
# padding sign
if '=' == val[-1]:
if not val.startswith('/') \
or not static_keyword_checklist.check_morphemes(val.lower(), 2) \
or '=' == val[-1]:
# padding char is a marker too
return True
return None

Expand Down
2 changes: 1 addition & 1 deletion credsweeper/filters/value_token_base64_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def get_min_strength(x: int) -> float:
elif x < 40:
y = ((0.0000405 * x - 0.004117) * x + 0.141) * x - 0.65
else:
y = 1
y = 0.9999
return y

def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
Expand Down
Loading

0 comments on commit 7838888

Please sign in to comment.