Merge pull request #425 from target/2024-01-09-scanner-updates
Scanner Updates, Error Handling and IOCs
phutelmyer authored Jan 14, 2024
2 parents e642cad + faa08aa commit cbc086a
Showing 19 changed files with 646 additions and 280 deletions.
2 changes: 1 addition & 1 deletion configs/python/backend/backend.yaml
@@ -211,7 +211,6 @@ scanners:
- 'html_file'
priority: 5
options:
parser: "html5lib"
max_hyperlinks: 50
# 'ScanIni':
# - positive:
@@ -613,6 +612,7 @@ scanners:
flavors:
- 'vb_file'
- 'vbscript'
- 'hta_file'
priority: 5
'ScanVba':
- positive:
20 changes: 14 additions & 6 deletions src/python/strelka/scanners/scan_gzip.py
@@ -1,5 +1,6 @@
import gzip
import io
import zlib

from strelka import strelka

@@ -8,10 +9,17 @@ class ScanGzip(strelka.Scanner):
"""Decompresses gzip files."""

def scan(self, data, file, options, expire_at):
with io.BytesIO(data) as gzip_io:
with gzip.GzipFile(fileobj=gzip_io) as gzip_obj:
decompressed = gzip_obj.read()
self.event["size"] = len(decompressed)
try:
with io.BytesIO(data) as gzip_io:
with gzip.GzipFile(fileobj=gzip_io) as gzip_obj:
decompressed = gzip_obj.read()
self.event["size"] = len(decompressed)

# Send extracted file back to Strelka
self.emit_file(decompressed, name=file.name)
# Send extracted file back to Strelka
self.emit_file(decompressed, name=file.name)
except gzip.BadGzipFile:
self.flags.append("bad_gzip_file")
except zlib.error:
self.flags.append("bad_gzip_file")
except EOFError:
self.flags.append("eof_error")
6 changes: 4 additions & 2 deletions src/python/strelka/scanners/scan_ocr.py
@@ -39,8 +39,10 @@ def scan(self, data, file, options, expire_at):
# Convert PDF to PNG if required.
if pdf_to_png and "application/pdf" in file.flavors.get("mime", []):
try:
doc = fitz.open(stream=data, filetype="pdf")
data = doc.get_page_pixmap(0).tobytes("png")
reader = fitz.open(stream=data, filetype="pdf")
if reader.is_encrypted:
return
data = reader.get_page_pixmap(0).tobytes("png")
except Exception as e:
self.flags.append(
f"{self.__class__.__name__}: image_pdf_error: {str(e)[:50]}"
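The guard added here keeps PyMuPDF from being asked to rasterize a page it cannot decrypt. A minimal sketch of the same check, assuming current PyMuPDF naming (fitz.open, is_encrypted, get_page_pixmap); not the scanner itself:

import fitz  # PyMuPDF

def first_page_png(pdf_bytes):
    # Render page 0 to PNG bytes; return None for encrypted documents
    # instead of letting the pixmap call fail.
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    if doc.is_encrypted:
        return None
    return doc.get_page_pixmap(0).tobytes("png")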
5 changes: 3 additions & 2 deletions src/python/strelka/scanners/scan_ole.py
@@ -10,6 +10,7 @@ class ScanOle(strelka.Scanner):
"""Extracts files from OLECF files."""

def scan(self, data, file, options, expire_at):
ole = None
self.event["total"] = {"streams": 0, "extracted": 0}

try:
@@ -47,5 +48,5 @@ def scan(self, data, file, options, expire_at):
except OSError:
self.flags.append("os_error")
finally:
# TODO this should be wrapped with another try / catch as the variable assignment is not guaranteed
ole.close()
if ole:
ole.close()
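Pre-assigning ole = None lets the finally block run safely when olefile fails to open the container; previously ole.close() could hit an unbound variable. The same pattern in isolation (hypothetical helper, assuming the olefile package):

import olefile

def list_ole_streams(data):
    # Return stream paths from an OLECF blob, closing the handle even when
    # opening or parsing fails.
    ole = None
    streams = []
    try:
        ole = olefile.OleFileIO(data)
        streams = ["/".join(parts) for parts in ole.listdir(streams=True, storages=False)]
    except OSError:
        pass  # not a valid OLE2 container
    finally:
        if ole:
            ole.close()
    return streams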
50 changes: 32 additions & 18 deletions src/python/strelka/scanners/scan_pdf.py
@@ -65,44 +65,48 @@ def scan(self, data, file, options, expire_at):
# Set maximum XREF objects to be collected (default: 250)
max_objects = options.get("max_objects", 250)

# Set Default Variables
self.event["images"] = 0
self.event["lines"] = 0
self.event["links"] = []
self.event["words"] = 0
self.event.setdefault("xref_object", list())
keys = list()

try:
with io.BytesIO(data) as pdf_io:
reader = fitz.open(stream=pdf_io, filetype="pdf")

# Collect Metadata
self.event["dirty"] = reader.is_dirty
self.event["encrypted"] = reader.is_encrypted
self.event["language"] = reader.language
self.event["needs_pass"] = reader.needs_pass
self.event["old_xrefs"] = reader.has_old_style_xrefs
self.event["pages"] = reader.page_count
self.event["repaired"] = reader.is_repaired
self.event["xrefs"] = reader.xref_length() - 1

if reader.is_encrypted:
return

# Set Default Variables
self.event["images"] = 0
self.event["lines"] = 0
self.event["links"] = []
self.event["words"] = 0
self.event.setdefault("xref_object", list())
keys = list()

self.event["author"] = reader.metadata["author"]
self.event["creator"] = reader.metadata["creator"]
self.event["creation_date"] = self._convert_timestamp(
reader.metadata["creationDate"]
)
self.event["dirty"] = reader.is_dirty
self.event["embedded_files"] = {
"count": reader.embfile_count(),
"names": reader.embfile_names(),
}
self.event["encrypted"] = reader.is_encrypted
self.event["needs_pass"] = reader.needs_pass
self.event["format"] = reader.metadata["format"]
self.event["keywords"] = reader.metadata["keywords"]
self.event["language"] = reader.language
self.event["modify_date"] = self._convert_timestamp(
reader.metadata["modDate"]
)
self.event["old_xrefs"] = reader.has_old_style_xrefs
self.event["pages"] = reader.page_count
self.event["producer"] = reader.metadata["producer"]
self.event["repaired"] = reader.is_repaired
self.event["subject"] = reader.metadata["subject"]
self.event["title"] = reader.metadata["title"]
self.event["xrefs"] = reader.xref_length() - 1

# Collect Phones Numbers
phones = []
@@ -129,7 +133,9 @@ def scan(self, data, file, options, expire_at):
if pattern in xref_object:
keys.append(obj.lower())
# Extract urls from xref
self.event["links"].extend(re.findall('"(https?://.*?)"', xref_object))
self.event["links"].extend(
re.findall(r"https?://[^\s)>]+", xref_object)
)
self.event["objects"] = dict(Counter(keys))

# Convert unique xref_object set back to list
@@ -173,12 +179,20 @@ def scan(self, data, file, options, expire_at):
self.event["words"] += len(
list(filter(None, page.get_text().split(" ")))
)
# extract links
# Extract links
for link in page.get_links():
self.event["links"].append(link.get("uri"))

text += page.get_text()

# Extract urls from text
self.event["links"].extend(re.findall(r"https?://[^\s)>]+", text))

# If links found, remove all duplicates.
# Deduplicate the links
if self.event["links"]:
self.event["links"] = list(set(filter(None, self.event["links"])))

# Send extracted file back to Strelka
self.emit_file(text.encode("utf-8"), name="text")

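Two link-related changes stand out in ScanPdf: URLs are now scraped with the broader pattern r"https?://[^\s)>]+" (the old expression only matched URLs wrapped in double quotes), and results from link annotations, xref objects, and page text are merged and deduplicated. A small sketch of that combination (hypothetical helper, not the scanner itself):

import re

URL_RE = re.compile(r"https?://[^\s)>]+")

def collect_links(page_links, page_text):
    # page_links: dicts shaped like PyMuPDF's page.get_links() output.
    links = [link.get("uri") for link in page_links]
    links.extend(URL_RE.findall(page_text))
    # Drop None entries (annotations without a URI) and duplicates.
    return list(set(filter(None, links)))

print(collect_links(
    [{"uri": "https://example.com/a"}, {"kind": 1}],
    "See https://example.com/a and https://example.org/b) for details.",
))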
141 changes: 76 additions & 65 deletions src/python/strelka/scanners/scan_pe.py
@@ -397,15 +397,18 @@ def scan(self, data, file, options, expire_at):
except pefile.PEFormatError:
self.flags.append("pe_format_error")
return
except AttributeError:
self.flags.append("pe_attribute_error")
return

if rich_dict := parse_rich(pe):
if not isinstance(rich_dict, str):
if type(rich_dict) is str:
self.event["rich"] = rich_dict
else:
self.flags.append(rich_dict)

if cert_dict := parse_certificates(data):
if not isinstance(cert_dict, str):
if type(cert_dict) is str:
self.event["security"] = cert_dict
else:
self.flags.append(cert_dict)
@@ -455,30 +458,33 @@ def scan(self, data, file, options, expire_at):

# https://github.com/erocarrera/pefile/blob/master/pefile.py#L3553
if hasattr(pe, "FileInfo"):
fi = pe.FileInfo[0] # contains a single element
for i in fi:
if i.Key == b"StringFileInfo":
for st in i.StringTable:
for k, v in st.entries.items():
if k.decode() in COMMON_FILE_INFO_NAMES:
self.event["file_info"][
COMMON_FILE_INFO_NAMES[k.decode()]
] = v.decode()
else:
self.event["file_info"]["string"].append(
{
"name": k.decode(),
"value": v.decode(),
}
)
elif i.Key == b"VarFileInfo":
for v in i.Var:
if translation := v.entry.get(b"Translation"):
(lang, char) = translation.split()
self.event["file_info"]["var"] = {
"language": VAR_FILE_INFO_LANGS.get(int(lang, 16)),
"character_set": VAR_FILE_INFO_CHARS.get(int(char, 16)),
}
if pe.FileInfo:
fi = pe.FileInfo[0] # contains a single element
for i in fi:
if i.Key == b"StringFileInfo":
for st in i.StringTable:
for k, v in st.entries.items():
if k.decode() in COMMON_FILE_INFO_NAMES:
self.event["file_info"][
COMMON_FILE_INFO_NAMES[k.decode()]
] = v.decode()
else:
self.event["file_info"]["string"].append(
{
"name": k.decode(),
"value": v.decode(),
}
)
elif i.Key == b"VarFileInfo":
for v in i.Var:
if translation := v.entry.get(b"Translation"):
(lang, char) = translation.split()
self.event["file_info"]["var"] = {
"language": VAR_FILE_INFO_LANGS.get(int(lang, 16)),
"character_set": VAR_FILE_INFO_CHARS.get(
int(char, 16)
),
}

if hasattr(pe, "VS_FIXEDFILEINFO"):
vs_ffi = pe.VS_FIXEDFILEINFO[0] # contains a single element
@@ -509,7 +515,7 @@ def scan(self, data, file, options, expire_at):
self.event["header"] = {
"machine": {
"id": pe.FILE_HEADER.Machine,
"type": pefile.MACHINE_TYPE.get(pe.FILE_HEADER.Machine).replace(
"type": pefile.MACHINE_TYPE.get(pe.FILE_HEADER.Machine, "").replace(
"IMAGE_FILE_MACHINE_", ""
),
},
Expand All @@ -518,7 +524,7 @@ def scan(self, data, file, options, expire_at):
"image": MAGIC_IMAGE.get(pe.OPTIONAL_HEADER.Magic, ""),
},
"subsystem": pefile.SUBSYSTEM_TYPE.get(
pe.OPTIONAL_HEADER.Subsystem
pe.OPTIONAL_HEADER.Subsystem, ""
).replace("IMAGE_SUBSYSTEM_", ""),
}
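Passing a default to MACHINE_TYPE.get and SUBSYSTEM_TYPE.get keeps the chained .replace() from raising AttributeError when a sample carries an id pefile does not recognize. Illustrated in isolation (assuming pefile's MACHINE_TYPE mapping; the helper name is hypothetical):

import pefile

def machine_name(machine_id):
    # Unknown ids fall back to "" instead of None, so .replace() is always safe.
    return pefile.MACHINE_TYPE.get(machine_id, "").replace("IMAGE_FILE_MACHINE_", "")

print(machine_name(0x8664))  # "AMD64" with current pefile tables
print(machine_name(0xFFFF))  # "" for an unrecognized machine id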

@@ -600,43 +606,48 @@ def scan(self, data, file, options, expire_at):
resource_sha256_set = set()

for res0 in pe.DIRECTORY_ENTRY_RESOURCE.entries:
for res1 in res0.directory.entries:
for res2 in res1.directory.entries:
lang = res2.data.lang
sub = res2.data.sublang
sub = pefile.get_sublang_name_for_lang(lang, sub)
data = pe.get_data(
res2.data.struct.OffsetToData, res2.data.struct.Size
)

resource_md5 = hashlib.md5(data).hexdigest()
resource_sha1 = hashlib.sha1(data).hexdigest()
resource_sha256 = hashlib.sha256(data).hexdigest()

resource_md5_set.add(resource_md5)
resource_sha1_set.add(resource_sha1)
resource_sha256_set.add(resource_sha256)

resource_dict = {
"id": res1.id,
"language": {"sub": sub.replace("SUBLANG_", "")},
"type": pefile.RESOURCE_TYPE.get(res0.id, "").replace(
"RT_", ""
),
"md5": resource_md5,
"sha1": resource_sha1,
"sha256": resource_sha256,
}

if lang in pefile.LANG:
resource_dict["language"]["primary"] = pefile.LANG[
lang
].replace("LANG_", "")

if res1.name:
resource_dict["name"] = str(res1.name)

self.event["resources"].append(resource_dict)
if hasattr(res0, "directory"):
for res1 in res0.directory.entries:
if hasattr(res1, "directory"):
for res2 in res1.directory.entries:
lang = res2.data.lang
sub = res2.data.sublang
sub = pefile.get_sublang_name_for_lang(lang, sub)
try:
data = pe.get_data(
res2.data.struct.OffsetToData,
res2.data.struct.Size,
)
except pefile.PEFormatError:
continue
resource_md5 = hashlib.md5(data).hexdigest()
resource_sha1 = hashlib.sha1(data).hexdigest()
resource_sha256 = hashlib.sha256(data).hexdigest()

resource_md5_set.add(resource_md5)
resource_sha1_set.add(resource_sha1)
resource_sha256_set.add(resource_sha256)

resource_dict = {
"id": res1.id,
"language": {"sub": sub.replace("SUBLANG_", "")},
"type": pefile.RESOURCE_TYPE.get(
res0.id, ""
).replace("RT_", ""),
"md5": resource_md5,
"sha1": resource_sha1,
"sha256": resource_sha256,
}

if lang in pefile.LANG:
resource_dict["language"]["primary"] = pefile.LANG[
lang
].replace("LANG_", "")

if res1.name:
resource_dict["name"] = str(res1.name)

self.event["resources"].append(resource_dict)

# TODO: Add optional resource extraction

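The resource walk is now defensive on two fronts: directory entries that lack a .directory attribute are skipped, and pe.get_data() is wrapped so a resource pointing outside the image does not abort the scan. The same traversal reduced to a hashing helper (a sketch only, assuming the pefile API shown in the diff):

import hashlib
import pefile

def resource_sha256s(pe):
    # Hash every resource leaf that can actually be read from the image.
    digests = []
    if not hasattr(pe, "DIRECTORY_ENTRY_RESOURCE"):
        return digests
    for res0 in pe.DIRECTORY_ENTRY_RESOURCE.entries:
        if not hasattr(res0, "directory"):
            continue
        for res1 in res0.directory.entries:
            if not hasattr(res1, "directory"):
                continue
            for res2 in res1.directory.entries:
                try:
                    data = pe.get_data(
                        res2.data.struct.OffsetToData, res2.data.struct.Size
                    )
                except pefile.PEFormatError:
                    continue  # offset or size falls outside the mapped image
                digests.append(hashlib.sha256(data).hexdigest())
    return digests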
31 changes: 16 additions & 15 deletions src/python/strelka/scanners/scan_plist.py
@@ -20,20 +20,21 @@ def scan(self, data, file, options, expire_at):
plist = plistlib.loads(data)

self.event["keys"] = []
for k, v in plist.items():
if keys and k not in keys:
continue

try:
v = ast.literal_eval(v)
except (ValueError, SyntaxError):
pass

self.event["keys"].append(
{
"key": k,
"value": v,
}
)
if isinstance(plist, dict):
for k, v in plist.items():
if keys and k not in keys:
continue

try:
v = ast.literal_eval(v)
except (ValueError, SyntaxError):
pass

self.event["keys"].append(
{
"key": k,
"value": v,
}
)
except xml.parsers.expat.ExpatError:
self.flags.append("invalid_format")