Skip to content

Commit

Permalink
Merge pull request #431 from target/email-js-pdf-update-01292024
Browse files Browse the repository at this point in the history
Enhancements: JNLP Signature, ScanEmail Preview Image, IOC Support, and Dependency Optimization
phutelmyer authored Jan 29, 2024
2 parents 7c632da + 78c46f2 commit b37e761
Showing 12 changed files with 1,529 additions and 1,388 deletions.
4 changes: 4 additions & 0 deletions configs/python/backend/backend.yaml
Original file line number Diff line number Diff line change
@@ -108,6 +108,10 @@ scanners:
- 'message/rfc822'
- 'email_file'
priority: 5
options:
create_thumbnail: True
thumbnail_header: False
thumbnail_size: [ 500, 500 ]
'ScanEncryptedDoc':
- positive:
flavors:
13 changes: 13 additions & 0 deletions configs/python/backend/taste/taste.yara
Original file line number Diff line number Diff line change
@@ -773,6 +773,19 @@ rule batch_file {
$a at 0
}

// Tastes files as JNLP when they begin with an XML declaration and
// contain a <jnlp> element anywhere in the body.
rule jnlp_file {
meta:
description = "Detect JNLP (Java Network Launch Protocol) files"
author = "Paul Hutelmyer"
reference = "https://docs.oracle.com/javase/tutorial/deployment/webstart/deploying.html"
type = "script"
strings:
// NOTE(review): this matches only the exact double-quoted UTF-8 XML
// declaration; JNLP files using single quotes or a different declared
// encoding will not match — confirm this strictness is intended.
$jnlp_header = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" nocase
$jnlp_tag = "<jnlp" nocase
condition:
// The declaration must start at offset 0; the tag may appear anywhere.
$jnlp_header at 0 and $jnlp_tag
}

rule javascript_file {
meta:
type = "script"
8 changes: 4 additions & 4 deletions docs/README.md

Large diffs are not rendered by default.

2,036 changes: 809 additions & 1,227 deletions poetry.lock

Large diffs are not rendered by default.

15 changes: 8 additions & 7 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "strelka-worker"
version = "0.23.10.19"
version = "0.24.01.19"
description = "Strelka's backend Python worker"
authors = [
"Paul Hutelmyer <[email protected]>",
@@ -16,13 +16,13 @@ boltons = "23.0.0"
boto3 = "1.28.60"
construct = "2.10.68"
cryptography = "41.0.6"
dncil = "1.0.2"
dnfile = "0.14.1"
docker = "6.1.3"
dotnetfile = "0.2.4"
eml-parser = "1.17.5"
esprima = "4.0.1"
flare-capa = "6.1.0"
formulas = "1.2.6"
grpcio-tools = "1.59.0"
grpcio = "1.59.0"
html5lib = "1.1"
inflection = "0.5.1"
jsbeautifier = "1.14.9"
@@ -44,12 +44,12 @@ opentelemetry-exporter-otlp-proto-grpc = "1.17.0"
opentelemetry-exporter-otlp-proto-http = "1.17.0"
opentelemetry-sdk = "1.17.0"
pefile = "2023.2.7"
pillow-avif-plugin = "1.4.1"
pillow-heif = "^0.13.1"
pillow-avif-plugin = "1.4.2"
pillow-heif = "^0.14.0"
pgpdump3 = "1.5.2"
py-tlsh = "4.7.2"
pycdlib = "1.14.0"
pycryptodomex = "3.18.0"
pycryptodomex = "3.20.0"
pyelftools = "0.29"
pygments = "2.15.0"
pylzma = "0.5.0"
@@ -75,6 +75,7 @@ ssdeep = "3.4"
tldextract = "3.4.0"
tnefparse = "1.4.0"
validators = "0.20.0"
weasyprint = "60.2"
xlrd2 = "1.3.4"
xlrd = "2.0.1"
xmltodict = "0.13.0"
308 changes: 288 additions & 20 deletions src/python/strelka/scanners/scan_email.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,82 @@
import base64
import email
import email.header
import hashlib
import io
import logging
import os
import tempfile

import eml_parser
import fitz # PyMuPDF
import pytz
from PIL import Image
from weasyprint import HTML

from strelka import strelka

# Configure logging to suppress warnings for WeasyPrint and informational messages for fontTools
weasyprint_logger = logging.getLogger("weasyprint")
weasyprint_logger.setLevel(logging.ERROR)

fonttools_logger = logging.getLogger("fontTools.subset")
fonttools_logger.setLevel(logging.WARNING)


class ScanEmail(strelka.Scanner):
"""Collects metadata and extracts files from email messages."""
"""
Scanner that collects metadata, extracts files from email messages, and generates thumbnails.
This scanner processes email files to extract metadata, attachments, and generates
thumbnail images of the email content for a visual overview. It handles both plain text and HTML emails,
including inline images.
"""

def scan(self, data, file, options, expire_at):
"""
Processes the email, extracts metadata and attachments, and optionally generates a thumbnail.
Args:
data: The raw email data.
file: File details.
options: Scanner options including thumbnail creation and size.
expire_at: Expiry time of the scan.
"""
# Initialize data structures for storing scan results
attachments = []
self.event["total"] = {"attachments": 0, "extracted": 0}

# Thumbnail creation based on user option
create_thumbnail = options.get("create_thumbnail", False)
thumbnail_header = options.get("thumbnail_header", False)
thumbnail_size = options.get("thumbnail_size", (500, 500))

# ----------------
# Thumbnail
# ----------------
# Create a thumbnail from the image.
# Stores as a base64 value in the key: base64_thumbnail
if create_thumbnail:
try:
image = self.create_email_thumbnail(data, thumbnail_header)
if image:
image.thumbnail(thumbnail_size, Image.Resampling.BILINEAR)
buffered = io.BytesIO()
image.save(buffered, format="WEBP", quality=30, optimize=True)
base64_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
self.event["base64_thumbnail"] = base64_image
else:
self.flags.append(
f"{self.__class__.__name__}: image_thumbnail_error: Could not generate thumbnail."
)
except Exception as e:
self.flags.append(
f"{self.__class__.__name__}: image_thumbnail_error: {str(e)[:50]}"
)

# ----------------
# Parse Email Contents
# -------------------
try:
# Open and parse email byte string
# If fail to open, return.
@@ -23,9 +87,10 @@ def scan(self, data, file, options, expire_at):
parsed_eml = ep.decode_email_bytes(data)
except strelka.ScannerTimeout:
raise
except Exception:
self.flags.append("parse_load_error")
return
except Exception as e:
self.flags.append(
f"{self.__class__.__name__}: email_parse_error: {str(e)[:50]}"
)

# Check if email was parsed properly and attempt to deconflict and reload.
# If fail to reparse, return.
@@ -47,13 +112,16 @@ def scan(self, data, file, options, expire_at):
parsed_eml["header"]["subject"]
and parsed_eml["header"]["header"]
):
self.flags.append("parse_manual_email_error")
self.flags.append(
f"{self.__class__.__name__}: email_parse_error"
)
return
except strelka.ScannerTimeout:
raise
except Exception:
self.flags.append("parse_manual_email_error")
return
except Exception as e:
self.flags.append(
f"{self.__class__.__name__}: email_parse_error: {str(e)[:50]}"
)

# Body
# If body exists in email, collect partial message contents and domains
@@ -81,8 +149,10 @@ def scan(self, data, file, options, expire_at):
self.event["domains"] = body["domain"]
except strelka.ScannerTimeout:
raise
except Exception:
self.flags.append("parse_body_error")
except Exception as e:
self.flags.append(
f"{self.__class__.__name__}: email_parse_body_error: {str(e)[:50]}"
)

# Attachments
# If attachments exist in email, collect attachment details and raw data to be resubmitted to pipeline.
@@ -111,8 +181,10 @@ def scan(self, data, file, options, expire_at):
)
except strelka.ScannerTimeout:
raise
except Exception:
self.flags.append("parse_attachment_error")
except Exception as e:
self.flags.append(
f"{self.__class__.__name__}: email_parse_attachment_error: {str(e)[:50]}"
)

# Header
# Collect email header information
@@ -137,8 +209,10 @@ def scan(self, data, file, options, expire_at):
self.event["received_ip"] = parsed_eml["header"]["received_ip"]
except strelka.ScannerTimeout:
raise
except Exception:
self.flags.append("parse_header_error")
except Exception as e:
self.flags.append(
f"{self.__class__.__name__}: email_parse_header_error: {str(e)[:50]}"
)

# If attachments were found, submit back into pipeline
try:
@@ -153,18 +227,212 @@ def scan(self, data, file, options, expire_at):
.encode("utf-8")
.partition(b";")[0]
]
except Exception:
flavors = []
self.flags.append("content_type_error")
except Exception as e:
self.flags.append(
f"{self.__class__.__name__}: email_extract_attachment_error: {str(e)[:50]}"
)

# Send extracted file back to Strelka
self.emit_file(attachment["raw"], name=name, flavors=flavors)

self.event["total"]["extracted"] += 1
except strelka.ScannerTimeout:
raise
except Exception:
self.flags.append("extract_attachment_error")
except Exception as e:
self.flags.append(
f"{self.__class__.__name__}: email_extract_attachment_error: {str(e)[:50]}"
)

except AssertionError:
self.flags.append("assertion_error")
self.flags.append(f"{self.__class__.__name__}: email_assertion_error")

def create_email_thumbnail(self, data, show_header):
    """
    Generates a thumbnail image from the content of an email message.

    Extracts inline images and HTML bodies from the email, renders them
    to images, and stacks them vertically into a single combined image.

    Args:
        data: Raw email bytes.
        show_header: Whether to render header details (Date, From, To,
            Subject, Message-Id) at the top of the output image.

    Returns:
        A PIL Image object representing the combined thumbnail image of
        the email, or None if no images could be created.
    """
    # MIME types treated as inline images worth embedding.
    image_types = [
        "image/gif",
        "image/jpeg",
        "image/png",
        "image/jpg",
        "image/bmp",
        "image/ico",
        "image/svg",
        "image/web",
    ]

    # Maps Content-ID values to base64-encoded image payloads so that
    # cid: references in HTML bodies can be inlined as data URIs.
    images_dict = {}

    # Temporary directory holds the intermediate PDF/PNG files; it is
    # removed (with its contents) when the with-block exits.
    with tempfile.TemporaryDirectory() as temp_dir:
        # Parse the email data
        msg = email.message_from_bytes(data)

        # Paths of rendered images, in top-to-bottom display order.
        images_list = []

        # Extract and format header details from the email
        if show_header:
            header_fields = ["Date", "From", "To", "Subject", "Message-Id"]
            header_values = {
                field: self.decode_and_format_header(msg, field)
                for field in header_fields
            }

            # Generate an HTML table from the header values
            headers_html = '<table width="100%">\n'
            for field, value in header_values.items():
                headers_html += f'  <tr><td align="right"><b>{field}:</b></td><td>{value}</td></tr>\n'
            headers_html += "</table>\n<hr></p>\n"

            # Convert HTML header details to an image
            header_image_path = self.html_to_image(headers_html, temp_dir)
            if header_image_path:
                images_list.append(header_image_path)

        # First pass: collect inline images keyed by Content-ID.
        for part in msg.walk():
            if part.is_multipart():
                continue

            mime_type = part.get_content_type()
            if mime_type in image_types:
                content_id = part.get("Content-ID", "").strip("<>")
                image_data = part.get_payload(decode=True)
                # Skip parts with no usable payload or no Content-ID: an
                # empty key would corrupt every "cid:" occurrence during
                # the substitution pass below.
                if not content_id or image_data is None:
                    continue
                img_data_base64 = base64.b64encode(image_data).decode("utf-8")
                images_dict[content_id] = img_data_base64

        # Second pass: render HTML bodies with cid: references replaced
        # by base64 data URIs.
        for part in msg.walk():
            if part.get_content_type() == "text/html":
                payload_bytes = part.get_payload(decode=True)
                if payload_bytes is None:
                    continue
                # Honor the part's declared charset rather than assuming
                # UTF-8; fall back to UTF-8 on unknown codec names and
                # replace undecodable bytes instead of raising.
                charset = part.get_content_charset() or "utf-8"
                try:
                    payload = payload_bytes.decode(charset, errors="replace")
                except LookupError:
                    payload = payload_bytes.decode("utf-8", errors="replace")
                for cid, img_data in images_dict.items():
                    payload = payload.replace(
                        f"cid:{cid}", f"data:image/jpeg;base64,{img_data}"
                    )

                # Convert the modified HTML body to an image
                body_image_path = self.html_to_image(payload, temp_dir)
                if body_image_path:
                    images_list.append(body_image_path)

        # Combine all extracted images into a single image while the
        # temporary files still exist.
        if images_list:
            images = [Image.open(path) for path in images_list]
            return self.append_images(images)

    return None

@staticmethod
def html_to_image(html_content, temp_dir):
    """
    Renders HTML content to a PNG image.

    WeasyPrint converts the HTML to a PDF, and PyMuPDF (fitz) rasterizes
    the first page of that PDF to a PNG file.

    Args:
        html_content: HTML markup to render.
        temp_dir: Directory for the intermediate PDF and the output PNG.

    Returns:
        The file path to the generated PNG, or None if rendering fails
        or the PDF has no pages.
    """
    # A content-derived name keeps files unique within the temp directory.
    digest = hashlib.md5(html_content.encode()).hexdigest()
    pdf_path = os.path.join(temp_dir, digest + ".pdf")

    try:
        # HTML -> PDF via WeasyPrint.
        HTML(string=html_content).write_pdf(pdf_path)

        # PDF -> PNG: rasterize only the first page.
        with fitz.open(pdf_path) as doc:
            if doc.page_count <= 0:
                return None
            pixmap = doc.load_page(0).get_pixmap()
            image_path = os.path.join(temp_dir, digest + ".png")
            pixmap.save(image_path)
            return image_path
    except Exception:
        # Rendering is best-effort; any failure simply yields no image.
        return None

@staticmethod
def append_images(images):
    """
    Stacks multiple images vertically into one combined image.

    Useful for producing a single visual summary of an email's rendered
    header and body sections.

    Args:
        images: A list of PIL Image objects to stack, top to bottom.

    Returns:
        A new RGB PIL Image whose width is the widest input and whose
        height is the sum of all input heights, on a white background.
    """
    white = (255, 255, 255)

    # The canvas must fit the widest image and all heights stacked.
    canvas_width = max(img.width for img in images)
    canvas_height = sum(img.height for img in images)

    canvas = Image.new("RGB", (canvas_width, canvas_height), color=white)

    # Paste each image flush-left, moving the offset down as we go.
    offset = 0
    for img in images:
        canvas.paste(img, (0, offset))
        offset += img.height

    return canvas

@staticmethod
def decode_and_format_header(msg, header_name):
    """
    Decodes a header field into HTML-safe, human-readable text.

    Email headers may use RFC 2047 encoded words, possibly split across
    several fragments; all fragments are decoded and joined so that
    multi-fragment headers are not truncated.

    Args:
        msg: Parsed email message object.
        header_name: The name of the header field to decode.

    Returns:
        The decoded header value with angle brackets escaped for HTML
        display, or "&lt;Unknown&gt;" if the header field is missing or
        cannot be decoded.
    """
    try:
        # decode_header returns a list of (value, charset) pairs; the
        # previous implementation used only the first pair, which
        # truncated headers split across multiple encoded words.
        fragments = []
        for value, charset in email.header.decode_header(msg[header_name]):
            if isinstance(value, bytes):
                value = value.decode(charset or "utf-8")
            fragments.append(value)
        field_value = "".join(fragments)
    except Exception:
        # Missing header (decode_header(None) raises) or decode failure.
        field_value = "&lt;Unknown&gt;"

    # Replace angle brackets for HTML safety.
    return field_value.replace("<", "&lt;").replace(">", "&gt;")
167 changes: 104 additions & 63 deletions src/python/strelka/scanners/scan_javascript.py

Large diffs are not rendered by default.

5 changes: 4 additions & 1 deletion src/python/strelka/scanners/scan_pdf.py
Original file line number Diff line number Diff line change
@@ -188,11 +188,14 @@ def scan(self, data, file, options, expire_at):
# Extract urls from text
self.event["links"].extend(re.findall(r"https?://[^\s)>]+", text))

# If links found, remove all duplicates.
# If links found, remove all duplicates and submit as IOCs.
# Deduplicate the links
if self.event["links"]:
self.event["links"] = list(set(filter(None, self.event["links"])))

# Submit all links to the IOCs pipeline.
self.add_iocs(self.event["links"])

# Send extracted file back to Strelka
self.emit_file(text.encode("utf-8"), name="text")

35 changes: 35 additions & 0 deletions src/python/strelka/tests/fixtures/test.js
Original file line number Diff line number Diff line change
@@ -7,6 +7,8 @@

var path = require('path');
var fs = require('fs');
var WebSocket = require('ws'); // Suspicious keyword


// Export helpers
module.exports.register = function (Handlebars, opt, params) {
@@ -71,8 +73,41 @@ module.exports.register = function (Handlebars, opt, params) {

btoa: function(b) {
return new Buffer(b, 'utf8').toString('base64');
},
// Suspicious function using WebSocket
establishWebSocket: function(url) {
var ws = new WebSocket(url);
ws.on('open', function open() {
ws.send('Connection established');
});
},

// Function using eval
dynamicEval: function(code) {
eval(code);
},

// Function with embedded IOC URL
fetchDataFromUrl: function() {
var suspiciousUrl = "http://example-malicious-site.com/data";
// Code to fetch data from the URL
console.log("Fetching data from: " + suspiciousUrl);
},

// Function with multiple IOC URLs
checkMultipleUrls: function() {
var urls = [
"http://example-malicious-site.com",
"http://example-malicious-site.com",
"https://another-example-bad-site.net",
"ftp://suspicious-ftp-server.org"
];
urls.forEach(url => {
console.log("Checking URL: " + url);
});
}
};
};

opt = opt || {};
for (var helper in helpers) {
79 changes: 79 additions & 0 deletions src/python/strelka/tests/test_scan_email.py

Large diffs are not rendered by default.

233 changes: 167 additions & 66 deletions src/python/strelka/tests/test_scan_javascript.py
Original file line number Diff line number Diff line change
@@ -18,110 +18,186 @@ def test_scan_javascript(mocker):
"flags": [],
"tokens": unordered(
[
"BlockComment",
"LineComment",
"String",
"Punctuator",
"Keyword",
"Identifier",
"LineComment",
"Punctuator",
"RegularExpression",
"Numeric",
"BlockComment",
"Keyword",
]
),
"keywords": unordered(
[
"var",
"function",
"in",
"this",
"typeof",
"return",
"for",
"function",
"if",
"throw",
"else",
"typeof",
"throw",
"new",
"this",
"in",
"for",
]
),
"strings": unordered(
[
"use strict",
"",
"ws",
"open",
"string",
"ftp://suspicious-ftp-server.org",
"Checking URL: ",
"Fetching data from: ",
"base64",
"path",
".",
"-",
" (",
"fs",
"utf8",
"package.json",
"",
"-",
"Could not find partial with name ",
".",
"string",
"function",
"https://another-example-bad-site.net",
"Found unknown type of partial ",
" (",
"use strict",
") in Handlebars partial Array => ",
"base64",
"utf8",
"function",
"Could not find partial with name ",
"http://example-malicious-site.com",
"http://example-malicious-site.com/data",
"Connection established",
]
),
"identifiers": unordered(
[
"path",
"require",
"fs",
"module",
"exports",
"register",
"compile",
"Handlebars",
"opt",
"params",
"pkg",
"send",
"urls",
"partials",
"JSON",
"parse",
"readFileSync",
"join",
"process",
"pkg",
"open",
"eval",
"console",
"params",
"cwd",
"slugify",
"str",
"toLowerCase",
"register",
"key",
"replace",
"suspiciousUrl",
"toLowerCase",
"hasOwnProperty",
"WebSocket",
"concat",
"arguments",
"ws",
"partial",
"Buffer",
"helpers",
"key",
"escape",
"Utils",
"escapeExpression",
"btoa",
"dynamicEval",
"opt",
"slugify",
"str",
"jsonStringify",
"obj",
"process",
"url",
"stringify",
"concat",
"arr",
"i",
"arguments",
"length",
"partial",
"name",
"fetchDataFromUrl",
"context",
"partials",
"compile",
"log",
"SafeString",
"atob",
"on",
"checkMultipleUrls",
"code",
"helper",
"escape",
"a",
"Buffer",
"toString",
"btoa",
"Utils",
"name",
"atob",
"fs",
"obj",
"join",
"path",
"module",
"forEach",
"length",
"establishWebSocket",
"arr",
"b",
"helper",
"hasOwnProperty",
"require",
"readFileSync",
"toString",
"parse",
"exports",
"escapeExpression",
"registerHelper",
]
),
"regular_expressions": unordered(["/[^\\w ]+/g", "/ +/g"]),
"regular_expressions": unordered(["/ +/g", "/[^\\w ]+/g"]),
"suspicious_keywords": unordered(["WebSocket", "eval"]),
"urls": unordered(
[
"https://another-example-bad-site.net",
"http://example-malicious-site.com",
"ftp://suspicious-ftp-server.org",
"http://example-malicious-site.com/data",
]
),
"beautified": True,
"script_length_bytes": 3127,
"iocs": unordered(
[
{
"ioc": "suspicious-ftp-server.org",
"ioc_type": "domain",
"scanner": "ScanJavascript",
},
{
"ioc": "ftp://suspicious-ftp-server.org",
"ioc_type": "url",
"scanner": "ScanJavascript",
},
{
"ioc": "example-malicious-site.com",
"ioc_type": "domain",
"scanner": "ScanJavascript",
},
{
"ioc": "http://example-malicious-site.com",
"ioc_type": "url",
"scanner": "ScanJavascript",
},
{
"ioc": "http://example-malicious-site.com/data",
"ioc_type": "url",
"scanner": "ScanJavascript",
},
{
"ioc": "another-example-bad-site.net",
"ioc_type": "domain",
"scanner": "ScanJavascript",
},
{
"ioc": "https://another-example-bad-site.net",
"ioc_type": "url",
"scanner": "ScanJavascript",
},
]
),
}

scanner_event = run_test_scan(
mocker=mocker,
scan_class=ScanUnderTest,
fixture_path=Path(__file__).parent / "fixtures/test.js",
options=({"max_strings": 500}),
)

TestCase.maxDiff = None
@@ -137,26 +213,51 @@ def test_scan_javascript_character_max_strings(mocker):
test_scan_event = {
"elapsed": mock.ANY,
"flags": [],
"tokens": unordered(
"tokens": unordered(["Punctuator", "BlockComment"]),
"keywords": unordered(["return", "this"]),
"strings": unordered(["", "Checking URL: "]),
"identifiers": unordered(["arguments", "process"]),
"regular_expressions": unordered(["/ +/g", "/[^\\w ]+/g"]),
"suspicious_keywords": unordered(["WebSocket", "eval"]),
"urls": unordered(
[
"BlockComment",
"String",
"Punctuator",
"Keyword",
"http://example-malicious-site.com/data",
"https://another-example-bad-site.net",
]
),
"keywords": unordered(["throw", "return", "else", "var", "new"]),
"strings": unordered(["", "path", "string", "-", "base64"]),
"identifiers": unordered(["exports", "params", "cwd", "Buffer", "escape"]),
"regular_expressions": unordered(["/[^\\w ]+/g", "/ +/g"]),
"beautified": True,
"script_length_bytes": 3127,
"iocs": unordered(
[
{
"ioc": "example-malicious-site.com",
"ioc_type": "domain",
"scanner": "ScanJavascript",
},
{
"ioc": "http://example-malicious-site.com/data",
"ioc_type": "url",
"scanner": "ScanJavascript",
},
{
"ioc": "another-example-bad-site.net",
"ioc_type": "domain",
"scanner": "ScanJavascript",
},
{
"ioc": "https://another-example-bad-site.net",
"ioc_type": "url",
"scanner": "ScanJavascript",
},
]
),
}

scanner_event = run_test_scan(
mocker=mocker,
scan_class=ScanUnderTest,
fixture_path=Path(__file__).parent / "fixtures/test.js",
options={"max_strings": 5},
options={"max_strings": 2},
)

TestCase.maxDiff = None
14 changes: 14 additions & 0 deletions src/python/strelka/tests/test_scan_pdf.py
Original file line number Diff line number Diff line change
@@ -21,6 +21,20 @@ def test_scan_pdf(mocker):
"links": unordered(
["http://bing.com", "https://duckduckgo.com", "https://google.com"]
),
"iocs": unordered(
[
{"ioc": "bing.com", "ioc_type": "domain", "scanner": "ScanPdf"},
{"ioc": "http://bing.com", "ioc_type": "url", "scanner": "ScanPdf"},
{"ioc": "duckduckgo.com", "ioc_type": "domain", "scanner": "ScanPdf"},
{
"ioc": "https://duckduckgo.com",
"ioc_type": "url",
"scanner": "ScanPdf",
},
{"ioc": "google.com", "ioc_type": "domain", "scanner": "ScanPdf"},
{"ioc": "https://google.com", "ioc_type": "url", "scanner": "ScanPdf"},
]
),
"words": 421,
"xref_object": unordered(
[

0 comments on commit b37e761

Please sign in to comment.