Commit d7a2752: improvements

doomedraven committed Sep 13, 2023
1 parent 6e608a0 commit d7a2752
Showing 2 changed files with 55 additions and 49 deletions.
33 changes: 33 additions & 0 deletions data/scraper_safe_url_list.py
@@ -0,0 +1,33 @@
+safe_url_list = (
+    'aadcdn.msftauth.net',
+    'https://aadcdn.msauth.net',
+    'https://aadcdn.msftauth.net',
+    'https://login.live.com',
+    'https://login.microsoftonline.com',
+    'https://outlook.office365.com',
+    'https://outlook-1.cdn.office.net',
+    'https://outlook-2.cdn.office.net',
+    'https://go.microsoft.com',
+    'https://aka.ms',
+    'https://privacy.microsoft.com',
+    'https://www.microsoft.com',
+    'https://support.mozilla.org',
+    'http://www.w3.org',
+    'aadcdn.msauth.net',
+    'https://ajax.googleapis.com',
+    'https://code.jquery.com',
+    'https://fonts.googleapis.com',
+    'https://maxcdn.bootstrapcdn.com',
+    'https://kit.fontawesome.com',
+    'https://cdnjs.cloudflare.com',
+    'https://getbootstrap.com',
+    'https://use.fontawesome.com',
+    'https://www.office.com',
+    'https://github.com/twbs/bootstrap',
+    'http://fonts.cdnfonts.com',
+    'https://www.google.com/',
+    'https://www.gstatic.com',
+    'https://stackpath.bootstrapcdn.com',
+    'https://ka-f.fontawesome.com',
+    'https://fontawesome.com'
+)
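
Note: the allowlist is a tuple rather than a list because str.startswith accepts a tuple of prefixes, letting the consuming code in html_scraper.py (below) test every safe prefix in a single call. A minimal sketch; the non-allowlisted URL is invented for illustration:

# str.startswith with a tuple returns True if any prefix matches.
safe_url_list = ('https://fonts.googleapis.com', 'https://code.jquery.com')
print('https://fonts.googleapis.com/css?family=Roboto'.startswith(safe_url_list))  # True
print('http://203.0.113.9/landing.php'.startswith(safe_url_list))  # False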
71 changes: 22 additions & 49 deletions modules/processing/html_scraper.py
@@ -3,58 +3,28 @@
import os
import urllib.parse
from typing import Optional

-from urlextract import URLExtract
+from contextlib import suppress

from lib.cuckoo.common.abstracts import Processing
+from data.scraper_safe_url_list import safe_url_list

-safe_url_list = (
-    'aadcdn.msftauth.net',
-    'https://aadcdn.msauth.net',
-    'https://aadcdn.msftauth.net',
-    'https://login.live.com',
-    'https://login.microsoftonline.com',
-    'https://outlook.office365.com',
-    'https://outlook-1.cdn.office.net',
-    'https://outlook-2.cdn.office.net',
-    'https://go.microsoft.com',
-    'https://aka.ms',
-    'https://privacy.microsoft.com',
-    'https://www.microsoft.com',
-    'https://support.mozilla.org',
-    'http://www.w3.org',
-    'aadcdn.msauth.net',
-    'https://ajax.googleapis.com',
-    'https://code.jquery.com',
-    'https://fonts.googleapis.com',
-    'https://maxcdn.bootstrapcdn.com',
-    'https://kit.fontawesome.com',
-    'https://cdnjs.cloudflare.com',
-    'https://getbootstrap.com',
-    'https://use.fontawesome.com',
-    'https://www.office.com',
-    'https://github.com/twbs/bootstrap',
-    'http://fonts.cdnfonts.com',
-    'https://www.google.com/',
-    'https://www.gstatic.com',
-    'https://stackpath.bootstrapcdn.com',
-    'https://ka-f.fontawesome.com',
-    'https://fontawesome.com'
-)
+HAVE_URLEXTRACT = False
+with suppress(ImportError):
+    from urlextract import URLExtract
+    HAVE_URLEXTRACT = True

log = logging.getLogger(__name__)


def try_base64_decode(text: str, validate: bool = True) -> Optional[bytes]:
-    try:
+    result = None
+    with suppress(Exception):
        result = base64.b64decode(text, validate=validate)
-        return result
-    except Exception:
-        return None
+
+    return result

def force_decode(text: str, max_decode_depth: int) -> Optional[str]:
-    for current_depth in range(max_decode_depth):
+    for _ in range(max_decode_depth):
        new_text = text
        base64_decoded_text = try_base64_decode(text, validate=True)
        if base64_decoded_text:
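
The hunk above swaps try_base64_decode's try/except for contextlib.suppress, which reads as a straight-line "attempt and fall through" block: on failure, result simply stays None. A minimal sketch of the pattern; the sample strings are illustrative:

import base64
from contextlib import suppress

# suppress swallows the listed exception types inside the with-block.
result = None
with suppress(Exception):
    result = base64.b64decode('bm90LXNlY3JldA==', validate=True)
print(result)  # b'not-secret'

result = None
with suppress(Exception):
    result = base64.b64decode('not base64!!', validate=True)  # raises binascii.Error
print(result)  # None, the error was suppressed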
@@ -74,13 +44,18 @@ def force_decode(text: str, max_decode_depth: int) -> Optional[str]:

class HtmlScraper(Processing):
    def run(self):
-        log.info('Started html dump processing')
+
+        if not HAVE_URLEXTRACT:
+            print("Missed optional dependency: poetry run pip install -r extra/optional_dependencies.txt")
+            return
+
+        log.debug('Started html dump processing')
        self.key = 'html_scraper'

        html_dump_path = os.path.join(self.analysis_path, 'htmldump', 'html_dump.dump')
        last_url_path = os.path.join(self.analysis_path, 'htmldump', 'last_url.dump')
        if not os.path.isfile(html_dump_path):
-            log.info('Dump File not found, nothing to process')
+            log.debug('Dump File not found, nothing to process')
            return {}

        try:
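
HAVE_URLEXTRACT is a common optional-dependency guard: attempt the import once at module load, record whether it succeeded, and have run() exit early instead of crashing when the package is missing. A sketch of the pattern in isolation, assuming urlextract's find_urls API:

from contextlib import suppress

# Attempt the optional import once; ImportError is swallowed and the
# flag records whether urlextract is usable.
HAVE_URLEXTRACT = False
with suppress(ImportError):
    from urlextract import URLExtract
    HAVE_URLEXTRACT = True

if HAVE_URLEXTRACT:
    print(URLExtract().find_urls('see https://example.com for details'))
else:
    print('urlextract not installed; URL extraction disabled')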
@@ -109,14 +84,12 @@ def run(self):
            with open(last_url_path, 'r') as f:
                addresses_in_html.add(f.read())

-            filtered_addresses = {url.strip('\\x27') for url in addresses_in_html if
-                                  not url.startswith(safe_url_list)}
+            filtered_addresses = {url.strip('\\x27') for url in addresses_in_html if not url.startswith(safe_url_list)}

-            log.info('Finished html dump processing')
+            log.debug('Finished html dump processing')

-            return {
-                'addresses': list(filtered_addresses)
-            }
+            return {'addresses': list(filtered_addresses)}
        except Exception:
            log.exception('Html dump processing failed')
-            return {}
+
+        return {}

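Taken together, the processing step amounts to: extract candidate URLs from the HTML dump, add the last visited URL, strip stray quote characters, and drop anything carrying an allowlisted prefix. A hedged end-to-end sketch; the HTML fragment and the two-entry allowlist are invented for illustration:

from urlextract import URLExtract

safe_url_list = ('https://fonts.googleapis.com', 'https://code.jquery.com')
html = ("<link href='https://fonts.googleapis.com/css2?family=Inter'>"
        "<form action='http://198.51.100.7/gate.php'>")

addresses_in_html = set(URLExtract().find_urls(html))
# Strip stray apostrophes (0x27) that extraction sometimes leaves attached.
filtered = {url.strip("'") for url in addresses_in_html if not url.startswith(safe_url_list)}
print(filtered)  # only the non-allowlisted address should remain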