diff --git a/data/scraper_safe_url_list.py b/data/scraper_safe_url_list.py
new file mode 100644
index 00000000..254f55d5
--- /dev/null
+++ b/data/scraper_safe_url_list.py
@@ -0,0 +1,33 @@
+safe_url_list = (
+    'aadcdn.msftauth.net',
+    'https://aadcdn.msauth.net',
+    'https://aadcdn.msftauth.net',
+    'https://login.live.com',
+    'https://login.microsoftonline.com',
+    'https://outlook.office365.com',
+    'https://outlook-1.cdn.office.net',
+    'https://outlook-2.cdn.office.net',
+    'https://go.microsoft.com',
+    'https://aka.ms',
+    'https://privacy.microsoft.com',
+    'https://www.microsoft.com',
+    'https://support.mozilla.org',
+    'http://www.w3.org',
+    'aadcdn.msauth.net',
+    'https://ajax.googleapis.com',
+    'https://code.jquery.com',
+    'https://fonts.googleapis.com',
+    'https://maxcdn.bootstrapcdn.com',
+    'https://kit.fontawesome.com',
+    'https://cdnjs.cloudflare.com',
+    'https://getbootstrap.com',
+    'https://use.fontawesome.com',
+    'https://www.office.com',
+    'https://github.com/twbs/bootstrap',
+    'http://fonts.cdnfonts.com',
+    'https://www.google.com/',
+    'https://www.gstatic.com',
+    'https://stackpath.bootstrapcdn.com',
+    'https://ka-f.fontawesome.com',
+    'https://fontawesome.com'
+)
diff --git a/modules/processing/html_scraper.py b/modules/processing/html_scraper.py
index a39ba068..eb054116 100644
--- a/modules/processing/html_scraper.py
+++ b/modules/processing/html_scraper.py
@@ -3,58 +3,28 @@
 import os
 import urllib.parse
 from typing import Optional
-
-from urlextract import URLExtract
+from contextlib import suppress

 from lib.cuckoo.common.abstracts import Processing
+from data.scraper_safe_url_list import safe_url_list

-safe_url_list = (
-    'aadcdn.msftauth.net',
-    'https://aadcdn.msauth.net',
-    'https://aadcdn.msftauth.net',
-    'https://login.live.com',
-    'https://login.microsoftonline.com',
-    'https://outlook.office365.com',
-    'https://outlook-1.cdn.office.net',
-    'https://outlook-2.cdn.office.net',
-    'https://go.microsoft.com',
-    'https://aka.ms',
-    'https://privacy.microsoft.com',
-    'https://www.microsoft.com',
-    'https://support.mozilla.org',
-    'http://www.w3.org',
-    'aadcdn.msauth.net',
-    'https://ajax.googleapis.com',
-    'https://code.jquery.com',
-    'https://fonts.googleapis.com',
-    'https://maxcdn.bootstrapcdn.com',
-    'https://kit.fontawesome.com',
-    'https://cdnjs.cloudflare.com',
-    'https://getbootstrap.com',
-    'https://use.fontawesome.com',
-    'https://www.office.com',
-    'https://github.com/twbs/bootstrap',
-    'http://fonts.cdnfonts.com',
-    'https://www.google.com/',
-    'https://www.gstatic.com',
-    'https://stackpath.bootstrapcdn.com',
-    'https://ka-f.fontawesome.com',
-    'https://fontawesome.com'
-)
+HAVE_URLEXTRACT = False
+with suppress(ImportError):
+    from urlextract import URLExtract
+    HAVE_URLEXTRACT = True

 log = logging.getLogger(__name__)


 def try_base64_decode(text: str, validate: bool = True) -> Optional[bytes]:
-    try:
+    result = None
+    with suppress(Exception):
         result = base64.b64decode(text, validate=validate)
-        return result
-    except Exception:
-        return None
+    return result


 def force_decode(text: str, max_decode_depth: int) -> Optional[str]:
-    for current_depth in range(max_decode_depth):
+    for _ in range(max_decode_depth):
         new_text = text
         base64_decoded_text = try_base64_decode(text, validate=True)
         if base64_decoded_text:
@@ -74,13 +44,18 @@ class HtmlScraper(Processing):
     def run(self):
-        log.info('Started html dump processing')
+
+        if not HAVE_URLEXTRACT:
+            print("Missing optional dependency: poetry run pip install -r extra/optional_dependencies.txt")
+            return {}
+
+        log.debug('Started html dump processing')
         self.key = 'html_scraper'
         html_dump_path = os.path.join(self.analysis_path, 'htmldump', 'html_dump.dump')
         last_url_path = os.path.join(self.analysis_path, 'htmldump', 'last_url.dump')

         if not os.path.isfile(html_dump_path):
-            log.info('Dump File not found, nothing to process')
+            log.debug('Dump File not found, nothing to process')
             return {}

         try:
@@ -109,14 +84,12 @@ def run(self):
             with open(last_url_path, 'r') as f:
                 addresses_in_html.add(f.read())

-            filtered_addresses = {url.strip('\x27') for url in addresses_in_html if
-                                  not url.startswith(safe_url_list)}
+            filtered_addresses = {url.strip('\x27') for url in addresses_in_html if not url.startswith(safe_url_list)}

-            log.info('Finished html dump processing')
+            log.debug('Finished html dump processing')

-            return {
-                'addresses': list(filtered_addresses)
-            }
+            return {'addresses': list(filtered_addresses)}
         except Exception:
             log.exception('Html dump processing failed')
-            return {}
+
+            return {}
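
For context on the HAVE_URLEXTRACT guard introduced above: contextlib.suppress(ImportError) swallows the failed import, and the flag is only set to True if the import statement completes. A minimal standalone sketch of the pattern; the extract_urls helper is hypothetical, added only to show graceful degradation:

    from contextlib import suppress

    HAVE_URLEXTRACT = False
    with suppress(ImportError):
        from urlextract import URLExtract
        # Only reached when the import above succeeded.
        HAVE_URLEXTRACT = True


    def extract_urls(text: str) -> list:
        # Hypothetical helper: degrade to an empty result when the
        # optional dependency is not installed.
        if not HAVE_URLEXTRACT:
            return []
        return URLExtract().find_urls(text)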
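The rewritten try_base64_decode leans on the same suppress idiom: result is pre-assigned None, and any exception raised by b64decode (validate=True raises binascii.Error on non-alphabet characters instead of silently discarding them) leaves it untouched. A quick check of the behavior, using the function exactly as it appears in the diff:

    import base64
    from contextlib import suppress
    from typing import Optional


    def try_base64_decode(text: str, validate: bool = True) -> Optional[bytes]:
        result = None
        with suppress(Exception):
            result = base64.b64decode(text, validate=validate)
        return result


    print(try_base64_decode('aGVsbG8='))    # b'hello'
    print(try_base64_decode('not base64'))  # None: validate=True rejects the space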
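Finally, a note on why safe_url_list is a tuple rather than a list: str.startswith natively accepts a tuple of prefixes, which is what lets the filter in run() stay a single set comprehension. A sketch with hypothetical scraped URLs:

    from data.scraper_safe_url_list import safe_url_list

    # Hypothetical addresses pulled out of an HTML dump.
    addresses_in_html = {
        'https://login.microsoftonline.com/common/oauth2',  # allow-listed prefix
        'https://phish.example.com/login',                  # not allow-listed
    }

    # str.startswith() accepts a tuple of prefixes, so no inner loop is needed.
    filtered_addresses = {url for url in addresses_in_html if not url.startswith(safe_url_list)}
    print(filtered_addresses)  # {'https://phish.example.com/login'}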