From 56dc2b53f6c41b234f0ba7eba6e888d91c03bc07 Mon Sep 17 00:00:00 2001 From: redshiftzero Date: Thu, 17 Sep 2020 19:33:26 -0400 Subject: [PATCH] onion-location: factor meta tag retrieval into function --- sites/management/commands/scan.py | 57 +++++++++++++++++++------------ 1 file changed, 36 insertions(+), 21 deletions(-) diff --git a/sites/management/commands/scan.py b/sites/management/commands/scan.py index db4d2693..e50897c5 100644 --- a/sites/management/commands/scan.py +++ b/sites/management/commands/scan.py @@ -14,6 +14,7 @@ TIMEOUT_REQUESTS = 5 + def pshtt(domain): pshtt_cmd = ['pshtt', '--json', '--timeout', '5', domain] @@ -31,14 +32,36 @@ def pshtt(domain): return pshtt_results, stdout, stderr +def is_onion_loc_in_meta_tag(url: str) -> Optional[bool]: + """ + Make request to target URL, parse page content and see if there is a + tag with format: + + <meta http-equiv="onion-location" content="http://myonion.onion"> + """ + try: + r = requests.get(url, timeout=TIMEOUT_REQUESTS) + tree = html.fromstring(r.content) + tags = tree.xpath('//meta[@http-equiv="onion-location"]/@content') + if len(tags) >= 1: + return True + except (etree.ParserError, requests.exceptions.RequestException) as e: + # Error when requesting or parsing the page content, we log and + # continue on. + logger.error(e) + return None + + return False + + def is_onion_available(pshtt_results) -> Optional[bool]: """ - For HTTPS sites, we inspect the headers to see if the - Onion-Location header is present, indicating that the - site is available as an onion service. + For HTTPS sites, we see if an Onion-Location is provided, indicating that + the site is available as an onion service. """ onion_available = False + # First we see if the header is provided. 
for key in ["https", "httpswww"]: try: headers = pshtt_results["endpoints"][key]["headers"] @@ -51,23 +74,14 @@ def is_onion_available(pshtt_results) -> Optional[bool]: # If the header is not provided, it's possible the news organization # has included it the HTML of the page in a meta tag using the `http-equiv` # attribute. - if not onion_available: - for key in ["https", "httpswww"]: - try: - r = requests.get(pshtt_results["endpoints"][key]["url"], - timeout=TIMEOUT_REQUESTS) - - tree = html.fromstring(r.content) - matching_meta_tags = tree.xpath('//meta[@http-equiv="onion-location"]/@content') - if len(matching_meta_tags) >= 1: - onion_available = True - return onion_available - except KeyError: - pass - except (etree.ParserError, requests.exceptions.RequestException) as e: - onion_available = None - # Error when requesting or parsing the page content, we log and continue on. - logger.error(e) + for key in ["https", "httpswww"]: + try: + url = pshtt_results["endpoints"][key]["url"] + onion_available = is_onion_loc_in_meta_tag(url) + if onion_available is not None: + return onion_available + except KeyError: + pass return onion_available @@ -78,7 +92,6 @@ def scan(site): scan = Scan( site=site, - onion_available=is_onion_available(results), live=results['Live'], @@ -92,6 +105,8 @@ def scan(site): hsts_preload_ready=results['HSTS Preload Ready'], hsts_preloaded=results['HSTS Preloaded'], + onion_available=is_onion_available(results), + pshtt_stdout=stdout, pshtt_stderr=stderr, )