From 56dc2b53f6c41b234f0ba7eba6e888d91c03bc07 Mon Sep 17 00:00:00 2001
From: redshiftzero <jen@freedom.press>
Date: Thu, 17 Sep 2020 19:33:26 -0400
Subject: [PATCH] onion-location: factor meta tag retrieval into function

---
 sites/management/commands/scan.py | 57 +++++++++++++++++++------------
 1 file changed, 36 insertions(+), 21 deletions(-)
diff --git a/sites/management/commands/scan.py b/sites/management/commands/scan.py
index db4d2693..e50897c5 100644
--- a/sites/management/commands/scan.py
+++ b/sites/management/commands/scan.py
@@ -14,6 +14,7 @@
 
 TIMEOUT_REQUESTS = 5
 
+
 def pshtt(domain):
     pshtt_cmd = ['pshtt', '--json', '--timeout', '5', domain]
 
@@ -31,14 +32,36 @@ def pshtt(domain):
     return pshtt_results, stdout, stderr
 
 
+def is_onion_loc_in_meta_tag(url: str) -> Optional[bool]:
+    """
+    Request `url` and report whether the page carries a meta tag of the form:
+
+    <meta http-equiv="onion-location" content="http://myonion.onion">
+
+    Returns True/False if the tag is present/absent, None on fetch/parse error.
+    """
+    # Browsers match http-equiv case-insensitively; XPath 1.0 `=` does not.
+    query = ('//meta[translate(@http-equiv, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", '
+             '"abcdefghijklmnopqrstuvwxyz")="onion-location"]/@content')
+    try:
+        r = requests.get(url, timeout=TIMEOUT_REQUESTS)
+        tags = html.fromstring(r.content).xpath(query)
+    except (etree.ParserError, requests.exceptions.RequestException) as e:
+        # Error when requesting or parsing the page content: log it and let
+        # the caller treat the result as unknown.
+        logger.error(e)
+        return None
+    return bool(tags)
+
+
 def is_onion_available(pshtt_results) -> Optional[bool]:
     """
-    For HTTPS sites, we inspect the headers to see if the
-    Onion-Location header is present, indicating that the
-    site is available as an onion service.
+    For HTTPS sites, we see if an Onion-Location is provided, indicating that
+    the site is available as an onion service.
     """
     onion_available = False
 
+    # First we see if the header is provided.
     for key in ["https", "httpswww"]:
         try:
             headers = pshtt_results["endpoints"][key]["headers"]
@@ -51,23 +74,14 @@ def is_onion_available(pshtt_results) -> Optional[bool]:
     # If the header is not provided, it's possible the news organization
     # has included it the HTML of the page in a meta tag using the `http-equiv`
     # attribute.
-    if not onion_available:
-        for key in ["https", "httpswww"]:
-            try:
-                r = requests.get(pshtt_results["endpoints"][key]["url"],
-                                 timeout=TIMEOUT_REQUESTS)
-
-                tree = html.fromstring(r.content)
-                matching_meta_tags = tree.xpath('//meta[@http-equiv="onion-location"]/@content')
-                if len(matching_meta_tags) >= 1:
-                    onion_available = True
-                    return onion_available
-            except KeyError:
-                pass
-            except (etree.ParserError, requests.exceptions.RequestException) as e:
-                onion_available = None
-                # Error when requesting or parsing the page content, we log and continue on.
-                logger.error(e)
+    for key in ["https", "httpswww"]:
+        try:
+            url = pshtt_results["endpoints"][key]["url"]
+            onion_available = is_onion_loc_in_meta_tag(url)
+            if onion_available is not None:
+                return onion_available
+        except KeyError:
+            pass
 
     return onion_available
 
@@ -78,7 +92,6 @@ def scan(site):
 
     scan = Scan(
         site=site,
-        onion_available=is_onion_available(results),
 
         live=results['Live'],
 
@@ -92,6 +105,8 @@ def scan(site):
         hsts_preload_ready=results['HSTS Preload Ready'],
         hsts_preloaded=results['HSTS Preloaded'],
 
+        onion_available=is_onion_available(results),
+
         pshtt_stdout=stdout,
         pshtt_stderr=stderr,
     )