Skip to content

Commit

Permalink
Switch HTTP requests to python-requests (ref #714) (#904)
Browse files Browse the repository at this point in the history
  • Loading branch information
mxsasha authored Apr 11, 2023
1 parent 9d9431b commit c5ba541
Show file tree
Hide file tree
Showing 12 changed files with 202 additions and 196 deletions.
102 changes: 102 additions & 0 deletions checks/http_client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
# Copyright: 2022, ECP, NLnet Labs and the Internet.nl contributors
# SPDX-License-Identifier: Apache-2.0
import socket
from timeit import default_timer as timer
from typing import Optional, Dict

import requests
import unbound
import urllib3
from forcediphttpsadapter.adapters import ForcedIPHTTPSAdapter

from checks.tasks import SetupUnboundContext
from checks.tasks.tls_connection import DEFAULT_TIMEOUT
from checks.tasks.tls_connection_exceptions import NoIpError
from interface.views.shared import ub_resolve_with_timeout
from internetnl import log

# Disable HTTPS warnings as we intentionally disable HTTPS verification
urllib3.disable_warnings()


def http_get(
    url: str, headers: Optional[Dict] = None, session: Optional[requests.Session] = None, *args, **kwargs
) -> requests.Response:
    """
    Perform a standard HTTP GET request, with one transparent retry.

    Args:
        url: the URL to fetch.
        headers: optional extra request headers. A User-Agent is always added.
            The caller's dict is never modified.
        session: optional requests.Session to reuse (connection pooling,
            mounted adapters); a fresh session is created when not given.
        *args, **kwargs: passed through to requests.Session.get.

    Returns:
        The requests.Response, opened with stream=True (body not yet read).

    Raises:
        requests.RequestException: when the request fails twice in a row.
    """
    start_time = timer()

    # Copy so that injecting the User-Agent does not mutate the caller's dict.
    headers = dict(headers) if headers else {}
    headers["User-Agent"] = "internetnl/1.0"
    if not session:
        session = requests.session()

    # Attempt the request twice: a single retry papers over transient
    # network hiccups; the second failure is logged and re-raised.
    last_exc = None
    for _attempt in range(2):
        try:
            response = session.get(url, headers=headers, stream=True, *args, **kwargs)
            break
        except requests.RequestException as exc:
            last_exc = exc
    else:
        log.debug(f"HTTP request raised exception: {url} (headers: {headers}): {last_exc}", exc_info=last_exc)
        raise last_exc

    log.debug(f"HTTP request completed in {timer()-start_time:.06f}s: {url} (headers: {headers})")
    return response


def http_get_ip(
    hostname: str,
    ip: str,
    port: int,
    path: str = "/",
    https: bool = True,
    headers: Optional[Dict] = None,
    *args,
    **kwargs,
) -> requests.Response:
    """
    Perform an HTTP GET with the given parameters, while forcing the destination IP
    to a particular IP that may not match the hostname.
    TLS certificate verification is always disabled.

    Args:
        hostname: the hostname used in the URL (https) or Host header (http).
        ip: the IP address the TCP connection is actually made to.
        port: destination TCP port.
        path: request path; a leading "/" is normalized away.
        https: use HTTPS when True, plain HTTP otherwise.
        headers: optional extra request headers; the caller's dict is never
            modified.
        *args, **kwargs: passed through to http_get / requests.get.

    Returns:
        The requests.Response from http_get.
    """
    path = path.lstrip("/")
    # Copy so that adding a Host header below does not mutate the caller's dict.
    headers = dict(headers) if headers else {}

    session = requests.session()
    if https:
        # Mount an adapter that forces the TCP connection to `ip` while the
        # URL keeps the hostname, so SNI and certificate handling see the
        # real name.
        session.mount(f"https://{hostname}", ForcedIPHTTPSAdapter(dest_ip=ip))
        url = f"https://{hostname}:{port}/{path}"
    else:
        # Plain HTTP: connect to the IP directly and carry the hostname in
        # the Host header. IPv6 literals must be bracketed in URLs.
        if ":" in ip:
            ip = f"[{ip}]"
        url = f"http://{ip}:{port}/{path}"
        headers["Host"] = hostname
    return http_get(url, verify=False, headers=headers, session=session, *args, **kwargs)


def http_get_af(
    hostname: str, port: int, af: socket.AddressFamily, task: Optional[SetupUnboundContext] = None, *args, **kwargs
) -> requests.Response:
    """
    Perform an HTTP GET request to the given hostname/port, restricting to a certain address family.

    Resolves A or AAAA records (through the task's resolver when one is given,
    otherwise via unbound directly) and tries each resulting address in turn,
    returning the first successful response. Other (kw)args are passed to
    http_get_ip / requests.get.

    Raises:
        NoIpError: when the hostname does not resolve in this address family.
        requests.RequestException: when every resolved address fails.
    """
    query_type = unbound.RR_TYPE_AAAA if af == socket.AF_INET6 else unbound.RR_TYPE_A
    if task:
        addresses = task.resolve(hostname, query_type)
    else:
        result = ub_resolve_with_timeout(hostname, query_type, unbound.RR_CLASS_IN, DEFAULT_TIMEOUT)
        addresses = [socket.inet_ntop(af, rdata) for rdata in result["data"].data]
    # Raised as-is when no address resolves; otherwise replaced by the last
    # request failure so the caller sees why the final attempt failed.
    last_error: Exception = NoIpError(f"Unable to resolve {query_type} record for host '{hostname}'")
    for address in addresses:
        try:
            return http_get_ip(hostname, address, port, *args, **kwargs)
        except requests.RequestException as request_exception:
            last_error = request_exception
    raise last_error
16 changes: 1 addition & 15 deletions checks/tasks/__init__.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,15 @@
# Copyright: 2022, ECP, NLnet Labs and the Internet.nl contributors
# SPDX-License-Identifier: Apache-2.0
import os
import requests
import socket
import time

import unbound
from celery import Task
from celery.exceptions import SoftTimeLimitExceeded
from django.conf import settings

import unbound
from internetnl import log
from timeit import default_timer as timer


class SetupUnboundContext(Task):
Expand Down Expand Up @@ -114,15 +112,3 @@ def callback(self, data, status, result):
data["data"] = result.data
data["rcode"] = result.rcode
data["done"] = True

def get(self, url, *args, **kwargs):
"""
Perform a HTTP GET request using the stored session
"""
start_time = timer()
if not hasattr(self, "_requests_session"):
self._requests_session = requests.session()

response = self._requests_session.get(url, *args, **kwargs)
log.debug(f"HTTP request completed in {timer()-start_time:.06f}s: {url}")
return response
33 changes: 12 additions & 21 deletions checks/tasks/http_headers.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
# Copyright: 2022, ECP, NLnet Labs and the Internet.nl contributors
# SPDX-License-Identifier: Apache-2.0
import http.client
import re
from collections import defaultdict, namedtuple

import requests

from checks import scoring
from checks.tasks.tls_connection import MAX_REDIRECT_DEPTH, http_fetch
from checks.tasks.tls_connection_exceptions import ConnectionHandshakeException, ConnectionSocketException, NoIpError
from checks.http_client import http_get_ip


def get_multiple_values_from_header(header):
Expand Down Expand Up @@ -700,31 +700,22 @@ def http_headers_check(af_ip_pair, domain, header_checkers, task):
for h in header_checkers:
results.update(h.get_positive_values())

put_headers = (("Accept-Encoding", "compress, deflate, exi, gzip, " "pack200-gzip, x-compress, x-gzip"),)
get_headers = [h.name for h in header_checkers]
put_headers = {"Accept-Encoding": "compress, deflate, exi, gzip, pack200-gzip, x-compress, x-gzip"}
try:
conn, res, headers, visited_hosts = http_fetch(
domain,
af=af_ip_pair[0],
path="",
response = http_get_ip(
hostname=domain,
ip=af_ip_pair[1],
port=443,
task=task,
ip_address=af_ip_pair[1],
put_headers=put_headers,
depth=MAX_REDIRECT_DEPTH,
needed_headers=get_headers,
headers=put_headers,
allow_redirects=False,
)
except (OSError, http.client.BadStatusLine, NoIpError, ConnectionHandshakeException, ConnectionSocketException):
except requests.RequestException:
# Not able to connect, return negative values
for h in header_checkers:
results.update(h.get_negative_values())
results["server_reachable"] = False
else:
if 443 in headers:
for name, value in headers[443]:
for header_checker in header_checkers:
if name == header_checker.name:
header_checker.check(value, results, domain)
break
for header_checker in header_checkers:
header_checker.check(response.headers.get(header_checker.name), results, domain)

return results
50 changes: 20 additions & 30 deletions checks/tasks/ipv6.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
# Copyright: 2022, ECP, NLnet Labs and the Internet.nl contributors
# SPDX-License-Identifier: Apache-2.0
import http.client
import ipaddress
import socket
import time
from difflib import SequenceMatcher
from typing import List

import requests
from bs4 import BeautifulSoup
from celery import shared_task
from celery.exceptions import SoftTimeLimitExceeded
Expand All @@ -15,16 +15,17 @@
from django.db import transaction

from checks import categories, scoring
from checks.http_client import http_get_af
from checks.models import DomainTestIpv6, MailTestIpv6, MxDomain, MxStatus, NsDomain, WebDomain
from checks.tasks import SetupUnboundContext, dispatcher, shared
from checks.tasks.dispatcher import check_registry
from checks.tasks.tls_connection import http_fetch
from checks.tasks.tls_connection_exceptions import ConnectionHandshakeException, ConnectionSocketException, NoIpError
from interface import batch, batch_shared_task, redis_id
from interface.views.shared import pretty_domain_name
from internetnl import log
from unbound import RR_CLASS_IN, RR_TYPE_A, RR_TYPE_AAAA, RR_TYPE_NS, ub_ctx

SIMHASH_MAX_RESPONSE_SIZE = 500000

# mapping tasks to models
model_map = dict(web=WebDomain, ns=NsDomain, mx=MxDomain)

Expand Down Expand Up @@ -557,47 +558,36 @@ def strip_irrelevant_html(html):
simhash_score = scoring.WEB_IPV6_WS_SIMHASH_FAIL
distance = settings.SIMHASH_MAX + 100

v4_conn = None
v6_conn = None
v4_response = None
v6_response = None
for port in [80, 443]:
try:
v4_conn, v4_res, _, _ = http_fetch(url, socket.AF_INET, port=port, task=task, keep_conn_open=True)
v6_conn, v6_res, _, _ = http_fetch(url, socket.AF_INET6, port=port, task=task, keep_conn_open=True)
v4_response = http_get_af(hostname=url, port=port, af=socket.AF_INET, task=task, https=port == 443)
v6_response = http_get_af(hostname=url, port=port, af=socket.AF_INET6, task=task, https=port == 443)
break
except (OSError, NoIpError, http.client.BadStatusLine, ConnectionHandshakeException, ConnectionSocketException):
except requests.RequestException:
# Could not connect on given port, try another port.
# If we managed to connect on IPv4 however, fail the test.
if v4_conn:
v4_conn.close()
if v4_response:
return simhash_score, distance

if not v4_conn:
if not v4_response:
# FAIL: Could not establish a connection on both addresses.
return simhash_score, distance

html_v4 = ""
html_v6 = ""
try:
# read max 0.5MB
html_v4 = v4_res.read(500000)
v4_conn.close()
v4_conn = None

html_v6 = v6_res.read(500000)
v6_conn.close()
v6_conn = None
except http.client.IncompleteRead:
log.debug(
"simhash IncompleteRead content > 5000000 - if this happens more often we may "
"need to enlarge it, logging for statistical purposes"
)
except OSError:
if v4_conn:
v4_conn.close()
if v6_conn:
v6_conn.close()
html_v4 = next(v4_response.iter_content(SIMHASH_MAX_RESPONSE_SIZE))
html_v6 = next(v6_response.iter_content(SIMHASH_MAX_RESPONSE_SIZE))
except (OSError, IOError) as exc:
log.debug("simhash encountered exception while reading response: {exc}", exc_info=exc)
return simhash_score, distance

for html, response in (html_v4, v4_response), (html_v6, v6_response):
content_length = response.headers.get("content-length", "")
if content_length.isnumeric() and len(html) < int(content_length):
log.debug(f"simhash only read first {SIMHASH_MAX_RESPONSE_SIZE} out of {content_length} bytes")

html_v4 = strip_irrelevant_html(html_v4)
html_v6 = strip_irrelevant_html(html_v6)
sim = SequenceMatcher(None, html_v4, html_v6)
Expand Down
8 changes: 4 additions & 4 deletions checks/tasks/mail.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,12 @@

import unbound
from checks import DMARC_NON_SENDING_POLICY, DMARC_NON_SENDING_POLICY_ORG, SPF_NON_SENDING_POLICY, categories, scoring
from checks.http_client import http_get
from checks.models import DmarcPolicyStatus, MailTestAuth, SpfPolicyStatus
from checks.tasks import SetupUnboundContext
from checks.tasks.dispatcher import check_registry, post_callback_hook
from checks.tasks.dmarc_parser import parse as dmarc_parse
from checks.tasks.spf_parser import parse as spf_parse
from checks.tasks.tls_connection import http_get
from interface import batch, batch_shared_task, redis_id
from internetnl import log

Expand Down Expand Up @@ -759,11 +759,11 @@ def dmarc_fetch_public_suffix_list():
"""
public_suffix_list = []
r = http_get(settings.PUBLIC_SUFFIX_LIST_URL)
if not r:
response = http_get(settings.PUBLIC_SUFFIX_LIST_URL)
if response.status_code != 200 or not response.text:
return public_suffix_list

lines = r.text.split("\n")
lines = response.text.split("\n")
for line in lines:
labels = []
line = line.rstrip()
Expand Down
4 changes: 3 additions & 1 deletion checks/tasks/routing.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@

from typing import Any, Dict, List, NewType, Tuple, Type, TypeVar

from checks.http_client import http_get

Asn = NewType("Asn", int)
Ip = NewType("Ip", str)
Prefix = NewType("Prefix", str)
Expand Down Expand Up @@ -309,7 +311,7 @@ def query(task: T, asn: Asn, prefix: Prefix) -> Dict:
"""
request = f"{settings.ROUTINATOR_URL}/{asn}/{prefix}"
try:
response = task.get(request)
response = http_get(request)

# throw exception during Routinator initialization
response.raise_for_status()
Expand Down
Loading

0 comments on commit c5ba541

Please sign in to comment.