Skip to content
This repository has been archived by the owner on Dec 11, 2024. It is now read-only.

Commit

Permalink
Web Connector to only allow Global IPs (onyx-dot-app#1357)
Browse files Browse the repository at this point in the history
  • Loading branch information
yuhongsun96 authored Apr 20, 2024
1 parent 7d51549 commit f616b7e
Showing 1 changed file with 49 additions and 15 deletions.
64 changes: 49 additions & 15 deletions backend/danswer/connectors/web/connector.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import io
import ipaddress
import socket
from enum import Enum
from typing import Any
from typing import cast
Expand Down Expand Up @@ -27,7 +29,6 @@
from danswer.connectors.models import Section
from danswer.utils.logger import setup_logger


logger = setup_logger()


Expand All @@ -43,16 +44,33 @@ class WEB_CONNECTOR_VALID_SETTINGS(str, Enum):


def protected_url_check(url: str) -> None:
"""Couple considerations:
- DNS mapping changes over time so we don't want to cache the results
- Fetching this is assumed to be relatively fast compared to other bottlenecks like reading
the page or embedding the contents
- To be extra safe, all IPs associated with the URL must be global
"""
parse = urlparse(url)
if parse.scheme == "file":
raise ValueError("Not permitted to read local files via Web Connector.")
if (
parse.scheme == "localhost"
or parse.scheme == "127.0.0.1"
or parse.hostname == "localhost"
or parse.hostname == "127.0.0.1"
):
raise ValueError("Not permitted to read localhost urls.")
if parse.scheme != "http" and parse.scheme != "https":
raise ValueError("URL must be of scheme https?://")

if not parse.hostname:
raise ValueError("URL must include a hostname")

try:
# This may give a large list of IP addresses for domains with extensive DNS configurations
# such as large distributed systems of CDNs
info = socket.getaddrinfo(parse.hostname, None)
except socket.gaierror as e:
raise ConnectionError(f"DNS resolution failed for {parse.hostname}: {e}")

for address in info:
ip = address[4][0]
if not ipaddress.ip_address(ip).is_global:
raise ValueError(
f"Non-global IP address detected: {ip}, skipping page {url}. "
f"The Web Connector is not allowed to read loopback, link-local, or private ranges"
)


def check_internet_connection(url: str) -> None:
Expand Down Expand Up @@ -194,6 +212,10 @@ def load_from_state(self) -> GenerateDocumentsOutput:
base_url = to_visit[0] # For the recursive case
doc_batch: list[Document] = []

# Needed to report error
at_least_one_doc = False
last_error = None

playwright, context = start_playwright()
restart_playwright = False
while to_visit:
Expand All @@ -202,7 +224,12 @@ def load_from_state(self) -> GenerateDocumentsOutput:
continue
visited_links.add(current_url)

protected_url_check(current_url)
try:
protected_url_check(current_url)
except Exception as e:
last_error = f"Invalid URL {current_url} due to {e}"
logger.warning(last_error)
continue

logger.info(f"Visiting {current_url}")

Expand Down Expand Up @@ -251,9 +278,8 @@ def load_from_state(self) -> GenerateDocumentsOutput:
to_visit.append(link)

if page_response and str(page_response.status)[0] in ("4", "5"):
logger.info(
f"Skipped indexing {current_url} due to HTTP {page_response.status} response"
)
last_error = f"Skipped indexing {current_url} due to HTTP {page_response.status} response"
logger.info(last_error)
continue

parsed_html = web_html_cleanup(soup, self.mintlify_cleanup)
Expand All @@ -272,21 +298,29 @@ def load_from_state(self) -> GenerateDocumentsOutput:

page.close()
except Exception as e:
logger.error(f"Failed to fetch '{current_url}': {e}")
last_error = f"Failed to fetch '{current_url}': {e}"
logger.error(last_error)
playwright.stop()
restart_playwright = True
continue

if len(doc_batch) >= self.batch_size:
playwright.stop()
restart_playwright = True
at_least_one_doc = True
yield doc_batch
doc_batch = []

if doc_batch:
playwright.stop()
at_least_one_doc = True
yield doc_batch

if not at_least_one_doc:
if last_error:
raise RuntimeError(last_error)
raise RuntimeError("No valid pages found.")


if __name__ == "__main__":
connector = WebConnector("https://docs.danswer.dev/")
Expand Down

0 comments on commit f616b7e

Please sign in to comment.