diff --git a/backend/danswer/connectors/web/connector.py b/backend/danswer/connectors/web/connector.py
index 6114f0a867a..c5fb6db26d4 100644
--- a/backend/danswer/connectors/web/connector.py
+++ b/backend/danswer/connectors/web/connector.py
@@ -1,4 +1,6 @@
 import io
+import ipaddress
+import socket
 from enum import Enum
 from typing import Any
 from typing import cast
@@ -27,7 +29,6 @@ from danswer.connectors.models import Section
 from danswer.utils.logger import setup_logger
 
-
 logger = setup_logger()
@@ -43,16 +44,33 @@ class WEB_CONNECTOR_VALID_SETTINGS(str, Enum):
 
 
 def protected_url_check(url: str) -> None:
+    """Couple considerations:
+    - DNS mapping changes over time so we don't want to cache the results
+    - Fetching this is assumed to be relatively fast compared to other bottlenecks like reading
+      the page or embedding the contents
+    - To be extra safe, all IPs associated with the URL must be global
+    """
     parse = urlparse(url)
-    if parse.scheme == "file":
-        raise ValueError("Not permitted to read local files via Web Connector.")
-    if (
-        parse.scheme == "localhost"
-        or parse.scheme == "127.0.0.1"
-        or parse.hostname == "localhost"
-        or parse.hostname == "127.0.0.1"
-    ):
-        raise ValueError("Not permitted to read localhost urls.")
+    if parse.scheme != "http" and parse.scheme != "https":
+        raise ValueError("URL must be of scheme https?://")
+
+    if not parse.hostname:
+        raise ValueError("URL must include a hostname")
+
+    try:
+        # This may give a large list of IP addresses for domains with extensive DNS configurations
+        # such as large distributed systems of CDNs
+        info = socket.getaddrinfo(parse.hostname, None)
+    except socket.gaierror as e:
+        raise ConnectionError(f"DNS resolution failed for {parse.hostname}: {e}")
+
+    for address in info:
+        ip = address[4][0]
+        if not ipaddress.ip_address(ip).is_global:
+            raise ValueError(
+                f"Non-global IP address detected: {ip}, skipping page {url}. "
+                f"The Web Connector is not allowed to read loopback, link-local, or private ranges"
+            )
 
 
 def check_internet_connection(url: str) -> None:
@@ -194,6 +212,10 @@ def load_from_state(self) -> GenerateDocumentsOutput:
         base_url = to_visit[0]  # For the recursive case
         doc_batch: list[Document] = []
 
+        # Needed to report error
+        at_least_one_doc = False
+        last_error = None
+
         playwright, context = start_playwright()
         restart_playwright = False
         while to_visit:
@@ -202,7 +224,12 @@
                 continue
             visited_links.add(current_url)
 
-            protected_url_check(current_url)
+            try:
+                protected_url_check(current_url)
+            except Exception as e:
+                last_error = f"Invalid URL {current_url} due to {e}"
+                logger.warning(last_error)
+                continue
 
             logger.info(f"Visiting {current_url}")
@@ -251,9 +278,8 @@
                         to_visit.append(link)
 
             if page_response and str(page_response.status)[0] in ("4", "5"):
-                logger.info(
-                    f"Skipped indexing {current_url} due to HTTP {page_response.status} response"
-                )
+                last_error = f"Skipped indexing {current_url} due to HTTP {page_response.status} response"
+                logger.info(last_error)
                 continue
 
             parsed_html = web_html_cleanup(soup, self.mintlify_cleanup)
@@ -272,7 +298,8 @@
                 page.close()
             except Exception as e:
-                logger.error(f"Failed to fetch '{current_url}': {e}")
+                last_error = f"Failed to fetch '{current_url}': {e}"
+                logger.error(last_error)
                 playwright.stop()
                 restart_playwright = True
                 continue
@@ -280,13 +307,20 @@
             if len(doc_batch) >= self.batch_size:
                 playwright.stop()
                 restart_playwright = True
+                at_least_one_doc = True
                 yield doc_batch
                 doc_batch = []
 
         if doc_batch:
             playwright.stop()
+            at_least_one_doc = True
            yield doc_batch
 
+        if not at_least_one_doc:
+            if last_error:
+                raise RuntimeError(last_error)
+            raise RuntimeError("No valid pages found.")
+
 
 if __name__ == "__main__":
     connector = WebConnector("https://docs.danswer.dev/")