From de4b0199ce9e0b4175221e36b162a29f803650cc Mon Sep 17 00:00:00 2001 From: kazet Date: Wed, 3 Jan 2024 09:45:04 +0100 Subject: [PATCH] Detecting duplicate with parent has been moved to scanning stage to speed up report generation (#65) --- autoreporter_addons/ssl_checks/reporter.py | 19 +----------------- karton_ssl_checks/karton_ssl_checks.py | 23 ++++++++++++++++++++++ 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/autoreporter_addons/ssl_checks/reporter.py b/autoreporter_addons/ssl_checks/reporter.py index 4d7ac8f..d82f4db 100644 --- a/autoreporter_addons/ssl_checks/reporter.py +++ b/autoreporter_addons/ssl_checks/reporter.py @@ -1,4 +1,3 @@ -from difflib import SequenceMatcher from pathlib import Path from typing import Any, Callable, Dict, List from urllib.parse import urlparse @@ -14,7 +13,7 @@ from artemis.reporting.base.report_type import ReportType from artemis.reporting.base.reporter import Reporter from artemis.reporting.base.templating import ReportEmailTemplateFragment -from artemis.reporting.utils import cached_get, get_top_level_target +from artemis.reporting.utils import get_top_level_target from bs4 import BeautifulSoup from extra_modules_config import ExtraModulesConfig @@ -61,22 +60,6 @@ def create_reports(task_result: Dict[str, Any], language: Language) -> List[Repo if not isinstance(result, dict): return [] - try: - response = cached_get(f"https://{domain}") - parent_response = cached_get(f"https://{'.'.join(domain_parts[1:])}") - if SequenceMatcher(None, response.content, parent_response.content).quick_ratio() >= 0.8: - # Do not report misconfigurations if a domain has identical content to a parent domain - e.g. - # if we have mail.domain.com with identical content to domain.com, we assume that it's domain.com - # which is actually used, and therefore don't report subdomains. - return [] - except Exception: - logger.warning( - f"Unable to check whether domain {domain} has similar content to parent domain. Artemis SSL check " - "module tries to reduce the number of false positives by skipping reports where domain has similar " - "content to parent domain, as there are cases where e.g. mail.example.com serves the same content " - "as example.com. If this fails, two similar reports may get sent." - ) - if "response_status_code" in result and "response_content_prefix" in result: response_status_code = result["response_status_code"] response_content_prefix = result["response_content_prefix"] diff --git a/karton_ssl_checks/karton_ssl_checks.py b/karton_ssl_checks/karton_ssl_checks.py index ddedbff..a3de32f 100644 --- a/karton_ssl_checks/karton_ssl_checks.py +++ b/karton_ssl_checks/karton_ssl_checks.py @@ -2,6 +2,7 @@ import datetime import subprocess import urllib.parse +from difflib import SequenceMatcher from typing import Any, Dict, List import requests @@ -49,6 +50,28 @@ def run(self, current_task: Task) -> None: self.db.save_task_result(task=current_task, status=TaskStatus.OK) return + try: + response = http_requests.get(f"https://{domain}") + parent_domain = ".".join(domain_parts[1:]) + parent_response = http_requests.get(f"https://{parent_domain}") + if SequenceMatcher(None, response.content, parent_response.content).quick_ratio() >= 0.8: + # Do not report misconfigurations if a domain has identical content to a parent domain - e.g. + # if we have mail.domain.com with identical content to domain.com, we assume that it's domain.com + # which is actually used, and therefore don't report subdomains. + self.db.save_task_result( + task=current_task, + status=TaskStatus.OK, + status_reason=f"Detected that {domain} has similar content to {parent_domain}, not scanning to avoid duplicate reports", + ) + return + except Exception: + self.log.exception( + f"Unable to check whether domain {domain} has similar content to parent domain. Artemis SSL check " + "module tries to reduce the number of false positives by skipping scanning domains when domain has " + "similar content to parent domain, as there are cases where e.g. mail.example.com serves the same " + "content as example.com.", + ) + messages = [] result: Dict[str, Any] = {}