From c5ba13084247939b7f894dc0113eeef0ac724b95 Mon Sep 17 00:00:00 2001 From: Nekmo Date: Wed, 9 Aug 2023 07:56:49 +0200 Subject: [PATCH] Issue #83: Use Asyncio --- dirhunt/colors.py | 12 +-- dirhunt/crawler.py | 19 ++-- dirhunt/crawler_url.py | 212 ++++++++++++++++++++++++--------------- dirhunt/processors.py | 222 ++++++++++++++++++++++------------------- dirhunt/url.py | 7 +- dirhunt/utils.py | 1 + 6 files changed, 276 insertions(+), 197 deletions(-) diff --git a/dirhunt/colors.py b/dirhunt/colors.py index 3de69fa..983cbba 100644 --- a/dirhunt/colors.py +++ b/dirhunt/colors.py @@ -3,14 +3,14 @@ def status_code_colors(status_code): if 100 <= status_code < 200: - return Fore.WHITE + return "white" elif 200 == status_code: - return Fore.LIGHTGREEN_EX + return "green1" elif 200 < status_code < 300: - return Fore.GREEN + return "green3" elif 300 <= status_code < 400: - return Fore.LIGHTBLUE_EX + return "deep_sky_blue1" elif 500 == status_code: - return Fore.LIGHTMAGENTA_EX + return "magenta1" else: - return Fore.MAGENTA + return "medium_orchid1" diff --git a/dirhunt/crawler.py b/dirhunt/crawler.py index f43b281..cf4aa73 100644 --- a/dirhunt/crawler.py +++ b/dirhunt/crawler.py @@ -8,9 +8,11 @@ from concurrent.futures.thread import _python_exit from threading import Lock, ThreadError import datetime +from typing import Optional import humanize as humanize from click import get_terminal_size +from rich.console import Console from dirhunt import processors from dirhunt import __version__ @@ -65,12 +67,13 @@ def __init__(self, configuration: Configuration, loop: asyncio.AbstractEventLoop self.configuration = configuration self.loop = loop self.tasks = set() + self.crawler_urls = set() + self.domains = set() + self.console = Console(highlight=False) self.session = Session() self.domain_semaphore = DomainSemaphore(configuration.concurrency) - self.domains = set() self.results = Queue() self.index_of_processors = [] - self.processing = {} self.processed = {} self.add_lock = Lock() self.start_dt = datetime.datetime.now() @@ -82,16 +85,18 @@ async def start(self): await self.add_crawler_url( CrawlerUrl(self, url, depth=self.configuration.max_depth) ) - await asyncio.wait(self.tasks) + while self.tasks: + await asyncio.wait(self.tasks) - async def add_crawler_url(self, crawler_url: CrawlerUrl): + async def add_crawler_url(self, crawler_url: CrawlerUrl) -> Optional[asyncio.Task]: """Add crawler_url to tasks""" - if crawler_url.url.url in self.processing: + if crawler_url.url.url in self.crawler_urls: return task = self.loop.create_task(crawler_url.retrieve()) self.tasks.add(task) - self.processing[crawler_url.url.url] = task - task.add_done_callback(lambda: self.tasks.discard(task)) + self.crawler_urls.add(crawler_url) + task.add_done_callback(self.tasks.discard) + return task def add_init_urls(self, *urls): """Add urls to queue.""" diff --git a/dirhunt/crawler_url.py b/dirhunt/crawler_url.py index 0d8d463..8a2b9da 100644 --- a/dirhunt/crawler_url.py +++ b/dirhunt/crawler_url.py @@ -1,8 +1,9 @@ # -*- coding: utf-8 -*- import cgi import socket -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any, Optional +from aiohttp.web_response import Response from bs4 import BeautifulSoup from requests import RequestException from urllib3.exceptions import ReadTimeoutError @@ -19,9 +20,86 @@ if TYPE_CHECKING: from dirhunt.crawler import Crawler + from dirhunt.processors import ProcessBase -class CrawlerUrl(object): +class CrawlerUrlRequest: + response = Optional[Response] + content: Optional[str] 
= None + _soup: Optional[BeautifulSoup] = None + + def __init__(self, crawler_url: "CrawlerUrl"): + self.crawler_url = crawler_url + self.crawler = crawler_url.crawler + + async def retrieve(self) -> "ProcessBase": + from dirhunt.processors import ( + get_processor, + Error, + ) + + text = "" + try: + await self.crawler.domain_semaphore.acquire(self.crawler_url.url.domain) + pass + async with self.crawler.session.get( + self.crawler_url.url.url, + verify_ssl=False, + timeout=self.crawler.configuration.timeout, + allow_redirects=False, + ) as response: + self.crawler_url.set_type(response.headers.get("Content-Type")) + self.crawler_url.flags.add(str(response.status)) + self.response = response + processor = get_processor(self) + if processor and processor.requires_content: + encoding = response.get_encoding() + self.content = ( + await response.content.read(MAX_RESPONSE_SIZE) + ).decode(encoding, errors="ignore") + if processor.has_descendants: + processor = get_processor(self) + # text = "" + # soup = None + # processor = None + # if response.status_code < 300 and self.must_be_downloaded(response): + # try: + # text = response.raw.read(MAX_RESPONSE_SIZE, decode_content=True) + # except (RequestException, ReadTimeoutError, socket.timeout) as e: + # self.crawler.current_processed_count += 1 + # self.crawler.results.put(Error(self, e)) + # self.close() + # return self + # content_type = cgi.parse_header( + # response.headers.get("Content-Type", "") + # )[0] + # soup = ( + # BeautifulSoup(text, "html.parser") + # if content_type == "text/html" + # else None + # ) + except RequestException as e: + self.crawler.current_processed_count += 1 + processor = Error(self, e) + else: + await processor.process(self) + finally: + self.crawler.domain_semaphore.release(self.crawler_url.url.domain) + return processor + + @property + def soup(self): + if self._soup is None and self.content is not None: + self._soup = BeautifulSoup(self.content, "html.parser") + return self._soup + + def __repr__(self): + return "".format(self.crawler_url.url) + + +class CrawlerUrl: + processor: Optional["ProcessBase"] = None + def __init__( self, crawler: "Crawler", @@ -50,13 +128,13 @@ def __init__( self.exists = exists self.url_type = url_type if url.is_valid() and (not url.path or url.path == "/"): - self.type = "directory" + self.url_type = "directory" self.resp = None self.processor_data = None - def add_self_directories(self, exists=None, url_type=None): + async def add_self_directories(self, exists=None, url_type=None): for url in self.url.breadcrumb(): - self.crawler.add_crawler_url( + await self.crawler.add_crawler_url( CrawlerUrl( self.crawler, url, @@ -66,91 +144,56 @@ def add_self_directories(self, exists=None, url_type=None): url_type, ) ) - # TODO: si no se puede añadir porque ya se ha añadido, establecer como que ya existe si la orden es exists + # TODO: if exists=True and the urls is already processed before add it, but the already processed + # url has exists=False, then update the exists to True async def retrieve(self): - from dirhunt.processors import ( - get_processor, - GenericProcessor, - Error, - ProcessIndexOfRequest, - ) - - try: - await self.crawler.domain_semaphore.acquire(self.url.domain) - async with self.crawler.session.get( - self.url.url, - verify_ssl=False, - timeout=self.crawler.configuration.timeout, - allow_redirects=False, - ) as resp: - self.set_type(resp.headers.get("Content-Type")) - self.flags.add(str(resp.status)) - text = "" - soup = None - processor = None - if resp.status_code < 
300 and self.must_be_downloaded(resp): - try: - text = resp.raw.read(MAX_RESPONSE_SIZE, decode_content=True) - except (RequestException, ReadTimeoutError, socket.timeout) as e: - self.crawler.current_processed_count += 1 - self.crawler.results.put(Error(self, e)) - self.close() - return self - content_type = cgi.parse_header( - resp.headers.get("Content-Type", "") - )[0] - soup = ( - BeautifulSoup(text, "html.parser") - if content_type == "text/html" - else None - ) - except RequestException as e: - self.crawler.current_processed_count += 1 - self.crawler.results.put(Error(self, e)) - self.close() - return self - finally: - self.crawler.domain_semaphore.release(self.url.domain) - - if self.must_be_downloaded(resp): - processor = get_processor(resp, text, self, soup) or GenericProcessor( - resp, self - ) - processor.process(text, soup) - self.flags.update(processor.flags) - if self.maybe_directory(): - self.crawler.results.put(processor) - if processor is not None: - self.processor_data = processor.json() - if processor and isinstance(processor, ProcessIndexOfRequest): - self.crawler.index_of_processors.append(processor) - else: - self.crawler.current_processed_count += 1 - # TODO: Podemos fijarnos en el processor.index_file. Si existe y es un 200, entonces es que existe. - if self.exists is None and resp.status_code < 404: + from processors import GenericProcessor + + crawler_url_request = CrawlerUrlRequest(self) + processor = await crawler_url_request.retrieve() + if processor is not None and not isinstance(processor, GenericProcessor): + self.crawler.console.print(processor.get_text()) + # if self.must_be_downloaded(response): + # processor = get_processor(response, text, self, soup) or GenericProcessor( + # response, self + # ) + # processor.process(text, soup) + # self.flags.update(processor.flags) + # if self.maybe_directory(): + # self.crawler.results.put(processor) + # if processor is not None: + # self.processor_data = processor.json() + # if processor and isinstance(processor, ProcessIndexOfRequest): + # self.crawler.index_of_processors.append(processor) + # else: + # self.crawler.current_processed_count += 1 + if ( + self.exists is None + and crawler_url_request.response is not None + and crawler_url_request.response.status < 404 + ): self.exists = True - self.add_self_directories( - True if (not self.maybe_rewrite() and self.exists) else None, - "directory" if not self.maybe_rewrite() else None, - ) - self.close() - return self + # TODO: uncomment + # await self.add_self_directories( + # True if (not self.maybe_rewrite() and self.exists) else None, + # "directory" if not self.maybe_rewrite() else None, + # ) def set_type(self, content_type): from dirhunt.processors import INDEX_FILES - if not self.type and not (content_type or "").startswith("text/html"): - self.type = "asset" + if not self.url_type and not (content_type or "").startswith("text/html"): + self.url_type = "asset" if ( - not self.type + not self.url_type and (content_type or "").startswith("text/html") and self.url.name in INDEX_FILES ): - self.type = "document" + self.url_type = "document" def maybe_rewrite(self): - return self.type not in ["asset", "directory"] + return self.url_type not in ["asset", "directory"] def must_be_downloaded(self, response): """The file must be downloaded to obtain information.""" @@ -174,10 +217,6 @@ def weight(self): value -= len(list(self.url.breadcrumb())) * 1.5 return value - def close(self): - self.crawler.processed[self.url.url] = self - del self.crawler.processing[self.url.url] 
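For reference, the coroutine-based request flow introduced in CrawlerUrlRequest.retrieve() boils down to the following minimal, self-contained sketch. Names here are hypothetical: a plain asyncio.Semaphore stands in for Crawler.domain_semaphore, a hard-coded constant stands in for MAX_RESPONSE_SIZE, and a simple redirect-vs-HTML check replaces the real get_processor() lookup.

import asyncio
from typing import Optional

import aiohttp

MAX_RESPONSE_SIZE = 512 * 1024  # assumption: dirhunt caps how much of each body it reads


async def fetch(session: aiohttp.ClientSession, semaphore: asyncio.Semaphore, url: str) -> str:
    """Simplified CrawlerUrlRequest.retrieve(): limit concurrency, request without
    following redirects, and read the body only when a content-based processor needs it."""
    async with semaphore:  # stands in for the per-domain Crawler.domain_semaphore
        async with session.get(url, allow_redirects=False) as response:
            is_redirect = 300 <= response.status < 400
            content: Optional[str] = None
            if not is_redirect:  # the real code checks processor.requires_content here
                encoding = response.get_encoding()
                raw = await response.content.read(MAX_RESPONSE_SIZE)
                content = raw.decode(encoding, errors="ignore")
            kind = "redirect" if is_redirect else f"{len(content or '')} chars read"
            return f"[{response.status}] {url} ({kind})"


async def main() -> None:
    semaphore = asyncio.Semaphore(10)  # the real crawler keys one semaphore per domain
    async with aiohttp.ClientSession() as session:
        print(await fetch(session, semaphore, "http://example.com/"))


if __name__ == "__main__":
    asyncio.run(main())

The async with form used above is equivalent to the acquire()/release() pair wrapped in try/finally in the patch: the per-domain slot is released even if the request raises. On the crawler side, each CrawlerUrl.retrieve() is wrapped in loop.create_task(); keeping the task in Crawler.tasks, discarding it via task.add_done_callback(self.tasks.discard), and looping "while self.tasks: await asyncio.wait(self.tasks)" both prevents the tasks from being garbage-collected and lets start() wait for work scheduled while earlier tasks are still running.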
- def json(self): return { "flags": self.flags, @@ -186,3 +225,14 @@ def json(self): "type": self.type, "exists": self.exists, } + + def __repr__(self): + return f"" + + def __eq__(self, other: Any) -> bool: + if not isinstance(other, CrawlerUrl): + return False + return self.url.url == other.url.url + + def __hash__(self): + return hash(self.url.url) diff --git a/dirhunt/processors.py b/dirhunt/processors.py index 4a07cd9..5a18b49 100644 --- a/dirhunt/processors.py +++ b/dirhunt/processors.py @@ -1,6 +1,10 @@ # -*- coding: utf-8 -*- import re import sys +from typing import List, Type + +from aiohttp.web_response import Response +from rich.text import Text from dirhunt.directory_lists import get_directory_list @@ -12,7 +16,7 @@ from colorama import Fore, Back from dirhunt.colors import status_code_colors -from dirhunt.crawler_url import CrawlerUrl +from dirhunt.crawler_url import CrawlerUrl, CrawlerUrlRequest from dirhunt.url import Url, full_url_address from dirhunt.url_loop import is_url_loop from dirhunt.utils import colored @@ -61,63 +65,56 @@ """ -class ProcessBase(object): +class ProcessBase: name = "" key_name = "" index_file = None - status_code = 0 + status_code = 0 # TODO: rename to status + requires_content = False + # If the processor has descendants, use get_processor after retrieve the content + # to get the correct processor + has_descendants = False - def __init__(self, response, crawler_url): + def __init__(self, crawler_url_request): """ - - :type crawler_url: CrawlerUrl or None + :type crawler_url_request: CrawlerUrlRequest """ - # TODO: hay que pensar en no pasar response, text y soup por aquí para establecerlo en self, - # para no llenar la memoria. Deben ser cosas "volátiles". - if response is not None: - self.status_code = response.status_code - # TODO: procesar otras cosas (css, etc.) 
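The move from colorama to rich means processors now build rich.text.Text objects that the crawler prints through its shared Console instead of concatenating colored strings. A standalone sketch of what the new status_code_colors() mapping and ProcessBase.get_url_line_text() combine to produce (simplified, with the processor name passed in as a plain string):

from rich.console import Console
from rich.text import Text


def status_code_style(status_code: int) -> str:
    """Same mapping as the new dirhunt.colors.status_code_colors(): rich style names."""
    if 100 <= status_code < 200:
        return "white"
    if status_code == 200:
        return "green1"
    if 200 < status_code < 300:
        return "green3"
    if 300 <= status_code < 400:
        return "deep_sky_blue1"
    if status_code == 500:
        return "magenta1"
    return "medium_orchid1"


def url_line(status_code: int, url: str, processor_name: str) -> Text:
    """Roughly what ProcessBase.get_url_line_text() builds for Console.print()."""
    text = Text()
    text.append(f"[{status_code}]", status_code_style(status_code))
    text.append(f" {url} ")
    text.append(f"({processor_name})", "gold1")
    return text


console = Console(highlight=False)  # highlight=False, as in Crawler.__init__, avoids rich auto-styling the URL
console.print(url_line(301, "http://example.com/old/", "Redirect"))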
- self.crawler_url = crawler_url + if crawler_url_request.response is not None: + self.status_code = crawler_url_request.response.status + # The crawler_url_request takes a lot of memory, so we don't save it + self.crawler_url = crawler_url_request.crawler_url self.keywords_found = set() - def search_index_files(self): - if self.crawler_url.type not in ["directory", None]: + async def search_index_files(self): + if self.crawler_url.url_type not in ["directory", None]: return crawler = self.crawler_url.crawler for index_file in INDEX_FILES: url = self.crawler_url.url.copy() url.set_children(index_file) - future = self.crawler_url.crawler.add_url( - CrawlerUrl( - crawler, - url, - self.crawler_url.depth - 1, - self, - None, - "document", - timeout=self.crawler_url.timeout, - ), - True, + sub_crawler_url = CrawlerUrl( + crawler, + url, + self.crawler_url.depth - 1, + self, + None, + "document", ) - if self.crawler_url.crawler.closing or future is None: - return - result = future.result() - if result.exists: + await self.crawler_url.crawler.add_crawler_url(sub_crawler_url) + if sub_crawler_url.exists and sub_crawler_url.processor.status_code == 200: self.index_file = url break def search_keywords(self, text): - if sys.version_info > (3,) and isinstance(text, bytes): - text = text.decode("utf-8") - for keyword in self.crawler_url.crawler.interesting_keywords: + for keyword in self.crawler_url.crawler.configuration.interesting_keywords: if keyword in text: self.keywords_found.add(keyword) @classmethod - def is_applicable(cls, request, text, crawler_url, soup): + def is_applicable(cls, crawler_url_request: "CrawlerUrlRequest"): raise NotImplementedError - def process(self, text, soup=None): + async def process(self, crawler_url_request: "CrawlerUrlRequest"): raise NotImplementedError @property @@ -127,39 +124,33 @@ def flags(self): def maybe_directory(self): return self.crawler_url.maybe_directory() - def url_line(self): - body = colored( + def get_url_line_text(self): + text = Text() + text.append( "[{}]".format(self.status_code), status_code_colors(self.status_code) ) - body += " {} ".format(self.crawler_url.url.url) - body += colored( - " ({})".format(self.name or self.__class__.__name__), Fore.LIGHTYELLOW_EX - ) - return body + text.append(" {} ".format(self.crawler_url.url.url)) + text.append(" ({})".format(self.name or self.__class__.__name__), "gold1") + return text - def add_url(self, url, depth=3, **kwargs): + async def add_url(self, url: Url, depth: int = 3, **kwargs): if is_url_loop(url): return - return self.crawler_url.crawler.add_url( + await self.crawler_url.crawler.add_crawler_url( CrawlerUrl( - self.crawler_url.crawler, - url, - depth, - self.crawler_url, - timeout=self.crawler_url.timeout, - **kwargs + self.crawler_url.crawler, str(url), depth, self.crawler_url, **kwargs ) ) - def __str__(self): - body = self.url_line() + def get_text(self): + text = self.get_url_line_text() if self.index_file: - body += colored("\n Index file found: ", Fore.BLUE) - body += "{}".format(self.index_file.name) + text.append("\n Index file found: ", "blue1") + text.append("{}".format(self.index_file.name)) if self.keywords_found: - body += colored("\n Keywords found: ", Fore.BLUE) - body += ", ".join(self.keywords_found) - return body + text.append("\n Keywords found: ", "blue1") + text.append(", ".join(self.keywords_found)) + return text def json(self): return { @@ -174,8 +165,10 @@ class Error(ProcessBase): name = "Error" key_name = "error" - def __init__(self, crawler_url, error): - super(Error, 
self).__init__(None, crawler_url) + def __init__( + self, crawler_url_request: "CrawlerUrlRequest", error + ): # TODO: remove error? + super(Error, self).__init__(crawler_url_request) self.error = error def process(self, text, soup=None): @@ -188,10 +181,11 @@ def __str__(self): return body @classmethod - def is_applicable(cls, request, text, crawler_url, soup): + def is_applicable(cls, crawler_url_request: "CrawlerUrlRequest"): pass +# TODO: remove this class class Message(Error): def __init__(self, error, level="ERROR"): super(Error, self).__init__(None, CrawlerUrl(None, "")) @@ -211,8 +205,12 @@ class GenericProcessor(ProcessBase): name = "Generic" key_name = "generic" - def process(self, text, soup=None): - self.search_index_files() + async def process(self, text, soup=None): + await self.search_index_files() + + @classmethod + def is_applicable(cls, crawler_url_request: "CrawlerUrlRequest"): + return True class ProcessRedirect(ProcessBase): @@ -220,19 +218,22 @@ class ProcessRedirect(ProcessBase): key_name = "redirect" redirector = None - def __init__(self, response, crawler_url): - super(ProcessRedirect, self).__init__(response, crawler_url) + def __init__(self, crawler_url_request: "CrawlerUrlRequest"): + super(ProcessRedirect, self).__init__(crawler_url_request) self.redirector = full_url_address( - response.headers.get("Location"), self.crawler_url.url + crawler_url_request.response.headers.get("Location"), self.crawler_url.url ) - def process(self, text, soup=None): - if not self.crawler_url.crawler.not_allow_redirects: - self.add_url(self.redirector) + async def process(self, crawler_url_request: "CrawlerUrlRequest"): + if not self.crawler_url.crawler.configuration.not_allow_redirects: + await self.add_url(self.redirector) @classmethod - def is_applicable(cls, request, text, crawler_url, soup): - return 300 <= request.status_code < 400 + def is_applicable(cls, crawler_url_request: "CrawlerUrlRequest"): + return ( + crawler_url_request.response is not None + and 300 <= crawler_url_request.response.status < 400 + ) def __str__(self): body = super(ProcessRedirect, self).__str__() @@ -245,12 +246,15 @@ class ProcessNotFound(ProcessBase): name = "Not Found" key_name = "not_found" - def process(self, text, soup=None): - self.search_index_files() + async def process(self, text, soup=None): + await self.search_index_files() @classmethod - def is_applicable(cls, request, text, crawler_url, soup): - return request.status_code == 404 + def is_applicable(cls, crawler_url_request: "CrawlerUrlRequest"): + return ( + crawler_url_request.response is not None + and crawler_url_request.response.status == 404 + ) def __str__(self): body = self.url_line() @@ -271,6 +275,7 @@ def flags(self): class ProcessCssStyleSheet(ProcessBase): name = "CSS StyleSheet" key_name = "css" + requires_content = True def process(self, text, soup=None): if sys.version_info > (3,) and isinstance(text, bytes): @@ -285,16 +290,20 @@ def process(self, text, soup=None): return urls @classmethod - def is_applicable(cls, response, text, crawler_url, soup): + def is_applicable(cls, crawler_url_request: "CrawlerUrlRequest"): return ( - response.headers.get("Content-Type", "").lower().startswith("text/css") - and response.status_code < 300 + crawler_url_request.response is not None + and crawler_url_request.response.headers.get("Content-Type", "") + .lower() + .startswith("text/css") + and crawler_url_request.response.status < 300 ) class ProcessJavaScript(ProcessBase): name = "JavaScript" key_name = "js" + requires_content = 
True def process(self, text, soup=None): if sys.version_info > (3,) and isinstance(text, bytes): @@ -309,24 +318,27 @@ def process(self, text, soup=None): return urls @classmethod - def is_applicable(cls, response, text, crawler_url, soup): + def is_applicable(cls, crawler_url_request: "CrawlerUrlRequest"): return ( - response.headers.get("Content-Type", "") + crawler_url_request.response is not None + and crawler_url_request.response.headers.get("Content-Type", "") .lower() .startswith("application/javascript") - and response.status_code < 300 + and crawler_url_request.response.status < 300 ) class ProcessHtmlRequest(ProcessBase): name = "HTML document" key_name = "html" + requires_content = True + has_descendants = True - def process(self, text, soup=None): - self.search_keywords(text) - self.assets(soup) - self.links(soup) - self.search_index_files() + async def process(self, crawler_url_request: "CrawlerUrlRequest"): + self.search_keywords(crawler_url_request.content) + self.assets(crawler_url_request.soup) + self.links(crawler_url_request.soup) + await self.search_index_files() def links(self, soup): links = [ @@ -390,11 +402,13 @@ def analyze_asset(self, asset): self.crawler_url.depth -= 1 @classmethod - def is_applicable(cls, response, text, crawler_url, soup): + def is_applicable(cls, crawler_url_request: "CrawlerUrlRequest"): return ( - response.headers.get("Content-Type", "").lower().startswith("text/html") - and response.status_code < 300 - and soup is not None + crawler_url_request.response is not None + and crawler_url_request.response.headers.get("Content-Type", "") + .lower() + .startswith("text/html") + and crawler_url_request.response.status < 300 ) @@ -403,6 +417,8 @@ class ProcessIndexOfRequest(ProcessHtmlRequest): key_name = "index_of" files = None index_titles = ("index of", "directory listing for") + requires_content = True + has_descendants = False def process(self, text, soup=None): self.search_keywords(text) @@ -456,12 +472,13 @@ def repr_file(cls, file): return text @classmethod - def is_applicable(cls, response, text, crawler_url, soup): - if not super(ProcessIndexOfRequest, cls).is_applicable( - response, text, crawler_url, soup + def is_applicable(cls, crawler_url_request: "CrawlerUrlRequest"): + if ( + not super(ProcessIndexOfRequest, cls).is_applicable(crawler_url_request) + or crawler_url_request.content is None ): return False - title = soup.find("title") + title = crawler_url_request.soup.find("title") if not title: return False title = title.text.lower() @@ -483,11 +500,14 @@ def flags(self): class ProcessBlankPageRequest(ProcessHtmlRequest): name = "Blank page" key_name = "blank" + requires_content = True + has_descendants = False @classmethod - def is_applicable(cls, response, text, crawler_url, soup): - if not super(ProcessBlankPageRequest, cls).is_applicable( - response, text, crawler_url, soup + def is_applicable(cls, crawler_url_request: "CrawlerUrlRequest"): + if ( + not super(ProcessBlankPageRequest, cls).is_applicable(crawler_url_request) + or crawler_url_request.content is None ): return False @@ -505,7 +525,7 @@ def tag_visible(element): return False return True - texts = soup.findAll(text=True) + texts = crawler_url_request.soup.findAll(text=True) visible_texts = filter(tag_visible, texts) for text in visible_texts: if text.strip(): @@ -513,14 +533,13 @@ def tag_visible(element): return True -def get_processor(response, text, crawler_url, soup): +def get_processor(crawler_url_request: "CrawlerUrlRequest"): for processor_class in PROCESSORS: - 
if processor_class.is_applicable(response, text, crawler_url, soup): - # TODO: resp por None - return processor_class(response, crawler_url) + if processor_class.is_applicable(crawler_url_request): + return processor_class(crawler_url_request) -PROCESSORS = [ +PROCESSORS: List[Type[ProcessBase]] = [ ProcessRedirect, ProcessNotFound, ProcessCssStyleSheet, @@ -528,4 +547,5 @@ def get_processor(response, text, crawler_url, soup): ProcessIndexOfRequest, ProcessBlankPageRequest, ProcessHtmlRequest, + GenericProcessor, ] diff --git a/dirhunt/url.py b/dirhunt/url.py index f3d91ca..7cb2bc9 100644 --- a/dirhunt/url.py +++ b/dirhunt/url.py @@ -204,5 +204,8 @@ def __eq__(self, other): other = other.url return self.url == other - def __str__(self): - return "".format(self.url) + def __repr__(self): + return f"" + + def __str__(self) -> str: + return self.url diff --git a/dirhunt/utils.py b/dirhunt/utils.py index 1fcc2e9..6ad37bc 100644 --- a/dirhunt/utils.py +++ b/dirhunt/utils.py @@ -20,6 +20,7 @@ def lrange(start, end): return list(range(start, end)) +# TODO: remove def colored(text, *colors): return "".join(colors) + text + Fore.RESET + Back.RESET
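The requires_content / has_descendants flags drive a two-pass processor lookup: get_processor() is called once with only the response headers available, the body is downloaded only if the chosen processor asks for it, and the lookup is repeated when that processor has more specific descendants (e.g. HTML document -> "Index Of" listing). Below is a reduced, runnable sketch of that dispatch; the Fake* classes and processor names are illustrative only, and get_processor() returns classes instead of instances to stay short.

from typing import List, Optional, Type


class FakeResponse:
    """Just enough of an aiohttp response for the sketch."""

    def __init__(self, status: int, content_type: str):
        self.status = status
        self.headers = {"Content-Type": content_type}


class FakeRequest:
    """Stands in for CrawlerUrlRequest: a response first, the body maybe later."""

    def __init__(self, response: FakeResponse, content: Optional[str] = None):
        self.response = response
        self.content = content


class Generic:
    requires_content = False  # body must be downloaded before process()
    has_descendants = False   # re-run get_processor() once the body is known

    @classmethod
    def is_applicable(cls, request: FakeRequest) -> bool:
        return True


class Redirect(Generic):
    @classmethod
    def is_applicable(cls, request: FakeRequest) -> bool:
        return 300 <= request.response.status < 400


class Html(Generic):
    requires_content = True
    has_descendants = True

    @classmethod
    def is_applicable(cls, request: FakeRequest) -> bool:
        content_type = request.response.headers.get("Content-Type", "").lower()
        return content_type.startswith("text/html") and request.response.status < 300


class IndexOf(Html):
    has_descendants = False

    @classmethod
    def is_applicable(cls, request: FakeRequest) -> bool:
        return (
            super().is_applicable(request)
            and request.content is not None
            and "index of" in request.content.lower()
        )


# Order matters: first match wins, the generic fallback goes last (as in PROCESSORS).
PROCESSORS: List[Type[Generic]] = [Redirect, IndexOf, Html, Generic]


def get_processor(request: FakeRequest) -> Type[Generic]:
    for processor_class in PROCESSORS:
        if processor_class.is_applicable(request):
            return processor_class
    return Generic


request = FakeRequest(FakeResponse(200, "text/html"))
first = get_processor(request)  # Html, chosen from the headers alone
assert first.requires_content   # so the caller now reads the body...
request.content = "<title>Index of /backup</title>"
assert first.has_descendants    # ...and, because Html has descendants, asks again
assert get_processor(request) is IndexOf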