From c5ba13084247939b7f894dc0113eeef0ac724b95 Mon Sep 17 00:00:00 2001 From: Nekmo Date: Wed, 9 Aug 2023 07:56:49 +0200 Subject: [PATCH] Issue #83: Use Asyncio --- dirhunt/colors.py | 12 +-- dirhunt/crawler.py | 19 ++-- dirhunt/crawler_url.py | 212 ++++++++++++++++++++++++--------------- dirhunt/processors.py | 222 ++++++++++++++++++++++------------------- dirhunt/url.py | 7 +- dirhunt/utils.py | 1 + 6 files changed, 276 insertions(+), 197 deletions(-) diff --git a/dirhunt/colors.py b/dirhunt/colors.py index 3de69fa..983cbba 100644 --- a/dirhunt/colors.py +++ b/dirhunt/colors.py @@ -3,14 +3,14 @@ def status_code_colors(status_code): if 100 <= status_code < 200: - return Fore.WHITE + return "white" elif 200 == status_code: - return Fore.LIGHTGREEN_EX + return "green1" elif 200 < status_code < 300: - return Fore.GREEN + return "green3" elif 300 <= status_code < 400: - return Fore.LIGHTBLUE_EX + return "deep_sky_blue1" elif 500 == status_code: - return Fore.LIGHTMAGENTA_EX + return "magenta1" else: - return Fore.MAGENTA + return "medium_orchid1" diff --git a/dirhunt/crawler.py b/dirhunt/crawler.py index f43b281..cf4aa73 100644 --- a/dirhunt/crawler.py +++ b/dirhunt/crawler.py @@ -8,9 +8,11 @@ from concurrent.futures.thread import _python_exit from threading import Lock, ThreadError import datetime +from typing import Optional import humanize as humanize from click import get_terminal_size +from rich.console import Console from dirhunt import processors from dirhunt import __version__ @@ -65,12 +67,13 @@ def __init__(self, configuration: Configuration, loop: asyncio.AbstractEventLoop self.configuration = configuration self.loop = loop self.tasks = set() + self.crawler_urls = set() + self.domains = set() + self.console = Console(highlight=False) self.session = Session() self.domain_semaphore = DomainSemaphore(configuration.concurrency) - self.domains = set() self.results = Queue() self.index_of_processors = [] - self.processing = {} self.processed = {} self.add_lock = Lock() self.start_dt = datetime.datetime.now() @@ -82,16 +85,18 @@ async def start(self): await self.add_crawler_url( CrawlerUrl(self, url, depth=self.configuration.max_depth) ) - await asyncio.wait(self.tasks) + while self.tasks: + await asyncio.wait(self.tasks) - async def add_crawler_url(self, crawler_url: CrawlerUrl): + async def add_crawler_url(self, crawler_url: CrawlerUrl) -> Optional[asyncio.Task]: """Add crawler_url to tasks""" - if crawler_url.url.url in self.processing: + if crawler_url.url.url in self.crawler_urls: return task = self.loop.create_task(crawler_url.retrieve()) self.tasks.add(task) - self.processing[crawler_url.url.url] = task - task.add_done_callback(lambda: self.tasks.discard(task)) + self.crawler_urls.add(crawler_url) + task.add_done_callback(self.tasks.discard) + return task def add_init_urls(self, *urls): """Add urls to queue.""" diff --git a/dirhunt/crawler_url.py b/dirhunt/crawler_url.py index 0d8d463..8a2b9da 100644 --- a/dirhunt/crawler_url.py +++ b/dirhunt/crawler_url.py @@ -1,8 +1,9 @@ # -*- coding: utf-8 -*- import cgi import socket -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any, Optional +from aiohttp.web_response import Response from bs4 import BeautifulSoup from requests import RequestException from urllib3.exceptions import ReadTimeoutError @@ -19,9 +20,86 @@ if TYPE_CHECKING: from dirhunt.crawler import Crawler + from dirhunt.processors import ProcessBase -class CrawlerUrl(object): +class CrawlerUrlRequest: + response = Optional[Response] + content: Optional[str] 
= None + _soup: Optional[BeautifulSoup] = None + + def __init__(self, crawler_url: "CrawlerUrl"): + self.crawler_url = crawler_url + self.crawler = crawler_url.crawler + + async def retrieve(self) -> "ProcessBase": + from dirhunt.processors import ( + get_processor, + Error, + ) + + text = "" + try: + await self.crawler.domain_semaphore.acquire(self.crawler_url.url.domain) + pass + async with self.crawler.session.get( + self.crawler_url.url.url, + verify_ssl=False, + timeout=self.crawler.configuration.timeout, + allow_redirects=False, + ) as response: + self.crawler_url.set_type(response.headers.get("Content-Type")) + self.crawler_url.flags.add(str(response.status)) + self.response = response + processor = get_processor(self) + if processor and processor.requires_content: + encoding = response.get_encoding() + self.content = ( + await response.content.read(MAX_RESPONSE_SIZE) + ).decode(encoding, errors="ignore") + if processor.has_descendants: + processor = get_processor(self) + # text = "" + # soup = None + # processor = None + # if response.status_code < 300 and self.must_be_downloaded(response): + # try: + # text = response.raw.read(MAX_RESPONSE_SIZE, decode_content=True) + # except (RequestException, ReadTimeoutError, socket.timeout) as e: + # self.crawler.current_processed_count += 1 + # self.crawler.results.put(Error(self, e)) + # self.close() + # return self + # content_type = cgi.parse_header( + # response.headers.get("Content-Type", "") + # )[0] + # soup = ( + # BeautifulSoup(text, "html.parser") + # if content_type == "text/html" + # else None + # ) + except RequestException as e: + self.crawler.current_processed_count += 1 + processor = Error(self, e) + else: + await processor.process(self) + finally: + self.crawler.domain_semaphore.release(self.crawler_url.url.domain) + return processor + + @property + def soup(self): + if self._soup is None and self.content is not None: + self._soup = BeautifulSoup(self.content, "html.parser") + return self._soup + + def __repr__(self): + return "".format(self.crawler_url.url) + + +class CrawlerUrl: + processor: Optional["ProcessBase"] = None + def __init__( self, crawler: "Crawler", @@ -50,13 +128,13 @@ def __init__( self.exists = exists self.url_type = url_type if url.is_valid() and (not url.path or url.path == "/"): - self.type = "directory" + self.url_type = "directory" self.resp = None self.processor_data = None - def add_self_directories(self, exists=None, url_type=None): + async def add_self_directories(self, exists=None, url_type=None): for url in self.url.breadcrumb(): - self.crawler.add_crawler_url( + await self.crawler.add_crawler_url( CrawlerUrl( self.crawler, url, @@ -66,91 +144,56 @@ def add_self_directories(self, exists=None, url_type=None): url_type, ) ) - # TODO: si no se puede añadir porque ya se ha añadido, establecer como que ya existe si la orden es exists + # TODO: if exists=True and the urls is already processed before add it, but the already processed + # url has exists=False, then update the exists to True async def retrieve(self): - from dirhunt.processors import ( - get_processor, - GenericProcessor, - Error, - ProcessIndexOfRequest, - ) - - try: - await self.crawler.domain_semaphore.acquire(self.url.domain) - async with self.crawler.session.get( - self.url.url, - verify_ssl=False, - timeout=self.crawler.configuration.timeout, - allow_redirects=False, - ) as resp: - self.set_type(resp.headers.get("Content-Type")) - self.flags.add(str(resp.status)) - text = "" - soup = None - processor = None - if resp.status_code < 
300 and self.must_be_downloaded(resp): - try: - text = resp.raw.read(MAX_RESPONSE_SIZE, decode_content=True) - except (RequestException, ReadTimeoutError, socket.timeout) as e: - self.crawler.current_processed_count += 1 - self.crawler.results.put(Error(self, e)) - self.close() - return self - content_type = cgi.parse_header( - resp.headers.get("Content-Type", "") - )[0] - soup = ( - BeautifulSoup(text, "html.parser") - if content_type == "text/html" - else None - ) - except RequestException as e: - self.crawler.current_processed_count += 1 - self.crawler.results.put(Error(self, e)) - self.close() - return self - finally: - self.crawler.domain_semaphore.release(self.url.domain) - - if self.must_be_downloaded(resp): - processor = get_processor(resp, text, self, soup) or GenericProcessor( - resp, self - ) - processor.process(text, soup) - self.flags.update(processor.flags) - if self.maybe_directory(): - self.crawler.results.put(processor) - if processor is not None: - self.processor_data = processor.json() - if processor and isinstance(processor, ProcessIndexOfRequest): - self.crawler.index_of_processors.append(processor) - else: - self.crawler.current_processed_count += 1 - # TODO: Podemos fijarnos en el processor.index_file. Si existe y es un 200, entonces es que existe. - if self.exists is None and resp.status_code < 404: + from processors import GenericProcessor + + crawler_url_request = CrawlerUrlRequest(self) + processor = await crawler_url_request.retrieve() + if processor is not None and not isinstance(processor, GenericProcessor): + self.crawler.console.print(processor.get_text()) + # if self.must_be_downloaded(response): + # processor = get_processor(response, text, self, soup) or GenericProcessor( + # response, self + # ) + # processor.process(text, soup) + # self.flags.update(processor.flags) + # if self.maybe_directory(): + # self.crawler.results.put(processor) + # if processor is not None: + # self.processor_data = processor.json() + # if processor and isinstance(processor, ProcessIndexOfRequest): + # self.crawler.index_of_processors.append(processor) + # else: + # self.crawler.current_processed_count += 1 + if ( + self.exists is None + and crawler_url_request.response is not None + and crawler_url_request.response.status < 404 + ): self.exists = True - self.add_self_directories( - True if (not self.maybe_rewrite() and self.exists) else None, - "directory" if not self.maybe_rewrite() else None, - ) - self.close() - return self + # TODO: uncomment + # await self.add_self_directories( + # True if (not self.maybe_rewrite() and self.exists) else None, + # "directory" if not self.maybe_rewrite() else None, + # ) def set_type(self, content_type): from dirhunt.processors import INDEX_FILES - if not self.type and not (content_type or "").startswith("text/html"): - self.type = "asset" + if not self.url_type and not (content_type or "").startswith("text/html"): + self.url_type = "asset" if ( - not self.type + not self.url_type and (content_type or "").startswith("text/html") and self.url.name in INDEX_FILES ): - self.type = "document" + self.url_type = "document" def maybe_rewrite(self): - return self.type not in ["asset", "directory"] + return self.url_type not in ["asset", "directory"] def must_be_downloaded(self, response): """The file must be downloaded to obtain information.""" @@ -174,10 +217,6 @@ def weight(self): value -= len(list(self.url.breadcrumb())) * 1.5 return value - def close(self): - self.crawler.processed[self.url.url] = self - del self.crawler.processing[self.url.url] 
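For reference, the coroutine-based request flow introduced in CrawlerUrlRequest.retrieve() boils down to the following minimal, self-contained sketch. Names here are hypothetical: a plain asyncio.Semaphore stands in for Crawler.domain_semaphore, a hard-coded constant stands in for MAX_RESPONSE_SIZE, and a simple redirect-vs-HTML check replaces the real get_processor() lookup.

import asyncio
from typing import Optional

import aiohttp

MAX_RESPONSE_SIZE = 512 * 1024  # assumption: dirhunt caps how much of each body it reads


async def fetch(session: aiohttp.ClientSession, semaphore: asyncio.Semaphore, url: str) -> str:
    """Simplified CrawlerUrlRequest.retrieve(): limit concurrency, request without
    following redirects, and read the body only when a content-based processor needs it."""
    async with semaphore:  # stands in for the per-domain Crawler.domain_semaphore
        async with session.get(url, allow_redirects=False) as response:
            is_redirect = 300 <= response.status < 400
            content: Optional[str] = None
            if not is_redirect:  # the real code checks processor.requires_content here
                encoding = response.get_encoding()
                raw = await response.content.read(MAX_RESPONSE_SIZE)
                content = raw.decode(encoding, errors="ignore")
            kind = "redirect" if is_redirect else f"{len(content or '')} chars read"
            return f"[{response.status}] {url} ({kind})"


async def main() -> None:
    semaphore = asyncio.Semaphore(10)  # the real crawler keys one semaphore per domain
    async with aiohttp.ClientSession() as session:
        print(await fetch(session, semaphore, "http://example.com/"))


if __name__ == "__main__":
    asyncio.run(main())

The async with form used above is equivalent to the acquire()/release() pair wrapped in try/finally in the patch: the per-domain slot is released even if the request raises. On the crawler side, each CrawlerUrl.retrieve() is wrapped in loop.create_task(); keeping the task in Crawler.tasks, discarding it via task.add_done_callback(self.tasks.discard), and looping "while self.tasks: await asyncio.wait(self.tasks)" both prevents the tasks from being garbage-collected and lets start() wait for work scheduled while earlier tasks are still running.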
- def json(self): return { "flags": self.flags, @@ -186,3 +225,14 @@ def json(self): "type": self.type, "exists": self.exists, } + + def __repr__(self): + return f"" + + def __eq__(self, other: Any) -> bool: + if not isinstance(other, CrawlerUrl): + return False + return self.url.url == other.url.url + + def __hash__(self): + return hash(self.url.url) diff --git a/dirhunt/processors.py b/dirhunt/processors.py index 4a07cd9..5a18b49 100644 --- a/dirhunt/processors.py +++ b/dirhunt/processors.py @@ -1,6 +1,10 @@ # -*- coding: utf-8 -*- import re import sys +from typing import List, Type + +from aiohttp.web_response import Response +from rich.text import Text from dirhunt.directory_lists import get_directory_list @@ -12,7 +16,7 @@ from colorama import Fore, Back from dirhunt.colors import status_code_colors -from dirhunt.crawler_url import CrawlerUrl +from dirhunt.crawler_url import CrawlerUrl, CrawlerUrlRequest from dirhunt.url import Url, full_url_address from dirhunt.url_loop import is_url_loop from dirhunt.utils import colored @@ -61,63 +65,56 @@ """ -class ProcessBase(object): +class ProcessBase: name = "" key_name = "" index_file = None - status_code = 0 + status_code = 0 # TODO: rename to status + requires_content = False + # If the processor has descendants, use get_processor after retrieve the content + # to get the correct processor + has_descendants = False - def __init__(self, response, crawler_url): + def __init__(self, crawler_url_request): """ - - :type crawler_url: CrawlerUrl or None + :type crawler_url_request: CrawlerUrlRequest """ - # TODO: hay que pensar en no pasar response, text y soup por aquí para establecerlo en self, - # para no llenar la memoria. Deben ser cosas "volátiles". - if response is not None: - self.status_code = response.status_code - # TODO: procesar otras cosas (css, etc.) 
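The move from colorama to rich means processors now build rich.text.Text objects that the crawler prints through its shared Console instead of concatenating colored strings. A standalone sketch of what the new status_code_colors() mapping and ProcessBase.get_url_line_text() combine to produce (simplified, with the processor name passed in as a plain string):

from rich.console import Console
from rich.text import Text


def status_code_style(status_code: int) -> str:
    """Same mapping as the new dirhunt.colors.status_code_colors(): rich style names."""
    if 100 <= status_code < 200:
        return "white"
    if status_code == 200:
        return "green1"
    if 200 < status_code < 300:
        return "green3"
    if 300 <= status_code < 400:
        return "deep_sky_blue1"
    if status_code == 500:
        return "magenta1"
    return "medium_orchid1"


def url_line(status_code: int, url: str, processor_name: str) -> Text:
    """Roughly what ProcessBase.get_url_line_text() builds for Console.print()."""
    text = Text()
    text.append(f"[{status_code}]", status_code_style(status_code))
    text.append(f" {url} ")
    text.append(f"({processor_name})", "gold1")
    return text


console = Console(highlight=False)  # highlight=False, as in Crawler.__init__, avoids rich auto-styling the URL
console.print(url_line(301, "http://example.com/old/", "Redirect"))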
- self.crawler_url = crawler_url + if crawler_url_request.response is not None: + self.status_code = crawler_url_request.response.status + # The crawler_url_request takes a lot of memory, so we don't save it + self.crawler_url = crawler_url_request.crawler_url self.keywords_found = set() - def search_index_files(self): - if self.crawler_url.type not in ["directory", None]: + async def search_index_files(self): + if self.crawler_url.url_type not in ["directory", None]: return crawler = self.crawler_url.crawler for index_file in INDEX_FILES: url = self.crawler_url.url.copy() url.set_children(index_file) - future = self.crawler_url.crawler.add_url( - CrawlerUrl( - crawler, - url, - self.crawler_url.depth - 1, - self, - None, - "document", - timeout=self.crawler_url.timeout, - ), - True, + sub_crawler_url = CrawlerUrl( + crawler, + url, + self.crawler_url.depth - 1, + self, + None, + "document", ) - if self.crawler_url.crawler.closing or future is None: - return - result = future.result() - if result.exists: + await self.crawler_url.crawler.add_crawler_url(sub_crawler_url) + if sub_crawler_url.exists and sub_crawler_url.processor.status_code == 200: self.index_file = url break def search_keywords(self, text): - if sys.version_info > (3,) and isinstance(text, bytes): - text = text.decode("utf-8") - for keyword in self.crawler_url.crawler.interesting_keywords: + for keyword in self.crawler_url.crawler.configuration.interesting_keywords: if keyword in text: self.keywords_found.add(keyword) @classmethod - def is_applicable(cls, request, text, crawler_url, soup): + def is_applicable(cls, crawler_url_request: "CrawlerUrlRequest"): raise NotImplementedError - def process(self, text, soup=None): + async def process(self, crawler_url_request: "CrawlerUrlRequest"): raise NotImplementedError @property @@ -127,39 +124,33 @@ def flags(self): def maybe_directory(self): return self.crawler_url.maybe_directory() - def url_line(self): - body = colored( + def get_url_line_text(self): + text = Text() + text.append( "[{}]".format(self.status_code), status_code_colors(self.status_code) ) - body += " {} ".format(self.crawler_url.url.url) - body += colored( - " ({})".format(self.name or self.__class__.__name__), Fore.LIGHTYELLOW_EX - ) - return body + text.append(" {} ".format(self.crawler_url.url.url)) + text.append(" ({})".format(self.name or self.__class__.__name__), "gold1") + return text - def add_url(self, url, depth=3, **kwargs): + async def add_url(self, url: Url, depth: int = 3, **kwargs): if is_url_loop(url): return - return self.crawler_url.crawler.add_url( + await self.crawler_url.crawler.add_crawler_url( CrawlerUrl( - self.crawler_url.crawler, - url, - depth, - self.crawler_url, - timeout=self.crawler_url.timeout, - **kwargs + self.crawler_url.crawler, str(url), depth, self.crawler_url, **kwargs ) ) - def __str__(self): - body = self.url_line() + def get_text(self): + text = self.get_url_line_text() if self.index_file: - body += colored("\n Index file found: ", Fore.BLUE) - body += "{}".format(self.index_file.name) + text.append("\n Index file found: ", "blue1") + text.append("{}".format(self.index_file.name)) if self.keywords_found: - body += colored("\n Keywords found: ", Fore.BLUE) - body += ", ".join(self.keywords_found) - return body + text.append("\n Keywords found: ", "blue1") + text.append(", ".join(self.keywords_found)) + return text def json(self): return { @@ -174,8 +165,10 @@ class Error(ProcessBase): name = "Error" key_name = "error" - def __init__(self, crawler_url, error): - super(Error, 
self).__init__(None, crawler_url) + def __init__( + self, crawler_url_request: "CrawlerUrlRequest", error + ): # TODO: remove error? + super(Error, self).__init__(crawler_url_request) self.error = error def process(self, text, soup=None): @@ -188,10 +181,11 @@ def __str__(self): return body @classmethod - def is_applicable(cls, request, text, crawler_url, soup): + def is_applicable(cls, crawler_url_request: "CrawlerUrlRequest"): pass +# TODO: remove this class class Message(Error): def __init__(self, error, level="ERROR"): super(Error, self).__init__(None, CrawlerUrl(None, "")) @@ -211,8 +205,12 @@ class GenericProcessor(ProcessBase): name = "Generic" key_name = "generic" - def process(self, text, soup=None): - self.search_index_files() + async def process(self, text, soup=None): + await self.search_index_files() + + @classmethod + def is_applicable(cls, crawler_url_request: "CrawlerUrlRequest"): + return True class ProcessRedirect(ProcessBase): @@ -220,19 +218,22 @@ class ProcessRedirect(ProcessBase): key_name = "redirect" redirector = None - def __init__(self, response, crawler_url): - super(ProcessRedirect, self).__init__(response, crawler_url) + def __init__(self, crawler_url_request: "CrawlerUrlRequest"): + super(ProcessRedirect, self).__init__(crawler_url_request) self.redirector = full_url_address( - response.headers.get("Location"), self.crawler_url.url + crawler_url_request.response.headers.get("Location"), self.crawler_url.url ) - def process(self, text, soup=None): - if not self.crawler_url.crawler.not_allow_redirects: - self.add_url(self.redirector) + async def process(self, crawler_url_request: "CrawlerUrlRequest"): + if not self.crawler_url.crawler.configuration.not_allow_redirects: + await self.add_url(self.redirector) @classmethod - def is_applicable(cls, request, text, crawler_url, soup): - return 300 <= request.status_code < 400 + def is_applicable(cls, crawler_url_request: "CrawlerUrlRequest"): + return ( + crawler_url_request.response is not None + and 300 <= crawler_url_request.response.status < 400 + ) def __str__(self): body = super(ProcessRedirect, self).__str__() @@ -245,12 +246,15 @@ class ProcessNotFound(ProcessBase): name = "Not Found" key_name = "not_found" - def process(self, text, soup=None): - self.search_index_files() + async def process(self, text, soup=None): + await self.search_index_files() @classmethod - def is_applicable(cls, request, text, crawler_url, soup): - return request.status_code == 404 + def is_applicable(cls, crawler_url_request: "CrawlerUrlRequest"): + return ( + crawler_url_request.response is not None + and crawler_url_request.response.status == 404 + ) def __str__(self): body = self.url_line() @@ -271,6 +275,7 @@ def flags(self): class ProcessCssStyleSheet(ProcessBase): name = "CSS StyleSheet" key_name = "css" + requires_content = True def process(self, text, soup=None): if sys.version_info > (3,) and isinstance(text, bytes): @@ -285,16 +290,20 @@ def process(self, text, soup=None): return urls @classmethod - def is_applicable(cls, response, text, crawler_url, soup): + def is_applicable(cls, crawler_url_request: "CrawlerUrlRequest"): return ( - response.headers.get("Content-Type", "").lower().startswith("text/css") - and response.status_code < 300 + crawler_url_request.response is not None + and crawler_url_request.response.headers.get("Content-Type", "") + .lower() + .startswith("text/css") + and crawler_url_request.response.status < 300 ) class ProcessJavaScript(ProcessBase): name = "JavaScript" key_name = "js" + requires_content = 
True def process(self, text, soup=None): if sys.version_info > (3,) and isinstance(text, bytes): @@ -309,24 +318,27 @@ def process(self, text, soup=None): return urls @classmethod - def is_applicable(cls, response, text, crawler_url, soup): + def is_applicable(cls, crawler_url_request: "CrawlerUrlRequest"): return ( - response.headers.get("Content-Type", "") + crawler_url_request.response is not None + and crawler_url_request.response.headers.get("Content-Type", "") .lower() .startswith("application/javascript") - and response.status_code < 300 + and crawler_url_request.response.status < 300 ) class ProcessHtmlRequest(ProcessBase): name = "HTML document" key_name = "html" + requires_content = True + has_descendants = True - def process(self, text, soup=None): - self.search_keywords(text) - self.assets(soup) - self.links(soup) - self.search_index_files() + async def process(self, crawler_url_request: "CrawlerUrlRequest"): + self.search_keywords(crawler_url_request.content) + self.assets(crawler_url_request.soup) + self.links(crawler_url_request.soup) + await self.search_index_files() def links(self, soup): links = [ @@ -390,11 +402,13 @@ def analyze_asset(self, asset): self.crawler_url.depth -= 1 @classmethod - def is_applicable(cls, response, text, crawler_url, soup): + def is_applicable(cls, crawler_url_request: "CrawlerUrlRequest"): return ( - response.headers.get("Content-Type", "").lower().startswith("text/html") - and response.status_code < 300 - and soup is not None + crawler_url_request.response is not None + and crawler_url_request.response.headers.get("Content-Type", "") + .lower() + .startswith("text/html") + and crawler_url_request.response.status < 300 ) @@ -403,6 +417,8 @@ class ProcessIndexOfRequest(ProcessHtmlRequest): key_name = "index_of" files = None index_titles = ("index of", "directory listing for") + requires_content = True + has_descendants = False def process(self, text, soup=None): self.search_keywords(text) @@ -456,12 +472,13 @@ def repr_file(cls, file): return text @classmethod - def is_applicable(cls, response, text, crawler_url, soup): - if not super(ProcessIndexOfRequest, cls).is_applicable( - response, text, crawler_url, soup + def is_applicable(cls, crawler_url_request: "CrawlerUrlRequest"): + if ( + not super(ProcessIndexOfRequest, cls).is_applicable(crawler_url_request) + or crawler_url_request.content is None ): return False - title = soup.find("title") + title = crawler_url_request.soup.find("title") if not title: return False title = title.text.lower() @@ -483,11 +500,14 @@ def flags(self): class ProcessBlankPageRequest(ProcessHtmlRequest): name = "Blank page" key_name = "blank" + requires_content = True + has_descendants = False @classmethod - def is_applicable(cls, response, text, crawler_url, soup): - if not super(ProcessBlankPageRequest, cls).is_applicable( - response, text, crawler_url, soup + def is_applicable(cls, crawler_url_request: "CrawlerUrlRequest"): + if ( + not super(ProcessBlankPageRequest, cls).is_applicable(crawler_url_request) + or crawler_url_request.content is None ): return False @@ -505,7 +525,7 @@ def tag_visible(element): return False return True - texts = soup.findAll(text=True) + texts = crawler_url_request.soup.findAll(text=True) visible_texts = filter(tag_visible, texts) for text in visible_texts: if text.strip(): @@ -513,14 +533,13 @@ def tag_visible(element): return True -def get_processor(response, text, crawler_url, soup): +def get_processor(crawler_url_request: "CrawlerUrlRequest"): for processor_class in PROCESSORS: - 
if processor_class.is_applicable(response, text, crawler_url, soup): - # TODO: resp por None - return processor_class(response, crawler_url) + if processor_class.is_applicable(crawler_url_request): + return processor_class(crawler_url_request) -PROCESSORS = [ +PROCESSORS: List[Type[ProcessBase]] = [ ProcessRedirect, ProcessNotFound, ProcessCssStyleSheet, @@ -528,4 +547,5 @@ def get_processor(response, text, crawler_url, soup): ProcessIndexOfRequest, ProcessBlankPageRequest, ProcessHtmlRequest, + GenericProcessor, ] diff --git a/dirhunt/url.py b/dirhunt/url.py index f3d91ca..7cb2bc9 100644 --- a/dirhunt/url.py +++ b/dirhunt/url.py @@ -204,5 +204,8 @@ def __eq__(self, other): other = other.url return self.url == other - def __str__(self): - return "".format(self.url) + def __repr__(self): + return f"" + + def __str__(self) -> str: + return self.url diff --git a/dirhunt/utils.py b/dirhunt/utils.py index 1fcc2e9..6ad37bc 100644 --- a/dirhunt/utils.py +++ b/dirhunt/utils.py @@ -20,6 +20,7 @@ def lrange(start, end): return list(range(start, end)) +# TODO: remove def colored(text, *colors): return "".join(colors) + text + Fore.RESET + Back.RESET
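The requires_content / has_descendants flags drive a two-pass processor lookup: get_processor() is called once with only the response headers available, the body is downloaded only if the chosen processor asks for it, and the lookup is repeated when that processor has more specific descendants (e.g. HTML document -> "Index Of" listing). Below is a reduced, runnable sketch of that dispatch; the Fake* classes and processor names are illustrative only, and get_processor() returns classes instead of instances to stay short.

from typing import List, Optional, Type


class FakeResponse:
    """Just enough of an aiohttp response for the sketch."""

    def __init__(self, status: int, content_type: str):
        self.status = status
        self.headers = {"Content-Type": content_type}


class FakeRequest:
    """Stands in for CrawlerUrlRequest: a response first, the body maybe later."""

    def __init__(self, response: FakeResponse, content: Optional[str] = None):
        self.response = response
        self.content = content


class Generic:
    requires_content = False  # body must be downloaded before process()
    has_descendants = False   # re-run get_processor() once the body is known

    @classmethod
    def is_applicable(cls, request: FakeRequest) -> bool:
        return True


class Redirect(Generic):
    @classmethod
    def is_applicable(cls, request: FakeRequest) -> bool:
        return 300 <= request.response.status < 400


class Html(Generic):
    requires_content = True
    has_descendants = True

    @classmethod
    def is_applicable(cls, request: FakeRequest) -> bool:
        content_type = request.response.headers.get("Content-Type", "").lower()
        return content_type.startswith("text/html") and request.response.status < 300


class IndexOf(Html):
    has_descendants = False

    @classmethod
    def is_applicable(cls, request: FakeRequest) -> bool:
        return (
            super().is_applicable(request)
            and request.content is not None
            and "index of" in request.content.lower()
        )


# Order matters: first match wins, the generic fallback goes last (as in PROCESSORS).
PROCESSORS: List[Type[Generic]] = [Redirect, IndexOf, Html, Generic]


def get_processor(request: FakeRequest) -> Type[Generic]:
    for processor_class in PROCESSORS:
        if processor_class.is_applicable(request):
            return processor_class
    return Generic


request = FakeRequest(FakeResponse(200, "text/html"))
first = get_processor(request)  # Html, chosen from the headers alone
assert first.requires_content   # so the caller now reads the body...
request.content = "<title>Index of /backup</title>"
assert first.has_descendants    # ...and, because Html has descendants, asks again
assert get_processor(request) is IndexOf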