Issue #83: Use Asyncio.
Nekmo committed Aug 11, 2023
1 parent 0ba58e3 commit 86613a4
Showing 2 changed files with 34 additions and 6 deletions.
38 changes: 33 additions & 5 deletions dirhunt/crawler.py
@@ -15,6 +15,13 @@
from click import get_terminal_size
from rich.console import Console
from rich.text import Text
from rich.progress import (
Progress,
TaskProgressColumn,
TimeRemainingColumn,
BarColumn,
TextColumn,
)

from dirhunt import __version__
from dirhunt._compat import queue, Queue, unregister
@@ -77,9 +84,19 @@ def __init__(self, configuration: Configuration, loop: asyncio.AbstractEventLoop
self.processed = {}
self.add_lock = Lock()
self.start_dt = datetime.datetime.now()
self.total_crawler_urls: int = 0
self.current_processed_count: int = 0
self.sources = Sources(self)
self.domain_protocols: Dict[str, set] = defaultdict(set)
self.progress = Progress(
TextColumn("[progress.description]{task.description}"),
BarColumn(complete_style="blue"),
TaskProgressColumn(),
TimeRemainingColumn(),
console=self.console,
)
self.progress_task = self.progress.add_task("Fetching urls...", total=None)
self.progress.start()

async def start(self):
"""Add urls to process."""
@@ -106,20 +123,21 @@ async def restart(self):

async def add_crawler_url(self, crawler_url: CrawlerUrl) -> Optional[asyncio.Task]:
"""Add crawler_url to tasks"""
if crawler_url in self.crawler_urls or not self.in_domains(
crawler_url.url.domain
if (
self.total_crawler_urls > self.configuration.limit
or crawler_url in self.crawler_urls
or not self.in_domains(crawler_url.url.domain)
):
return
# TODO: move to CrawlerUrl after retrieve the data
self.current_processed_count += 1
self.crawler_urls.add(crawler_url)
await self.add_crawler_url_task(crawler_url)

async def add_crawler_url_task(self, crawler_url) -> asyncio.Task:
"""Add crawler_url to tasks"""
self.total_crawler_urls += 1
return self.add_task(
retry_error(crawler_url.retrieve, KeyboardInterrupt)(),
name=f"crawlerurl-{self.current_processed_count}",
name=f"crawlerurl-{self.total_crawler_urls}",
)

async def add_domain(self, domain: str):
@@ -154,6 +172,16 @@ def print_processor(self, processor: ProcessBase):
if 300 > processor.status >= 200:
self.add_domain_protocol(processor.crawler_url)
self.console.print(processor.get_text())
self.progress.update(
self.progress_task,
description=f"Obtained [bold blue]{self.current_processed_count}[/bold blue] urls out of "
f"[bold blue]{self.total_crawler_urls}[/bold blue]",
completed=self.current_processed_count,
refresh=True,
total=self.configuration.limit
if self.total_crawler_urls > self.configuration.limit
else None,
)

def add_domain_protocol(self, crawler_url: "CrawlerUrl"):
"""Add domain protocol"""
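Note on the crawler.py changes above: the new bar follows rich's indeterminate-total pattern, where a task is created with total=None (rendered as a pulsing bar) and only given a real total once the crawler has found more urls than the configured limit. Below is a minimal standalone sketch of that pattern; the loop and the limit/found/processed variables are illustrative stand-ins, not dirhunt code.

import time

from rich.console import Console
from rich.progress import (
    BarColumn,
    Progress,
    TaskProgressColumn,
    TextColumn,
    TimeRemainingColumn,
)

console = Console()
progress = Progress(
    TextColumn("[progress.description]{task.description}"),
    BarColumn(complete_style="blue"),
    TaskProgressColumn(),
    TimeRemainingColumn(),
    console=console,
)
# total=None keeps the bar indeterminate until a real total is known.
task_id = progress.add_task("Fetching urls...", total=None)
progress.start()

limit = 20  # stand-in for configuration.limit
found = processed = 0
try:
    for _ in range(30):
        time.sleep(0.1)  # stand-in for awaiting a crawl request
        found += 1
        processed += 1
        progress.update(
            task_id,
            description=f"Obtained [bold blue]{processed}[/bold blue] urls out of "
            f"[bold blue]{found}[/bold blue]",
            completed=processed,
            # Switch to a bounded bar once more urls are known than the limit.
            total=limit if found > limit else None,
            refresh=True,
        )
finally:
    progress.stop()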
2 changes: 1 addition & 1 deletion dirhunt/crawler_url.py
@@ -81,7 +81,6 @@ async def retrieve(self, retries: Optional[int] = None) -> Optional["ProcessBase
await asyncio.sleep(RETRIES_WAIT)
return await self.retrieve(retries - 1)
else:
self.crawler.current_processed_count += 1
self.crawler.print_error(
f"Request error to {self.crawler_url.url}: {get_message_from_exception(e)}"
)
@@ -159,6 +158,7 @@ async def retrieve(self):

crawler_url_request = CrawlerUrlRequest(self)
processor = await crawler_url_request.retrieve()
self.crawler.current_processed_count += 1
if (
processor is not None
and not isinstance(processor, GenericProcessor)
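On the crawler_url.py side, the increment of current_processed_count moves out of the request-error branch and into CrawlerUrl.retrieve, right after the request finishes, so every url advances the counter exactly once whether its request succeeded or failed. A toy sketch of that counting pattern follows; it assumes nothing about dirhunt beyond the diff above, and all names in it are illustrative.

import asyncio
import random
from typing import Optional


class Crawler:
    """Toy stand-in that only tracks how many urls have been processed."""

    def __init__(self) -> None:
        self.current_processed_count = 0

    async def retrieve(self, url: str) -> None:
        body = await self._request(url)
        # Counted after the request finishes, success or error, so the
        # progress counter advances for failed urls too.
        self.current_processed_count += 1
        if body is not None:
            pass  # hand the response off to a processor here

    async def _request(self, url: str) -> Optional[str]:
        await asyncio.sleep(0.01)
        return None if random.random() < 0.3 else f"body of {url}"


async def main() -> None:
    crawler = Crawler()
    urls = [f"http://example.com/{i}" for i in range(10)]
    await asyncio.gather(*(crawler.retrieve(url) for url in urls))
    print(f"processed {crawler.current_processed_count} urls")


asyncio.run(main())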
