Issue #83: Use Asyncio.
Nekmo committed Aug 11, 2023
1 parent 0ba58e3 commit 86613a4
Showing 2 changed files with 34 additions and 6 deletions.
38 changes: 33 additions & 5 deletions dirhunt/crawler.py
@@ -15,6 +15,13 @@
from click import get_terminal_size
from rich.console import Console
from rich.text import Text
from rich.progress import (
Progress,
TaskProgressColumn,
TimeRemainingColumn,
BarColumn,
TextColumn,
)

from dirhunt import __version__
from dirhunt._compat import queue, Queue, unregister
@@ -77,9 +84,19 @@ def __init__(self, configuration: Configuration, loop: asyncio.AbstractEventLoop
self.processed = {}
self.add_lock = Lock()
self.start_dt = datetime.datetime.now()
self.total_crawler_urls: int = 0
self.current_processed_count: int = 0
self.sources = Sources(self)
self.domain_protocols: Dict[str, set] = defaultdict(set)
self.progress = Progress(
TextColumn("[progress.description]{task.description}"),
BarColumn(complete_style="blue"),
TaskProgressColumn(),
TimeRemainingColumn(),
console=self.console,
)
self.progress_task = self.progress.add_task("Fetching urls...", total=None)
self.progress.start()

async def start(self):
"""Add urls to process."""
@@ -106,20 +123,21 @@ async def restart(self):

async def add_crawler_url(self, crawler_url: CrawlerUrl) -> Optional[asyncio.Task]:
"""Add crawler_url to tasks"""
if crawler_url in self.crawler_urls or not self.in_domains(
crawler_url.url.domain
if (
self.total_crawler_urls > self.configuration.limit
or crawler_url in self.crawler_urls
or not self.in_domains(crawler_url.url.domain)
):
return
# TODO: move to CrawlerUrl after retrieve the data
self.current_processed_count += 1
self.crawler_urls.add(crawler_url)
await self.add_crawler_url_task(crawler_url)

async def add_crawler_url_task(self, crawler_url) -> asyncio.Task:
"""Add crawler_url to tasks"""
self.total_crawler_urls += 1
return self.add_task(
retry_error(crawler_url.retrieve, KeyboardInterrupt)(),
name=f"crawlerurl-{self.current_processed_count}",
name=f"crawlerurl-{self.total_crawler_urls}",
)

async def add_domain(self, domain: str):
@@ -154,6 +172,16 @@ def print_processor(self, processor: ProcessBase):
if 300 > processor.status >= 200:
self.add_domain_protocol(processor.crawler_url)
self.console.print(processor.get_text())
self.progress.update(
self.progress_task,
description=f"Obtained [bold blue]{self.current_processed_count}[/bold blue] urls out of "
f"[bold blue]{self.total_crawler_urls}[/bold blue]",
completed=self.current_processed_count,
refresh=True,
total=self.configuration.limit
if self.total_crawler_urls > self.configuration.limit
else None,
)

def add_domain_protocol(self, crawler_url: "CrawlerUrl"):
"""Add domain protocol"""
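Note on the crawler.py changes above: the new bar follows rich's indeterminate-total pattern, where a task is created with total=None (rendered as a pulsing bar) and only given a real total once the crawler has found more urls than the configured limit. Below is a minimal standalone sketch of that pattern; the loop and the limit/found/processed variables are illustrative stand-ins, not dirhunt code.

import time

from rich.console import Console
from rich.progress import (
    BarColumn,
    Progress,
    TaskProgressColumn,
    TextColumn,
    TimeRemainingColumn,
)

console = Console()
progress = Progress(
    TextColumn("[progress.description]{task.description}"),
    BarColumn(complete_style="blue"),
    TaskProgressColumn(),
    TimeRemainingColumn(),
    console=console,
)
# total=None keeps the bar indeterminate until a real total is known.
task_id = progress.add_task("Fetching urls...", total=None)
progress.start()

limit = 20  # stand-in for configuration.limit
found = processed = 0
try:
    for _ in range(30):
        time.sleep(0.1)  # stand-in for awaiting a crawl request
        found += 1
        processed += 1
        progress.update(
            task_id,
            description=f"Obtained [bold blue]{processed}[/bold blue] urls out of "
            f"[bold blue]{found}[/bold blue]",
            completed=processed,
            # Switch to a bounded bar once more urls are known than the limit.
            total=limit if found > limit else None,
            refresh=True,
        )
finally:
    progress.stop()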
2 changes: 1 addition & 1 deletion dirhunt/crawler_url.py
@@ -81,7 +81,6 @@ async def retrieve(self, retries: Optional[int] = None) -> Optional["ProcessBase
await asyncio.sleep(RETRIES_WAIT)
return await self.retrieve(retries - 1)
else:
self.crawler.current_processed_count += 1
self.crawler.print_error(
f"Request error to {self.crawler_url.url}: {get_message_from_exception(e)}"
)
@@ -159,6 +158,7 @@ async def retrieve(self):

crawler_url_request = CrawlerUrlRequest(self)
processor = await crawler_url_request.retrieve()
self.crawler.current_processed_count += 1
if (
processor is not None
and not isinstance(processor, GenericProcessor)
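On the crawler_url.py side, the increment of current_processed_count moves out of the request-error branch and into CrawlerUrl.retrieve, right after the request finishes, so every url advances the counter exactly once whether its request succeeded or failed. A toy sketch of that counting pattern follows; it assumes nothing about dirhunt beyond the diff above, and all names in it are illustrative.

import asyncio
import random
from typing import Optional


class Crawler:
    """Toy stand-in that only tracks how many urls have been processed."""

    def __init__(self) -> None:
        self.current_processed_count = 0

    async def retrieve(self, url: str) -> None:
        body = await self._request(url)
        # Counted after the request finishes, success or error, so the
        # progress counter advances for failed urls too.
        self.current_processed_count += 1
        if body is not None:
            pass  # hand the response off to a processor here

    async def _request(self, url: str) -> Optional[str]:
        await asyncio.sleep(0.01)
        return None if random.random() < 0.3 else f"body of {url}"


async def main() -> None:
    crawler = Crawler()
    urls = [f"http://example.com/{i}" for i in range(10)]
    await asyncio.gather(*(crawler.retrieve(url) for url in urls))
    print(f"processed {crawler.current_processed_count} urls")


asyncio.run(main())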
