Issue #83: Use Asyncio (Google source).
Nekmo committed Aug 10, 2023
1 parent fe2707d commit 1880a35
Showing 5 changed files with 108 additions and 23 deletions.
4 changes: 1 addition & 3 deletions dirhunt/crawler.py
@@ -14,7 +14,6 @@
 from click import get_terminal_size
 from rich.console import Console
 from rich.text import Text
-from rich.traceback import install

 from dirhunt import __version__
 from dirhunt._compat import queue, Queue, unregister
@@ -32,7 +31,6 @@


 resume_dir = os.path.expanduser("~/.cache/dirhunt/")
-install(show_locals=True)


 class DomainSemaphore:
@@ -68,7 +66,7 @@ def __init__(self, configuration: Configuration, loop: asyncio.AbstractEventLoop
         self.crawler_urls: Set[CrawlerUrl] = set()
         self.domains: Set[str] = set()
         self.console = Console(highlight=False)
-        self.session = Session()
+        self.session = Session(self)
         self.domain_semaphore = DomainSemaphore(configuration.concurrency)
         self.results = Queue()
         self.index_of_processors = []
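The constructor now hands the crawler itself to Session(self) and throttles fetches with DomainSemaphore(configuration.concurrency); the semaphore's body is collapsed in this diff. A minimal sketch of what a per-domain limiter of this shape could look like, assuming one lazily created asyncio.Semaphore per domain (an illustration, not the committed implementation):

import asyncio
from collections import defaultdict


class DomainSemaphore:
    """Cap concurrent requests per domain with one lazily created semaphore each."""

    def __init__(self, concurrency: int):
        # Each new domain key gets its own Semaphore(concurrency) on first use.
        self._semaphores = defaultdict(lambda: asyncio.Semaphore(concurrency))

    async def acquire(self, domain: str):
        await self._semaphores[domain].acquire()

    def release(self, domain: str):
        self._semaphores[domain].release()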
16 changes: 13 additions & 3 deletions dirhunt/sessions.py
@@ -4,9 +4,11 @@
 import warnings

 from aiohttp import ClientSession
+from multidict import CIMultiDict
 from requests import Timeout
 from requests.adapters import HTTPAdapter
 from requests.exceptions import ProxyError
+from typing_extensions import TYPE_CHECKING

 from dirhunt._compat import Queue
@@ -16,8 +18,9 @@

 from dirhunt.agents import get_random_user_agent

-if sys.version_info < (3, 0):
-    ConnectionError = IOError
+if TYPE_CHECKING:
+    from dirhunt.crawler import Crawler


 MAX_NEGATIVE_VOTES = -3
@@ -318,7 +321,14 @@ def __getitem__(self, item):


 class Session(ClientSession):
-    pass
+    def __init__(self, crawler: "Crawler", **kwargs):
+        headers = kwargs.pop("headers", {})
+        headers = CIMultiDict(headers)
+        if "User-Agent" not in headers:
+            headers["User-Agent"] = (
+                crawler.configuration.user_agent or get_random_user_agent()
+            )
+        super().__init__(headers=headers, **kwargs)


 class Sessions(object):
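Because the headers are wrapped in a CIMultiDict, the "User-Agent" not in headers check is case-insensitive, so a caller-supplied user-agent header in any casing suppresses the random default. A small self-contained sketch of the same defaulting pattern outside Dirhunt (the DefaultHeaderSession class and agent string are made up for illustration):

import asyncio

from aiohttp import ClientSession
from multidict import CIMultiDict


class DefaultHeaderSession(ClientSession):
    """Inject a default User-Agent unless the caller already set one."""

    def __init__(self, default_user_agent: str, **kwargs):
        headers = CIMultiDict(kwargs.pop("headers", {}))
        if "User-Agent" not in headers:  # case-insensitive lookup via CIMultiDict
            headers["User-Agent"] = default_user_agent
        super().__init__(headers=headers, **kwargs)


async def main():
    # A lower-case "user-agent" still counts as present, so the default is skipped.
    async with DefaultHeaderSession("dirhunt/test", headers={"user-agent": "custom"}) as session:
        assert session.headers["User-Agent"] == "custom"


asyncio.run(main())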
2 changes: 1 addition & 1 deletion dirhunt/sources/__init__.py
@@ -19,7 +19,7 @@
 SOURCE_CLASSES: List[Type["SourceBase"]] = [
     # Robots,
     # VirusTotal,
-    # Google,
+    Google,
     CommonCrawl,
     CrtSh,
     # CertificateSSL,
4 changes: 3 additions & 1 deletion dirhunt/sources/base.py
@@ -61,7 +61,9 @@ async def retrieve_urls(self, domain: str):
         try:
             urls = await self.search_by_domain(domain)
         except ClientError as e:
-            self.sources.crawler.print_error(str(e))
+            self.sources.crawler.print_error(
+                f"Failed to retrieve {domain} using the source {self.get_source_name()}: {e}"
+            )
             urls = []
         else:
             self.save_to_cache(urls)
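This method defines the contract every source follows: search_by_domain either returns an iterable of URLs or raises an aiohttp ClientError, which retrieve_urls reports through the crawler and converts into an empty result, caching only successes. A hypothetical subclass showing that contract (the ExampleSource name and its endpoint are invented):

from dirhunt.sources.base import SourceBase


class ExampleSource(SourceBase):
    async def search_by_domain(self, domain: str):
        # raise_for_status() raises ClientResponseError (a ClientError subclass),
        # which retrieve_urls catches, reports, and replaces with [].
        async with self.sources.crawler.session.get(
            f"https://api.example.com/urls?domain={domain}"  # hypothetical endpoint
        ) as response:
            response.raise_for_status()
            return [line for line in (await response.text()).splitlines() if line]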
105 changes: 90 additions & 15 deletions dirhunt/sources/google.py
@@ -1,20 +1,95 @@
+import asyncio
+import datetime
+import json
+import os
+from http.cookies import Morsel, SimpleCookie
+from pathlib import Path
+from typing import Iterable, Optional
+
 from dirhunt.sources.base import SourceBase
-from dirhunt._compat import URLError
-from googlesearch import search

 STOP_AFTER = 20
+TIMEOUT = 10
+WAIT = 2
+GOOGLE_INDEX_URL = "https://www.google.com/"
+GOOGLE_SEARCH_URL = "https://www.google.com/search"


 class Google(SourceBase):
-    def callback(self, domain):
-        results = search("site:{}".format(domain), stop=STOP_AFTER)
-        while True:
-            try:
-                url = next(results)
-            except (IOError, URLError) as e:
-                self.add_error("Error on Google Source: {}".format(e))
-                break
-            except StopIteration:
-                break
-            else:
-                self.add_result(url)
+    """Google Source class."""
+
+    @property
+    def google_cookies(self) -> Optional[SimpleCookie]:
+        return self.sources.crawler.session.cookie_jar._cookies.get(
+            ("google.com", "/")
+        )
+
+    @property
+    def google_consent_cookie(self) -> Optional[Morsel]:
+        return self.google_cookies.get("CONSENT") if self.google_cookies else None
+
+    @property
+    def google_cookies_path(self) -> Path:
+        return self.cache_dir / "google_cookies.txt"
+
+    async def request(self, url: str, params: Optional[dict] = None) -> str:
+        """Request a Google page and return its body."""
+        async with self.sources.crawler.session.get(
+            url,
+            params=params,
+            timeout=TIMEOUT,
+            headers={
+                "User-Agent": "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)"
+            },
+        ) as response:
+            response.raise_for_status()
+            return await response.text()
+
+    def save_cookies(self):
+        """Save cookies to file."""
+        data = self.google_cookies.output(header="")
+        os.makedirs(str(self.google_cookies_path.parent), exist_ok=True)
+        with open(self.google_cookies_path, "w") as f:
+            f.write(data)
+
+    def load_cookies(self):
+        """Load cookies from file."""
+        with open(self.google_cookies_path, "r") as f:
+            lines = f.readlines()
+        cookie = SimpleCookie()
+        for line in lines:
+            cookie.load(line)
+        self.sources.crawler.session.cookie_jar._cookies[
+            ("google.com", "/")
+        ] = cookie
+
+    async def search_by_domain(self, domain: str) -> Iterable[str]:
+        """Search by domain in Google."""
+        # TODO: lock for concurrent requests.
+        # Load cookies from file if it exists, or request them from Google if not.
+        cookies_path_exists = self.google_cookies_path.exists()
+        if not self.google_cookies and cookies_path_exists:
+            self.load_cookies()
+        if not self.google_cookies and not cookies_path_exists:
+            await self.request(GOOGLE_INDEX_URL)
+            await asyncio.sleep(WAIT)
+        # Set the consent cookie if it is still pending.
+        if self.google_consent_cookie and self.google_consent_cookie.value.startswith(
+            "PENDING"
+        ):
+            now = datetime.datetime.now()
+            cookie_value = f"YES+cb.{now.year}{now.month:02}{now.day:02}-17-p0.de+F+678"
+            self.google_consent_cookie.set("CONSENT", cookie_value, cookie_value)
+        # Save cookies to file if the file does not exist yet.
+        if self.google_cookies and not cookies_path_exists:
+            self.save_cookies()
+        text = await self.request(
+            GOOGLE_SEARCH_URL,
+            params={
+                "q": f"site:{domain}",
+                "hl": "en",
+                "tbs": "0",
+                "safe": "off",
+                "cr": "",
+                "btnG": "Google Search",
+            },
+        )
+        # TODO:
+        return []

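search_by_domain still ends at a TODO and returns an empty list. One way the extraction could later be filled in, assuming Google's classic HTML layout where each result anchor is a /url?q=<target>&... redirect (this parser is a sketch, not part of the commit):

import re
from urllib.parse import parse_qs, urlparse


def extract_result_urls(html: str, domain: str) -> list:
    """Pull result URLs out of a classic Google results page (illustrative only)."""
    urls = []
    for href in re.findall(r'href="(/url\?q=[^"]+)"', html):
        # /url?q=https://example.com/path&sa=... -> https://example.com/path
        query = parse_qs(urlparse(href).query)
        for target in query.get("q", []):
            if domain in target:
                urls.append(target)
    return urls

In the committed code, URLs recovered this way would become the list that retrieve_urls caches and feeds to the crawler.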