Skip to content

Commit

Permalink
feat(scraper): add basic auth and cookies (#102)
Browse files: browse the repository at this point in the history
  • Loading branch information
vvatelot authored Dec 19, 2024
1 parent 2821046 commit 9d631d6
Showing 1 changed file with 14 additions and 3 deletions.
17 changes: 14 additions & 3 deletions components/ecoindex/scraper/scrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
from ecoindex.models.compute import PageMetrics, Result, ScreenShot, WindowSize
from ecoindex.models.scraper import MimetypeAggregation, RequestItem, Requests
from ecoindex.utils.screenshots import convert_screenshot_to_webp, set_screenshot_rights
from playwright._impl._api_structures import SetCookieParam
from playwright.async_api import async_playwright
from playwright_stealth import stealth_async
from typing_extensions import deprecated


Expand All @@ -26,6 +26,8 @@ def __init__(
screenshot_gid: int | None = None,
page_load_timeout: int = 20,
headless: bool = True,
basic_auth: str | None = None,
cookies: list[SetCookieParam] = [],
):
self.url = url
self.window_size = window_size
Expand All @@ -41,6 +43,8 @@ def __init__(
f"/tmp/ecoindex-{self.now.strftime('%Y-%m-%d-%H-%M-%S-%f')}-{uuid4()}.har"
)
self.headless = headless
self.basic_auth = basic_auth
self.cookies = cookies

@deprecated("This method is useless with new version of EcoindexScraper")
def init_chromedriver(self):
Expand All @@ -67,12 +71,19 @@ async def get_requests_by_category(self) -> MimetypeAggregation:
async def scrap_page(self) -> PageMetrics:
async with async_playwright() as p:
browser = await p.chromium.launch(headless=self.headless)
self.page = await browser.new_page(
self.context = await browser.new_context(
record_har_path=self.har_temp_file_path,
screen=self.window_size.model_dump(),
ignore_https_errors=True,
http_credentials={
"username": self.basic_auth.split(":")[0],
"password": self.basic_auth.split(":")[1],
}
if self.basic_auth
else None,
)
await stealth_async(self.page)
await self.context.add_cookies(self.cookies)
self.page = await self.context.new_page()
response = await self.page.goto(self.url)
await self.check_page_response(response)

Expand Down

0 comments on commit 9d631d6

Please sign in to comment.