Skip to content

Commit

Permalink
refactor(Playwright Engine): Separate what we can for cleaner code and the async function later
Browse files Browse the repository at this point in the history

The caching will give a slight performance increase with bulk requests
  • Loading branch information
D4Vinci committed Dec 15, 2024
1 parent 6c17bd8 commit 889c111
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 73 deletions.
145 changes: 73 additions & 72 deletions scrapling/engines/pw.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import json

from scrapling.core._types import Callable, Dict, List, Optional, Union
from scrapling.core.utils import log
from scrapling.core._types import Callable, Dict, Optional, Union
from scrapling.core.utils import log, lru_cache
from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
NSTBROWSER_DEFAULT_QUERY)
from scrapling.engines.toolbelt import (Response, StatusText,
check_type_validity, construct_cdp_url,
construct_proxy_dict, do_nothing,
do_nothing_async,
generate_convincing_referer,
generate_headers, intercept_route,
js_bypass_path)
Expand Down Expand Up @@ -94,10 +95,8 @@ def __init__(
# '--disable-extensions',
]

def _cdp_url_logic(self, flags: Optional[List] = None) -> str:
def _cdp_url_logic(self) -> str:
"""Constructs new CDP URL if NSTBrowser is enabled otherwise return CDP URL as it is
:param flags: Chrome flags to be added to NSTBrowser query
:return: CDP URL
"""
cdp_url = self.cdp_url
Expand All @@ -106,7 +105,8 @@ def _cdp_url_logic(self, flags: Optional[List] = None) -> str:
config = self.nstbrowser_config
else:
query = NSTBROWSER_DEFAULT_QUERY.copy()
if flags:
if self.stealth:
flags = self.__set_flags()
query.update({
"args": dict(zip(flags, [''] * len(flags))), # browser args should be a dictionary
})
Expand All @@ -122,6 +122,68 @@ def _cdp_url_logic(self, flags: Optional[List] = None) -> str:

return cdp_url

@staticmethod
@lru_cache(typed=True)
def __flags_for(hide_canvas, disable_webgl):
    """Build the tuple of stealth Chrome flags for the given feature toggles.

    Cached per (hide_canvas, disable_webgl) combination. Caching on a
    staticmethod avoids the `lru_cache`-on-instance-method pitfall where
    the cache keys on `self` and keeps every engine instance alive for
    the lifetime of the cache (ruff B019).
    """
    # Defensive copy: if DEFAULT_STEALTH_FLAGS were ever a list, `+=` on it
    # would mutate the shared module-level constant in place. A tuple is
    # also hashable, which keeps the result cache-friendly downstream.
    flags = tuple(DEFAULT_STEALTH_FLAGS)
    if hide_canvas:
        flags += ('--fingerprinting-canvas-image-data-noise',)
    if disable_webgl:
        flags += ('--disable-webgl', '--disable-webgl-image-chromium', '--disable-webgl2',)

    return flags

def __set_flags(self):
    """Returns the flags that will be used while launching the browser if stealth mode is enabled"""
    return self.__flags_for(self.hide_canvas, self.disable_webgl)

def __launch_kwargs(self):
    """Assemble the keyword arguments used to launch playwright's browser.

    Stealth mode adds the stealth flag set and enables the Chromium sandbox.
    """
    browser_channel = 'chrome' if self.real_chrome else 'chromium'
    kwargs = {
        'headless': self.headless,
        'ignore_default_args': self.harmful_default_args,
        'channel': browser_channel,
    }
    if self.stealth:
        kwargs['args'] = self.__set_flags()
        kwargs['chromium_sandbox'] = True

    return kwargs

def __context_kwargs(self):
    """Assemble the keyword arguments for creating the browser context."""
    # Fall back to a generated browser User-Agent only when none was supplied
    user_agent = self.useragent or generate_headers(browser_mode=True).get('User-Agent')
    kwargs = {
        "proxy": self.proxy,
        "locale": self.locale,
        "color_scheme": 'dark',  # Bypasses the 'prefersLightColor' check in creepjs
        "device_scale_factor": 2,
        "extra_http_headers": self.extra_headers or {},
        "user_agent": user_agent,
    }
    if not self.stealth:
        return kwargs

    kwargs.update({
        'is_mobile': False,
        'has_touch': False,
        # I'm thinking about disabling it to rest from all Service Workers headache but let's keep it as it is for now
        'service_workers': 'allow',
        'ignore_https_errors': True,
        'screen': {'width': 1920, 'height': 1080},
        'viewport': {'width': 1920, 'height': 1080},
        'permissions': ['geolocation', 'notifications'],
    })
    return kwargs

@staticmethod
@lru_cache()
def __stealth_scripts():
    """Return the ordered tuple of JS bypass script paths injected in stealth mode.

    The result never reads instance state, so it is cached behind a
    staticmethod: putting `lru_cache` directly on an instance method would
    key the cache on `self` and keep every engine instance alive for the
    cache's lifetime (ruff B019). Instance calls (`self.__stealth_scripts()`)
    still work unchanged.
    """
    # Basic bypasses nothing fancy as I'm still working on it
    # But with adding these bypasses to the above config, it bypasses many online tests like
    # https://bot.sannysoft.com/
    # https://kaliiiiiiiiii.github.io/brotector/
    # https://pixelscan.net/
    # https://iphey.com/
    # https://www.browserscan.net/bot-detection <== this one also checks for the CDP runtime fingerprint
    # https://arh.antoinevastel.com/bots/areyouheadless/
    # https://prescience-data.github.io/execution-monitor.html
    return tuple(
        js_bypass_path(script) for script in (
            # Order is important
            'webdriver_fully.js', 'window_chrome.js', 'navigator_plugins.js', 'pdf_viewer.js',
            'notification_permission.js', 'screen_props.js', 'playwright_fingerprint.js'
        )
    )

def fetch(self, url: str) -> Response:
"""Opens up the browser and do your request based on your chosen options.
Expand All @@ -135,61 +197,14 @@ def fetch(self, url: str) -> Response:
from rebrowser_playwright.sync_api import sync_playwright

with sync_playwright() as p:
# Handle the UserAgent early
if self.useragent:
extra_headers = {}
useragent = self.useragent
else:
extra_headers = {}
useragent = generate_headers(browser_mode=True).get('User-Agent')

# Prepare the flags before diving
flags = DEFAULT_STEALTH_FLAGS
if self.hide_canvas:
flags += ['--fingerprinting-canvas-image-data-noise']
if self.disable_webgl:
flags += ['--disable-webgl', '--disable-webgl-image-chromium', '--disable-webgl2']

# Creating the browser
if self.cdp_url:
cdp_url = self._cdp_url_logic(flags if self.stealth else None)
cdp_url = self._cdp_url_logic()
browser = p.chromium.connect_over_cdp(endpoint_url=cdp_url)
else:
if self.stealth:
browser = p.chromium.launch(
headless=self.headless, args=flags, ignore_default_args=self.harmful_default_args, chromium_sandbox=True, channel='chrome' if self.real_chrome else 'chromium'
)
else:
browser = p.chromium.launch(headless=self.headless, ignore_default_args=self.harmful_default_args, channel='chrome' if self.real_chrome else 'chromium')

# Creating the context
if self.stealth:
context = browser.new_context(
locale=self.locale,
is_mobile=False,
has_touch=False,
proxy=self.proxy,
color_scheme='dark', # Bypasses the 'prefersLightColor' check in creepjs
user_agent=useragent,
device_scale_factor=2,
# I'm thinking about disabling it to rest from all Service Workers headache but let's keep it as it is for now
service_workers="allow",
ignore_https_errors=True,
extra_http_headers=extra_headers,
screen={"width": 1920, "height": 1080},
viewport={"width": 1920, "height": 1080},
permissions=["geolocation", 'notifications'],
)
else:
context = browser.new_context(
locale=self.locale,
proxy=self.proxy,
color_scheme='dark',
user_agent=useragent,
device_scale_factor=2,
extra_http_headers=extra_headers
)
browser = p.chromium.launch(**self.__launch_kwargs())

context = browser.new_context(**self.__context_kwargs())
# Finally we are in business
page = context.new_page()
page.set_default_navigation_timeout(self.timeout)
Expand All @@ -202,22 +217,8 @@ def fetch(self, url: str) -> Response:
page.route("**/*", intercept_route)

if self.stealth:
# Basic bypasses nothing fancy as I'm still working on it
# But with adding these bypasses to the above config, it bypasses many online tests like
# https://bot.sannysoft.com/
# https://kaliiiiiiiiii.github.io/brotector/
# https://pixelscan.net/
# https://iphey.com/
# https://www.browserscan.net/bot-detection <== this one also checks for the CDP runtime fingerprint
# https://arh.antoinevastel.com/bots/areyouheadless/
# https://prescience-data.github.io/execution-monitor.html
page.add_init_script(path=js_bypass_path('webdriver_fully.js'))
page.add_init_script(path=js_bypass_path('window_chrome.js'))
page.add_init_script(path=js_bypass_path('navigator_plugins.js'))
page.add_init_script(path=js_bypass_path('pdf_viewer.js'))
page.add_init_script(path=js_bypass_path('notification_permission.js'))
page.add_init_script(path=js_bypass_path('screen_props.js'))
page.add_init_script(path=js_bypass_path('playwright_fingerprint.js'))
for script in self.__stealth_scripts():
page.add_init_script(path=script)

res = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
page.wait_for_load_state(state="domcontentloaded")
Expand Down
3 changes: 2 additions & 1 deletion scrapling/engines/toolbelt/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from .custom import (BaseFetcher, Response, StatusText, check_if_engine_usable,
check_type_validity, do_nothing, get_variable_name)
check_type_validity, do_nothing, do_nothing_async,
get_variable_name)
from .fingerprints import (generate_convincing_referer, generate_headers,
get_os_name)
from .navigation import (construct_cdp_url, construct_proxy_dict,
Expand Down
5 changes: 5 additions & 0 deletions scrapling/engines/toolbelt/custom.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,3 +302,8 @@ def check_type_validity(variable: Any, valid_types: Union[List[Type], None], def
def do_nothing(page):
    """Identity callback: return *page* untouched.

    Serves as the default filler for the `page_action` argument in the
    browser engines.
    """
    return page


async def do_nothing_async(page):
    """Async identity callback: return *page* untouched.

    Serves as the default filler for the `page_action` argument in the
    asynchronous browser engines.
    """
    return page

0 comments on commit 889c111

Please sign in to comment.