From 889c111855bfd3c24bf1292cff7d48cddcb286af Mon Sep 17 00:00:00 2001
From: Karim shoair
Date: Sun, 15 Dec 2024 23:25:02 +0200
Subject: [PATCH] refactor(Playwright Engine): Separate what we can for cleaner
 code and the async function later

The caching will give a slight performance increase with bulk requests
---
 scrapling/engines/pw.py                | 145 +++++++++++++------------
 scrapling/engines/toolbelt/__init__.py |   3 +-
 scrapling/engines/toolbelt/custom.py   |   5 +
 3 files changed, 80 insertions(+), 73 deletions(-)

diff --git a/scrapling/engines/pw.py b/scrapling/engines/pw.py
index 8b2895c..e210a7b 100644
--- a/scrapling/engines/pw.py
+++ b/scrapling/engines/pw.py
@@ -1,12 +1,13 @@
 import json
 
-from scrapling.core._types import Callable, Dict, List, Optional, Union
-from scrapling.core.utils import log
+from scrapling.core._types import Callable, Dict, Optional, Union
+from scrapling.core.utils import log, lru_cache
 from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
                                          NSTBROWSER_DEFAULT_QUERY)
 from scrapling.engines.toolbelt import (Response, StatusText,
                                         check_type_validity, construct_cdp_url,
                                         construct_proxy_dict, do_nothing,
+                                        do_nothing_async,
                                         generate_convincing_referer,
                                         generate_headers, intercept_route,
                                         js_bypass_path)
@@ -94,10 +95,8 @@ def __init__(
             # '--disable-extensions',
         ]
 
-    def _cdp_url_logic(self, flags: Optional[List] = None) -> str:
+    def _cdp_url_logic(self) -> str:
         """Constructs new CDP URL if NSTBrowser is enabled otherwise return CDP URL as it is
-
-        :param flags: Chrome flags to be added to NSTBrowser query
         :return: CDP URL
         """
         cdp_url = self.cdp_url
@@ -106,7 +105,8 @@ def _cdp_url_logic(self, flags: Optional[List] = None) -> str:
                 config = self.nstbrowser_config
             else:
                 query = NSTBROWSER_DEFAULT_QUERY.copy()
-                if flags:
+                if self.stealth:
+                    flags = self.__set_flags()
                     query.update({
                         "args": dict(zip(flags, [''] * len(flags))),  # browser args should be a dictionary
                     })
@@ -122,6 +122,68 @@ def _cdp_url_logic(self, flags: Optional[List] = None) -> str:
 
         return cdp_url
 
+    @lru_cache(typed=True)
+    def __set_flags(self):
+        """Returns the flags that will be used while launching the browser if stealth mode is enabled"""
+        flags = DEFAULT_STEALTH_FLAGS
+        if self.hide_canvas:
+            flags += ('--fingerprinting-canvas-image-data-noise',)
+        if self.disable_webgl:
+            flags += ('--disable-webgl', '--disable-webgl-image-chromium', '--disable-webgl2',)
+
+        return flags
+
+    def __launch_kwargs(self):
+        """Creates the arguments we will use while launching playwright's browser"""
+        launch_kwargs = {'headless': self.headless, 'ignore_default_args': self.harmful_default_args, 'channel': 'chrome' if self.real_chrome else 'chromium'}
+        if self.stealth:
+            launch_kwargs.update({'args': self.__set_flags(), 'chromium_sandbox': True})
+
+        return launch_kwargs
+
+    def __context_kwargs(self):
+        """Creates the arguments for the browser context"""
+        context_kwargs = {
+            "proxy": self.proxy,
+            "locale": self.locale,
+            "color_scheme": 'dark',  # Bypasses the 'prefersLightColor' check in creepjs
+            "device_scale_factor": 2,
+            "extra_http_headers": self.extra_headers if self.extra_headers else {},
+            "user_agent": self.useragent if self.useragent else generate_headers(browser_mode=True).get('User-Agent'),
+        }
+        if self.stealth:
+            context_kwargs.update({
+                'is_mobile': False,
+                'has_touch': False,
+                # I'm thinking about disabling it to rest from all Service Workers headache but let's keep it as it is for now
+                'service_workers': 'allow',
+                'ignore_https_errors': True,
+                'screen': {'width': 1920, 'height': 1080},
+                'viewport': {'width': 1920, 'height': 1080},
+                'permissions': ['geolocation', 'notifications']
+            })
+
+        return context_kwargs
+
+    @lru_cache()
+    def __stealth_scripts(self):
+        # Basic bypasses nothing fancy as I'm still working on it
+        # But with adding these bypasses to the above config, it bypasses many online tests like
+        # https://bot.sannysoft.com/
+        # https://kaliiiiiiiiii.github.io/brotector/
+        # https://pixelscan.net/
+        # https://iphey.com/
+        # https://www.browserscan.net/bot-detection <== this one also checks for the CDP runtime fingerprint
+        # https://arh.antoinevastel.com/bots/areyouheadless/
+        # https://prescience-data.github.io/execution-monitor.html
+        return tuple(
+            js_bypass_path(script) for script in (
+                # Order is important
+                'webdriver_fully.js', 'window_chrome.js', 'navigator_plugins.js', 'pdf_viewer.js',
+                'notification_permission.js', 'screen_props.js', 'playwright_fingerprint.js'
+            )
+        )
+
     def fetch(self, url: str) -> Response:
         """Opens up the browser and do your request based on your chosen options.
 
@@ -135,61 +197,14 @@ def fetch(self, url: str) -> Response:
 
         from rebrowser_playwright.sync_api import sync_playwright
 
         with sync_playwright() as p:
-            # Handle the UserAgent early
-            if self.useragent:
-                extra_headers = {}
-                useragent = self.useragent
-            else:
-                extra_headers = {}
-                useragent = generate_headers(browser_mode=True).get('User-Agent')
-
-            # Prepare the flags before diving
-            flags = DEFAULT_STEALTH_FLAGS
-            if self.hide_canvas:
-                flags += ['--fingerprinting-canvas-image-data-noise']
-            if self.disable_webgl:
-                flags += ['--disable-webgl', '--disable-webgl-image-chromium', '--disable-webgl2']
-
             # Creating the browser
             if self.cdp_url:
-                cdp_url = self._cdp_url_logic(flags if self.stealth else None)
+                cdp_url = self._cdp_url_logic()
                 browser = p.chromium.connect_over_cdp(endpoint_url=cdp_url)
             else:
-                if self.stealth:
-                    browser = p.chromium.launch(
-                        headless=self.headless, args=flags, ignore_default_args=self.harmful_default_args, chromium_sandbox=True, channel='chrome' if self.real_chrome else 'chromium'
-                    )
-                else:
-                    browser = p.chromium.launch(headless=self.headless, ignore_default_args=self.harmful_default_args, channel='chrome' if self.real_chrome else 'chromium')
-
-            # Creating the context
-            if self.stealth:
-                context = browser.new_context(
-                    locale=self.locale,
-                    is_mobile=False,
-                    has_touch=False,
-                    proxy=self.proxy,
-                    color_scheme='dark',  # Bypasses the 'prefersLightColor' check in creepjs
-                    user_agent=useragent,
-                    device_scale_factor=2,
-                    # I'm thinking about disabling it to rest from all Service Workers headache but let's keep it as it is for now
-                    service_workers="allow",
-                    ignore_https_errors=True,
-                    extra_http_headers=extra_headers,
-                    screen={"width": 1920, "height": 1080},
-                    viewport={"width": 1920, "height": 1080},
-                    permissions=["geolocation", 'notifications'],
-                )
-            else:
-                context = browser.new_context(
-                    locale=self.locale,
-                    proxy=self.proxy,
-                    color_scheme='dark',
-                    user_agent=useragent,
-                    device_scale_factor=2,
-                    extra_http_headers=extra_headers
-                )
+                browser = p.chromium.launch(**self.__launch_kwargs())
+            context = browser.new_context(**self.__context_kwargs())
             # Finally we are in business
             page = context.new_page()
             page.set_default_navigation_timeout(self.timeout)
@@ -202,22 +217,8 @@ def fetch(self, url: str) -> Response:
                 page.route("**/*", intercept_route)
 
             if self.stealth:
-                # Basic bypasses nothing fancy as I'm still working on it
-                # But with adding these bypasses to the above config, it bypasses many online tests like
-                # https://bot.sannysoft.com/
-                # https://kaliiiiiiiiii.github.io/brotector/
-                # https://pixelscan.net/
-                # https://iphey.com/
-                # https://www.browserscan.net/bot-detection <== this one also checks for the CDP runtime fingerprint
-                # https://arh.antoinevastel.com/bots/areyouheadless/
-                # https://prescience-data.github.io/execution-monitor.html
-                page.add_init_script(path=js_bypass_path('webdriver_fully.js'))
-                page.add_init_script(path=js_bypass_path('window_chrome.js'))
-                page.add_init_script(path=js_bypass_path('navigator_plugins.js'))
-                page.add_init_script(path=js_bypass_path('pdf_viewer.js'))
-                page.add_init_script(path=js_bypass_path('notification_permission.js'))
-                page.add_init_script(path=js_bypass_path('screen_props.js'))
-                page.add_init_script(path=js_bypass_path('playwright_fingerprint.js'))
+                for script in self.__stealth_scripts():
+                    page.add_init_script(path=script)
 
             res = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
             page.wait_for_load_state(state="domcontentloaded")
diff --git a/scrapling/engines/toolbelt/__init__.py b/scrapling/engines/toolbelt/__init__.py
index 595929c..4f31f6a 100644
--- a/scrapling/engines/toolbelt/__init__.py
+++ b/scrapling/engines/toolbelt/__init__.py
@@ -1,5 +1,6 @@
 from .custom import (BaseFetcher, Response, StatusText, check_if_engine_usable,
-                     check_type_validity, do_nothing, get_variable_name)
+                     check_type_validity, do_nothing, do_nothing_async,
+                     get_variable_name)
 from .fingerprints import (generate_convincing_referer, generate_headers,
                            get_os_name)
 from .navigation import (construct_cdp_url, construct_proxy_dict,
diff --git a/scrapling/engines/toolbelt/custom.py b/scrapling/engines/toolbelt/custom.py
index 6632b6b..0db3088 100644
--- a/scrapling/engines/toolbelt/custom.py
+++ b/scrapling/engines/toolbelt/custom.py
@@ -302,3 +302,8 @@ def check_type_validity(variable: Any, valid_types: Union[List[Type], None], def
 def do_nothing(page):
     # Just works as a filler for `page_action` argument in browser engines
     return page
+
+
+async def do_nothing_async(page):
+    # Just works as a filler for `page_action` argument in browser engines
+    return page
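
A note on the caching mentioned in the commit message: __set_flags() and __stealth_scripts() now return tuples and are wrapped in lru_cache, so repeated fetch() calls on the same engine instance reuse the already-built flags and bypass-script paths instead of recomputing them per request, which is where the slight speed-up for bulk requests comes from. The snippet below is a minimal sketch of that pattern, not scrapling's actual class: EngineSketch and the flag values are stand-ins, and it assumes the lru_cache imported from scrapling.core.utils behaves like functools.lru_cache.

    from functools import lru_cache  # assumed equivalent of scrapling.core.utils.lru_cache

    DEFAULT_STEALTH_FLAGS = ('--no-first-run', '--disable-infobars')  # stand-in values only


    class EngineSketch:
        """Illustrative stand-in for the engine class; only the caching pattern matters here."""

        def __init__(self, hide_canvas: bool = False, disable_webgl: bool = False):
            self.hide_canvas = hide_canvas
            self.disable_webgl = disable_webgl

        @lru_cache(typed=True)
        def _set_flags(self):
            # Tuples rather than lists: the cached return value is immutable,
            # so handing the same object back on every call is safe.
            flags = DEFAULT_STEALTH_FLAGS
            if self.hide_canvas:
                flags += ('--fingerprinting-canvas-image-data-noise',)
            if self.disable_webgl:
                flags += ('--disable-webgl', '--disable-webgl-image-chromium', '--disable-webgl2')
            return flags


    engine = EngineSketch(hide_canvas=True)
    assert engine._set_flags() is engine._set_flags()  # second call is served from the cache

One trade-off of lru_cache on instance methods is that the cache is keyed on self, so the instance has to stay hashable and the cache holds a reference to it for the life of the process; for long-lived engine objects doing bulk requests that is exactly the behaviour wanted here.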
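
On the "async function later" part of the subject: this patch only adds the do_nothing_async filler, but the extracted __launch_kwargs()/__context_kwargs()/__stealth_scripts() helpers are what a future async fetch() would reuse. The sketch below is purely hypothetical and not part of the patch; it assumes rebrowser_playwright mirrors Playwright's async_api (the upstream playwright package stands in here), and plain dicts/tuples stand in for the private helpers' return values.

    import asyncio

    from playwright.async_api import async_playwright  # stand-in for rebrowser_playwright.async_api


    async def async_fetch_sketch(url: str, launch_kwargs: dict, context_kwargs: dict,
                                 stealth_scripts: tuple = (), page_action=None,
                                 timeout: float = 30000):
        # Hypothetical helper: argument names stand in for the outputs of
        # __launch_kwargs(), __context_kwargs() and __stealth_scripts(), and
        # `page_action` for a coroutine such as the new do_nothing_async.
        async with async_playwright() as p:
            browser = await p.chromium.launch(**launch_kwargs)
            context = await browser.new_context(**context_kwargs)
            page = await context.new_page()
            page.set_default_navigation_timeout(timeout)  # setters stay sync in the async API
            for script in stealth_scripts:
                await page.add_init_script(path=script)  # same bypass scripts, just awaited
            res = await page.goto(url)
            await page.wait_for_load_state(state="domcontentloaded")
            if page_action is not None:
                page = await page_action(page)  # assumed hook placement, loosely mirroring the sync engine
            status = res.status if res else None
            html = await page.content()
            await context.close()
            await browser.close()
            return status, html


    # Example usage (assumed values):
    # asyncio.run(async_fetch_sketch("https://example.com", {'headless': True}, {'locale': 'en-US'}))

The point of the refactor holds either way: only the Playwright calls change to awaited ones, while all option-building stays in the shared, cached helpers.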