From 213a04405d6d02e3349387d1cfe8dcd25ed7e64e Mon Sep 17 00:00:00 2001 From: morpheus65535 Date: Thu, 7 Mar 2024 21:43:26 -0500 Subject: [PATCH] Rolled back cloudscraper to fix captcha v1 solving issue. --- .../INSTALLER | 0 .../LICENSE | 0 .../METADATA | 110 +--- .../RECORD | 28 +- .../REQUESTED | 0 .../WHEEL | 0 .../top_level.txt | 0 libs/cloudscraper/__init__.py | 577 +++++++++++++++--- libs/cloudscraper/captcha/2captcha.py | 28 +- libs/cloudscraper/captcha/9kw.py | 66 +- libs/cloudscraper/captcha/anticaptcha.py | 226 +++---- libs/cloudscraper/captcha/capmonster.py | 17 +- libs/cloudscraper/captcha/capsolver.py | 188 ------ libs/cloudscraper/captcha/deathbycaptcha.py | 14 +- libs/cloudscraper/cloudflare.py | 490 --------------- libs/version.txt | 2 +- 16 files changed, 676 insertions(+), 1070 deletions(-) rename libs/{cloudscraper-1.2.71.dist-info => cloudscraper-1.2.58.dist-info}/INSTALLER (100%) rename libs/{cloudscraper-1.2.71.dist-info => cloudscraper-1.2.58.dist-info}/LICENSE (100%) rename libs/{cloudscraper-1.2.71.dist-info => cloudscraper-1.2.58.dist-info}/METADATA (92%) rename libs/{cloudscraper-1.2.71.dist-info => cloudscraper-1.2.58.dist-info}/RECORD (53%) rename libs/{cloudscraper-1.2.71.dist-info => cloudscraper-1.2.58.dist-info}/REQUESTED (100%) rename libs/{cloudscraper-1.2.71.dist-info => cloudscraper-1.2.58.dist-info}/WHEEL (100%) rename libs/{cloudscraper-1.2.71.dist-info => cloudscraper-1.2.58.dist-info}/top_level.txt (100%) delete mode 100644 libs/cloudscraper/captcha/capsolver.py delete mode 100644 libs/cloudscraper/cloudflare.py diff --git a/libs/cloudscraper-1.2.71.dist-info/INSTALLER b/libs/cloudscraper-1.2.58.dist-info/INSTALLER similarity index 100% rename from libs/cloudscraper-1.2.71.dist-info/INSTALLER rename to libs/cloudscraper-1.2.58.dist-info/INSTALLER diff --git a/libs/cloudscraper-1.2.71.dist-info/LICENSE b/libs/cloudscraper-1.2.58.dist-info/LICENSE similarity index 100% rename from libs/cloudscraper-1.2.71.dist-info/LICENSE rename to libs/cloudscraper-1.2.58.dist-info/LICENSE diff --git a/libs/cloudscraper-1.2.71.dist-info/METADATA b/libs/cloudscraper-1.2.58.dist-info/METADATA similarity index 92% rename from libs/cloudscraper-1.2.71.dist-info/METADATA rename to libs/cloudscraper-1.2.58.dist-info/METADATA index a248c8208..b75a59542 100644 --- a/libs/cloudscraper-1.2.71.dist-info/METADATA +++ b/libs/cloudscraper-1.2.58.dist-info/METADATA @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: cloudscraper -Version: 1.2.71 +Version: 1.2.58 Summary: A Python module to bypass Cloudflare's anti-bot page. Home-page: https://github.com/venomous/cloudscraper Author: VeNoMouS @@ -82,6 +82,31 @@ We support the following Javascript interpreters/engines. - **[Node.js](https://nodejs.org/)** - **[V8](https://github.com/sony/v8eval/):** We use Sony's [v8eval](https://v8.dev)() python module. +# Updates + +Cloudflare modifies their anti-bot protection page occasionally, So far it has changed maybe once per year on average. + +If you notice that the anti-bot page has changed, or if this module suddenly stops working, please create a GitHub issue so that I can update the code accordingly. + +- Many issues are a result of users not updating to the latest release of this project. Before filing an issue, please run the following command: + +``` +pip show cloudscraper +``` + +If the value of the version field is not the latest release, please run the following to update your package: + +``` +pip install cloudscraper -U +``` + +If you are still encountering a problem, open an issue and please include: + +- The full exception and stack trace. +- The URL of the Cloudflare-protected page which the script does not work on. +- A Pastebin or Gist containing the HTML source of the protected page. +- The version number from `pip show cloudscraper`. + # Usage The simplest way to use cloudscraper is by calling `create_scraper()`. @@ -104,26 +129,6 @@ Consult [Requests' documentation](http://docs.python-requests.org/en/latest/user ## Options -### Disable Cloudflare V1 -#### Description - -If you don't want to even attempt Cloudflare v1 (Deprecated) solving.. - -#### Parameters - - -|Parameter|Value|Default| -|-------------|:-------------:|:-----:| -|disableCloudflareV1|(boolean)|False| - -#### Example - -```python -scraper = cloudscraper.create_scraper(disableCloudflareV1=True) -``` - ------- - ### Brotli #### Description @@ -327,7 +332,6 @@ scraper = cloudscraper.create_scraper(interpreter='nodejs') - **[2captcha](https://www.2captcha.com/)** - **[anticaptcha](https://www.anti-captcha.com/)** -- **[CapSolver](https://capsolver.com/)** - **[CapMonster Cloud](https://capmonster.cloud/)** - **[deathbycaptcha](https://www.deathbycaptcha.com/)** - **[9kw](https://www.9kw.eu/)** @@ -365,6 +369,7 @@ if proxies are set you can disable sending the proxies to 2captcha by setting `n ```python scraper = cloudscraper.create_scraper( + interpreter='nodejs', captcha={ 'provider': '2captcha', 'api_key': 'your_2captcha_api_key' @@ -392,6 +397,7 @@ if proxies are set you can disable sending the proxies to anticaptcha by setting ```python scraper = cloudscraper.create_scraper( + interpreter='nodejs', captcha={ 'provider': 'anticaptcha', 'api_key': 'your_anticaptcha_api_key' @@ -401,29 +407,6 @@ scraper = cloudscraper.create_scraper( ------ -#### CapSolver - -##### Required `captcha` Parameters - -|Parameter|Value|Required|Default| -|-------------|:-------------:|:-----:|:-----:| -|provider|(string) `captchaai`|yes|| -|api_key|(string)|yes|| - - -##### Example - -```python -scraper = cloudscraper.create_scraper( - captcha={ - 'provider': 'capsolver', - 'api_key': 'your_captchaai_api_key' - } -) -``` - ------- - #### CapMonster Cloud ##### Required `captcha` Parameters @@ -442,6 +425,7 @@ if proxies are set you can disable sending the proxies to CapMonster by setting ```python scraper = cloudscraper.create_scraper( + interpreter='nodejs', captcha={ 'provider': 'capmonster', 'clientKey': 'your_capmonster_clientKey' @@ -465,6 +449,7 @@ scraper = cloudscraper.create_scraper( ```python scraper = cloudscraper.create_scraper( + interpreter='nodejs', captcha={ 'provider': 'deathbycaptcha', 'username': 'your_deathbycaptcha_username', @@ -489,6 +474,7 @@ scraper = cloudscraper.create_scraper( ```python scraper = cloudscraper.create_scraper( + interpreter='nodejs', captcha={ 'provider': '9kw', 'api_key': 'your_9kw_api_key', @@ -512,6 +498,7 @@ Use this if you want the requests response payload without solving the Captcha. ##### Example ```python scraper = cloudscraper.create_scraper( + interpreter='nodejs', captcha={'provider': 'return_response'} ) ``` @@ -637,36 +624,3 @@ print( ) ) ``` - -### Cryptography - -#### Description - -Control communication between client and server - -#### Parameters - -Can be passed as an argument to `create_scraper()`. - -|Parameter|Value|Default| -|-------------|:-------------:|:-----:| -|cipherSuite|(string)|None| -|ecdhCurve|(string)|prime256v1| -|server_hostname|(string)|None| - -#### Example - -```python -# Some servers require the use of a more complex ecdh curve than the default "prime256v1" -# It may can solve handshake failure -scraper = cloudscraper.create_scraper(ecdhCurve='secp384r1') -``` - -```python -# Manipulate server_hostname -scraper = cloudscraper.create_scraper(server_hostname='www.somesite.com') -scraper.get( - 'https://backend.hosting.com/', - headers={'Host': 'www.somesite.com'} -) -``` diff --git a/libs/cloudscraper-1.2.71.dist-info/RECORD b/libs/cloudscraper-1.2.58.dist-info/RECORD similarity index 53% rename from libs/cloudscraper-1.2.71.dist-info/RECORD rename to libs/cloudscraper-1.2.58.dist-info/RECORD index 733225bbd..3dcf225c2 100644 --- a/libs/cloudscraper-1.2.71.dist-info/RECORD +++ b/libs/cloudscraper-1.2.58.dist-info/RECORD @@ -1,19 +1,17 @@ -cloudscraper-1.2.71.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4 -cloudscraper-1.2.71.dist-info/LICENSE,sha256=luC9NJPEX0JAQUKWkzWlAOaaE69fNKnW1uIuDKmWERc,1091 -cloudscraper-1.2.71.dist-info/METADATA,sha256=ywzk5ZCEv-I8Y9gajnVCsiAR3DrdmeiRLam3EGTJ0UA,19942 -cloudscraper-1.2.71.dist-info/RECORD,, -cloudscraper-1.2.71.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -cloudscraper-1.2.71.dist-info/WHEEL,sha256=Xo9-1PvkuimrydujYJAjF7pCkriuXBpUPEjma1nZyJ0,92 -cloudscraper-1.2.71.dist-info/top_level.txt,sha256=OFEsobVl62sa2NzpgNtfHZkIw_qZr_wljhjmlP9oGiM,13 -cloudscraper/__init__.py,sha256=Eg8AqKak2yYcraKqt7O3LJLNmppC2uL7dvAANiyxh5w,15960 -cloudscraper/captcha/2captcha.py,sha256=yyDWvL6HVK4pM69aRpOV9mwzbtPC0yGz_mWkQ7-mkmI,10643 -cloudscraper/captcha/9kw.py,sha256=5EAUyO_vBEuLKsr4sXYa25MSVOm3BXVAdcenF6ZPsgI,7701 +cloudscraper-1.2.58.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4 +cloudscraper-1.2.58.dist-info/LICENSE,sha256=luC9NJPEX0JAQUKWkzWlAOaaE69fNKnW1uIuDKmWERc,1091 +cloudscraper-1.2.58.dist-info/METADATA,sha256=q25vkvMHkAxmuZRwak56i4CLAFUuG5EwEzz1oEXOY3U,19537 +cloudscraper-1.2.58.dist-info/RECORD,, +cloudscraper-1.2.58.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +cloudscraper-1.2.58.dist-info/WHEEL,sha256=Xo9-1PvkuimrydujYJAjF7pCkriuXBpUPEjma1nZyJ0,92 +cloudscraper-1.2.58.dist-info/top_level.txt,sha256=OFEsobVl62sa2NzpgNtfHZkIw_qZr_wljhjmlP9oGiM,13 +cloudscraper/__init__.py,sha256=gsOMaKAKNfJUR4FkiEefAA2fAHVFuSwkblGgqxClsrw,32790 +cloudscraper/captcha/2captcha.py,sha256=CWF62VmLqb_KvSH-dqzo1XEwCBOQh1Aee-G18cX_7aw,10371 +cloudscraper/captcha/9kw.py,sha256=1dfhRHKeCx8yIE1opWyQ1Q7aHJlXDdkv1bV2Bfzbrf8,7387 cloudscraper/captcha/__init__.py,sha256=VORxm32xqLrEE-zxFWgEhSbtqfigjCfwodChg1VlQ6c,1511 -cloudscraper/captcha/anticaptcha.py,sha256=YUsLviq3ZtbjTUnAPq6zVEieHmeSgnmiXKcqXZeO5qA,6152 -cloudscraper/captcha/capmonster.py,sha256=_9AUr6vHG4c5XLc5XqvnnMqgcvuKnzz1ckJpSySjgKQ,6143 -cloudscraper/captcha/capsolver.py,sha256=x38fO0m_k2W8nO3IppXADZsfCYl0iyvRgajZ5s5iTSU,6060 -cloudscraper/captcha/deathbycaptcha.py,sha256=asUX_quUsjAyWVRc7_8o_ryHZFotN-NP60mQiuN-c1U,8673 -cloudscraper/cloudflare.py,sha256=i1jyJcY-aRy3IQ-7YUly8qGUovO4Nx99M_FKfz4eivQ,19993 +cloudscraper/captcha/anticaptcha.py,sha256=cK8LON8M-8MN1wx_rSMTTqxrpwbL65Z2svH-LtGiA40,3478 +cloudscraper/captcha/capmonster.py,sha256=oVXdv2Wrgh2nWFrYttUzbqW9xZU1j6A4cDDcZINIoVg,5695 +cloudscraper/captcha/deathbycaptcha.py,sha256=UJqkh35gcKVdIhwNqF7N_0ixpIPT2PHiMbT378wEM4w,8073 cloudscraper/exceptions.py,sha256=WSMgI8PRvU3g4KDFrjU-42p83lSAVOw8tN2NSqqIUfw,2397 cloudscraper/help.py,sha256=fNYNGFQjiCL1d-gCpDoulBk4iHOuzNhLBudi7NrOHSg,2100 cloudscraper/interpreters/__init__.py,sha256=mWY8LuzDRYWGGnKz5vYSdrOnoVaeWlixmAtZN8Pq6bY,1734 diff --git a/libs/cloudscraper-1.2.71.dist-info/REQUESTED b/libs/cloudscraper-1.2.58.dist-info/REQUESTED similarity index 100% rename from libs/cloudscraper-1.2.71.dist-info/REQUESTED rename to libs/cloudscraper-1.2.58.dist-info/REQUESTED diff --git a/libs/cloudscraper-1.2.71.dist-info/WHEEL b/libs/cloudscraper-1.2.58.dist-info/WHEEL similarity index 100% rename from libs/cloudscraper-1.2.71.dist-info/WHEEL rename to libs/cloudscraper-1.2.58.dist-info/WHEEL diff --git a/libs/cloudscraper-1.2.71.dist-info/top_level.txt b/libs/cloudscraper-1.2.58.dist-info/top_level.txt similarity index 100% rename from libs/cloudscraper-1.2.71.dist-info/top_level.txt rename to libs/cloudscraper-1.2.58.dist-info/top_level.txt diff --git a/libs/cloudscraper/__init__.py b/libs/cloudscraper/__init__.py index 67abd446f..077747034 100644 --- a/libs/cloudscraper/__init__.py +++ b/libs/cloudscraper/__init__.py @@ -1,14 +1,20 @@ # ------------------------------------------------------------------------------- # import logging +import re import requests import sys import ssl +from collections import OrderedDict +from copy import deepcopy + from requests.adapters import HTTPAdapter from requests.sessions import Session from requests_toolbelt.utils import dump +from time import sleep + # ------------------------------------------------------------------------------- # try: @@ -22,23 +28,37 @@ import copy_reg as copyreg try: - from urlparse import urlparse + from HTMLParser import HTMLParser except ImportError: - from urllib.parse import urlparse + if sys.version_info >= (3, 4): + import html + else: + from html.parser import HTMLParser + +try: + from urlparse import urlparse, urljoin +except ImportError: + from urllib.parse import urlparse, urljoin # ------------------------------------------------------------------------------- # from .exceptions import ( CloudflareLoopProtection, - CloudflareIUAMError + CloudflareCode1020, + CloudflareIUAMError, + CloudflareSolveError, + CloudflareChallengeError, + CloudflareCaptchaError, + CloudflareCaptchaProvider ) -from .cloudflare import Cloudflare +from .interpreters import JavaScriptInterpreter +from .captcha import Captcha from .user_agent import User_Agent # ------------------------------------------------------------------------------- # -__version__ = '1.2.71' +__version__ = '1.2.58' # ------------------------------------------------------------------------------- # @@ -59,8 +79,6 @@ def __init__(self, *args, **kwargs): self.ssl_context = kwargs.pop('ssl_context', None) self.cipherSuite = kwargs.pop('cipherSuite', None) self.source_address = kwargs.pop('source_address', None) - self.server_hostname = kwargs.pop('server_hostname', None) - self.ecdhCurve = kwargs.pop('ecdhCurve', 'prime256v1') if self.source_address: if isinstance(self.source_address, str): @@ -73,34 +91,14 @@ def __init__(self, *args, **kwargs): if not self.ssl_context: self.ssl_context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH) - - self.ssl_context.orig_wrap_socket = self.ssl_context.wrap_socket - self.ssl_context.wrap_socket = self.wrap_socket - - if self.server_hostname: - self.ssl_context.server_hostname = self.server_hostname - self.ssl_context.set_ciphers(self.cipherSuite) - self.ssl_context.set_ecdh_curve(self.ecdhCurve) - - self.ssl_context.minimum_version = ssl.TLSVersion.TLSv1_2 - self.ssl_context.maximum_version = ssl.TLSVersion.TLSv1_3 + self.ssl_context.set_ecdh_curve('prime256v1') + self.ssl_context.options |= (ssl.OP_NO_SSLv2 | ssl.OP_NO_SSLv3 | ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1) super(CipherSuiteAdapter, self).__init__(**kwargs) # ------------------------------------------------------------------------------- # - def wrap_socket(self, *args, **kwargs): - if hasattr(self.ssl_context, 'server_hostname') and self.ssl_context.server_hostname: - kwargs['server_hostname'] = self.ssl_context.server_hostname - self.ssl_context.check_hostname = False - else: - self.ssl_context.check_hostname = True - - return self.ssl_context.orig_wrap_socket(*args, **kwargs) - - # ------------------------------------------------------------------------------- # - def init_poolmanager(self, *args, **kwargs): kwargs['ssl_context'] = self.ssl_context kwargs['source_address'] = self.source_address @@ -120,21 +118,15 @@ class CloudScraper(Session): def __init__(self, *args, **kwargs): self.debug = kwargs.pop('debug', False) - - self.disableCloudflareV1 = kwargs.pop('disableCloudflareV1', False) self.delay = kwargs.pop('delay', None) - self.captcha = kwargs.pop('captcha', {}) - self.doubleDown = kwargs.pop('doubleDown', True) + self.cipherSuite = kwargs.pop('cipherSuite', None) + self.ssl_context = kwargs.pop('ssl_context', None) self.interpreter = kwargs.pop('interpreter', 'native') - + self.captcha = kwargs.pop('captcha', {}) self.requestPreHook = kwargs.pop('requestPreHook', None) self.requestPostHook = kwargs.pop('requestPostHook', None) - - self.cipherSuite = kwargs.pop('cipherSuite', None) - self.ecdhCurve = kwargs.pop('ecdhCurve', 'prime256v1') self.source_address = kwargs.pop('source_address', None) - self.server_hostname = kwargs.pop('server_hostname', None) - self.ssl_context = kwargs.pop('ssl_context', None) + self.doubleDown = kwargs.pop('doubleDown', True) self.allow_brotli = kwargs.pop( 'allow_brotli', @@ -167,10 +159,8 @@ def __init__(self, *args, **kwargs): 'https://', CipherSuiteAdapter( cipherSuite=self.cipherSuite, - ecdhCurve=self.ecdhCurve, - server_hostname=self.server_hostname, - source_address=self.source_address, - ssl_context=self.ssl_context + ssl_context=self.ssl_context, + source_address=self.source_address ) ) @@ -211,6 +201,20 @@ def debugRequest(req): except ValueError as e: print(f"Debug Error: {getattr(e, 'message', e)}") + # ------------------------------------------------------------------------------- # + # Unescape / decode html entities + # ------------------------------------------------------------------------------- # + + @staticmethod + def unescape(html_text): + if sys.version_info >= (3, 0): + if sys.version_info >= (3, 4): + return html.unescape(html_text) + + return HTMLParser().unescape(html_text) + + return HTMLParser().unescape(html_text) + # ------------------------------------------------------------------------------- # # Decode Brotli on older versions of urllib3 manually # ------------------------------------------------------------------------------- # @@ -271,43 +275,479 @@ def request(self, method, url, *args, **kwargs): # ------------------------------------------------------------------------------- # if self.requestPostHook: - newResponse = self.requestPostHook(self, response) + response = self.requestPostHook(self, response) + + if self.debug: + self.debugRequest(response) + + # Check if Cloudflare anti-bot is on + if self.is_Challenge_Request(response): + # ------------------------------------------------------------------------------- # + # Try to solve the challenge and send it back + # ------------------------------------------------------------------------------- # + + if self._solveDepthCnt >= self.solveDepth: + _ = self._solveDepthCnt + self.simpleException( + CloudflareLoopProtection, + f"!!Loop Protection!! We have tried to solve {_} time(s) in a row." + ) + + self._solveDepthCnt += 1 + + response = self.Challenge_Response(response, **kwargs) + else: + if not response.is_redirect and response.status_code not in [429, 503]: + self._solveDepthCnt = 0 + + return response + + # ------------------------------------------------------------------------------- # + # check if the response contains a valid Cloudflare Bot Fight Mode challenge + # ------------------------------------------------------------------------------- # + + @staticmethod + def is_BFM_Challenge(resp): + try: + return ( + resp.headers.get('Server', '').startswith('cloudflare') + and re.search( + r"\/cdn-cgi\/bm\/cv\/\d+\/api\.js.*?" + r"window\['__CF\$cv\$params'\]\s*=\s*{", + resp.text, + re.M | re.S + ) + ) + except AttributeError: + pass + + return False + + # ------------------------------------------------------------------------------- # + # check if the response contains a valid Cloudflare challenge + # ------------------------------------------------------------------------------- # + + @staticmethod + def is_IUAM_Challenge(resp): + try: + return ( + resp.headers.get('Server', '').startswith('cloudflare') + and resp.status_code in [429, 503] + and re.search( + r'
1020', + resp.text, + re.M | re.DOTALL + ) + ) + except AttributeError: + pass + + return False + + # ------------------------------------------------------------------------------- # + # Wrapper for is_Captcha_Challenge, is_IUAM_Challenge, is_Firewall_Blocked + # ------------------------------------------------------------------------------- # + + def is_Challenge_Request(self, resp): + if self.is_Firewall_Blocked(resp): + self.simpleException( + CloudflareCode1020, + 'Cloudflare has blocked this request (Code 1020 Detected).' + ) + + if self.is_New_Captcha_Challenge(resp): + self.simpleException( + CloudflareChallengeError, + 'Detected a Cloudflare version 2 Captcha challenge, This feature is not available in the opensource (free) version.' + ) + + if self.is_New_IUAM_Challenge(resp): + self.simpleException( + CloudflareChallengeError, + 'Detected a Cloudflare version 2 challenge, This feature is not available in the opensource (free) version.' + ) + + if self.is_Captcha_Challenge(resp) or self.is_IUAM_Challenge(resp): + if self.debug: + print('Detected a Cloudflare version 1 challenge.') + return True + + return False + + # ------------------------------------------------------------------------------- # + # Try to solve cloudflare javascript challenge. + # ------------------------------------------------------------------------------- # + + def IUAM_Challenge_Response(self, body, url, interpreter): + try: + formPayload = re.search( + r'.*?="challenge-form" ' + r'action="(?P.*?' + r'__cf_chl_jschl_tk__=\S+)"(.*?))', + body, + re.M | re.DOTALL + ).groupdict() + + if not all(key in formPayload for key in ['form', 'challengeUUID']): + self.simpleException( + CloudflareIUAMError, + "Cloudflare IUAM detected, unfortunately we can't extract the parameters correctly." + ) - if response != newResponse: # Give me walrus in 3.7!!! - response = newResponse - if self.debug: - print('==== requestPostHook Debug ====') - self.debugRequest(response) + payload = OrderedDict() + for challengeParam in re.findall(r'^\s*', formPayload['form'], re.M | re.S): + inputPayload = dict(re.findall(r'(\S+)="(\S+)"', challengeParam)) + if inputPayload.get('name') in ['r', 'jschl_vc', 'pass']: + payload.update({inputPayload['name']: inputPayload['value']}) + except AttributeError: + self.simpleException( + CloudflareIUAMError, + "Cloudflare IUAM detected, unfortunately we can't extract the parameters correctly." + ) + + hostParsed = urlparse(url) + + try: + payload['jschl_answer'] = JavaScriptInterpreter.dynamicImport( + interpreter + ).solveChallenge(body, hostParsed.netloc) + except Exception as e: + self.simpleException( + CloudflareIUAMError, + f"Unable to parse Cloudflare anti-bots page: {getattr(e, 'message', e)}" + ) + + return { + 'url': f"{hostParsed.scheme}://{hostParsed.netloc}{self.unescape(formPayload['challengeUUID'])}", + 'data': payload + } + + # ------------------------------------------------------------------------------- # + # Try to solve the Captcha challenge via 3rd party. + # ------------------------------------------------------------------------------- # + + def captcha_Challenge_Response(self, provider, provider_params, body, url): + try: + formPayload = re.search( + r'
.*?="challenge-form" ' + r'action="(?P.*?__cf_chl_captcha_tk__=\S+)"(.*?))', + body, + re.M | re.DOTALL + ).groupdict() + + if not all(key in formPayload for key in ['form', 'challengeUUID']): + self.simpleException( + CloudflareCaptchaError, + "Cloudflare Captcha detected, unfortunately we can't extract the parameters correctly." + ) + + payload = OrderedDict( + re.findall( + r'(name="r"\svalue|data-ray|data-sitekey|name="cf_captcha_kind"\svalue)="(.*?)"', + formPayload['form'] + ) + ) + + captchaType = 'reCaptcha' if payload['name="cf_captcha_kind" value'] == 're' else 'hCaptcha' + + except (AttributeError, KeyError): + self.simpleException( + CloudflareCaptchaError, + "Cloudflare Captcha detected, unfortunately we can't extract the parameters correctly." + ) + + # ------------------------------------------------------------------------------- # + # Pass proxy parameter to provider to solve captcha. + # ------------------------------------------------------------------------------- # + + if self.proxies and self.proxies != self.captcha.get('proxy'): + self.captcha['proxy'] = self.proxies + + # ------------------------------------------------------------------------------- # + # Pass User-Agent if provider supports it to solve captcha. # ------------------------------------------------------------------------------- # - if not self.disableCloudflareV1: - cloudflareV1 = Cloudflare(self) + self.captcha['User-Agent'] = self.headers['User-Agent'] + + # ------------------------------------------------------------------------------- # + # Submit job to provider to request captcha solve. + # ------------------------------------------------------------------------------- # + + captchaResponse = Captcha.dynamicImport( + provider.lower() + ).solveCaptcha( + captchaType, + url, + payload['data-sitekey'], + provider_params + ) + + # ------------------------------------------------------------------------------- # + # Parse and handle the response of solved captcha. + # ------------------------------------------------------------------------------- # + + dataPayload = OrderedDict([ + ('r', payload.get('name="r" value', '')), + ('cf_captcha_kind', payload['name="cf_captcha_kind" value']), + ('id', payload.get('data-ray')), + ('g-recaptcha-response', captchaResponse) + ]) + + if captchaType == 'hCaptcha': + dataPayload.update({'h-captcha-response': captchaResponse}) + + hostParsed = urlparse(url) + + return { + 'url': f"{hostParsed.scheme}://{hostParsed.netloc}{self.unescape(formPayload['challengeUUID'])}", + 'data': dataPayload + } + # ------------------------------------------------------------------------------- # + # Attempt to handle and send the challenge response back to cloudflare + # ------------------------------------------------------------------------------- # + + def Challenge_Response(self, resp, **kwargs): + if self.is_Captcha_Challenge(resp): # ------------------------------------------------------------------------------- # - # Check if Cloudflare v1 anti-bot is on + # double down on the request as some websites are only checking + # if cfuid is populated before issuing Captcha. # ------------------------------------------------------------------------------- # - if cloudflareV1.is_Challenge_Request(response): - # ------------------------------------------------------------------------------- # - # Try to solve the challenge and send it back - # ------------------------------------------------------------------------------- # + if self.doubleDown: + resp = self.decodeBrotli( + self.perform_request(resp.request.method, resp.url, **kwargs) + ) + + if not self.is_Captcha_Challenge(resp): + return resp + + # ------------------------------------------------------------------------------- # + # if no captcha provider raise a runtime error. + # ------------------------------------------------------------------------------- # + + if not self.captcha or not isinstance(self.captcha, dict) or not self.captcha.get('provider'): + self.simpleException( + CloudflareCaptchaProvider, + "Cloudflare Captcha detected, unfortunately you haven't loaded an anti Captcha provider " + "correctly via the 'captcha' parameter." + ) + + # ------------------------------------------------------------------------------- # + # if provider is return_response, return the response without doing anything. + # ------------------------------------------------------------------------------- # - if self._solveDepthCnt >= self.solveDepth: - _ = self._solveDepthCnt + if self.captcha.get('provider') == 'return_response': + return resp + + # ------------------------------------------------------------------------------- # + # Submit request to parser wrapper to solve captcha + # ------------------------------------------------------------------------------- # + + submit_url = self.captcha_Challenge_Response( + self.captcha.get('provider'), + self.captcha, + resp.text, + resp.url + ) + else: + # ------------------------------------------------------------------------------- # + # Cloudflare requires a delay before solving the challenge + # ------------------------------------------------------------------------------- # + + if not self.delay: + try: + delay = float( + re.search( + r'submit\(\);\r?\n\s*},\s*([0-9]+)', + resp.text + ).group(1) + ) / float(1000) + if isinstance(delay, (int, float)): + self.delay = delay + except (AttributeError, ValueError): self.simpleException( - CloudflareLoopProtection, - f"!!Loop Protection!! We have tried to solve {_} time(s) in a row." + CloudflareIUAMError, + "Cloudflare IUAM possibility malformed, issue extracing delay value." ) - self._solveDepthCnt += 1 + sleep(self.delay) + + # ------------------------------------------------------------------------------- # + + submit_url = self.IUAM_Challenge_Response( + resp.text, + resp.url, + self.interpreter + ) + + # ------------------------------------------------------------------------------- # + # Send the Challenge Response back to Cloudflare + # ------------------------------------------------------------------------------- # + + if submit_url: + + def updateAttr(obj, name, newValue): + try: + obj[name].update(newValue) + return obj[name] + except (AttributeError, KeyError): + obj[name] = {} + obj[name].update(newValue) + return obj[name] + + cloudflare_kwargs = deepcopy(kwargs) + cloudflare_kwargs['allow_redirects'] = False + cloudflare_kwargs['data'] = updateAttr( + cloudflare_kwargs, + 'data', + submit_url['data'] + ) + + urlParsed = urlparse(resp.url) + cloudflare_kwargs['headers'] = updateAttr( + cloudflare_kwargs, + 'headers', + { + 'Origin': f'{urlParsed.scheme}://{urlParsed.netloc}', + 'Referer': resp.url + } + ) + + challengeSubmitResponse = self.request( + 'POST', + submit_url['url'], + **cloudflare_kwargs + ) + + if challengeSubmitResponse.status_code == 400: + self.simpleException( + CloudflareSolveError, + 'Invalid challenge answer detected, Cloudflare broken?' + ) + + # ------------------------------------------------------------------------------- # + # Return response if Cloudflare is doing content pass through instead of 3xx + # else request with redirect URL also handle protocol scheme change http -> https + # ------------------------------------------------------------------------------- # + + if not challengeSubmitResponse.is_redirect: + return challengeSubmitResponse - response = cloudflareV1.Challenge_Response(response, **kwargs) else: - if not response.is_redirect and response.status_code not in [429, 503]: - self._solveDepthCnt = 0 + cloudflare_kwargs = deepcopy(kwargs) + cloudflare_kwargs['headers'] = updateAttr( + cloudflare_kwargs, + 'headers', + {'Referer': challengeSubmitResponse.url} + ) - return response + if not urlparse(challengeSubmitResponse.headers['Location']).netloc: + redirect_location = urljoin( + challengeSubmitResponse.url, + challengeSubmitResponse.headers['Location'] + ) + else: + redirect_location = challengeSubmitResponse.headers['Location'] + + return self.request( + resp.request.method, + redirect_location, + **cloudflare_kwargs + ) + + # ------------------------------------------------------------------------------- # + # We shouldn't be here... + # Re-request the original query and/or process again.... + # ------------------------------------------------------------------------------- # + + return self.request(resp.request.method, resp.url, **kwargs) # ------------------------------------------------------------------------------- # @@ -321,7 +761,7 @@ def create_scraper(cls, sess=None, **kwargs): if sess: for attr in ['auth', 'cert', 'cookies', 'headers', 'hooks', 'params', 'proxies', 'data']: val = getattr(sess, attr, None) - if val is not None: + if val: setattr(scraper, attr, val) return scraper @@ -342,7 +782,7 @@ def get_tokens(cls, url, **kwargs): 'doubleDown', 'captcha', 'interpreter', - 'source_address', + 'source_address' 'requestPreHook', 'requestPostHook' ] if field in kwargs @@ -366,7 +806,6 @@ def get_tokens(cls, url, **kwargs): break else: cls.simpleException( - cls, CloudflareIUAMError, "Unable to find Cloudflare cookies. Does the site actually " "have Cloudflare IUAM (I'm Under Attack Mode) enabled?" @@ -374,6 +813,7 @@ def get_tokens(cls, url, **kwargs): return ( { + '__cfduid': scraper.cookies.get('__cfduid', '', domain=cookie_domain), 'cf_clearance': scraper.cookies.get('cf_clearance', '', domain=cookie_domain) }, scraper.headers['User-Agent'] @@ -402,6 +842,5 @@ def get_cookie_string(cls, url, **kwargs): # ------------------------------------------------------------------------------- # create_scraper = CloudScraper.create_scraper -session = CloudScraper.create_scraper get_tokens = CloudScraper.get_tokens get_cookie_string = CloudScraper.get_cookie_string diff --git a/libs/cloudscraper/captcha/2captcha.py b/libs/cloudscraper/captcha/2captcha.py index 1052e0292..7fae7f306 100644 --- a/libs/cloudscraper/captcha/2captcha.py +++ b/libs/cloudscraper/captcha/2captcha.py @@ -29,11 +29,6 @@ def __init__(self): super(captchaSolver, self).__init__('2captcha') self.host = 'https://2captcha.com' self.session = requests.Session() - self.captchaType = { - 'reCaptcha': 'userrecaptcha', - 'hCaptcha': 'hcaptcha', - 'turnstile': 'turnstile' - } # ------------------------------------------------------------------------------- # @@ -180,16 +175,23 @@ def _checkRequest(response): 'soft_id': 2905 } - data.update({ - 'method': self.captchaType[captchaType], - 'googlekey' if captchaType == 'reCaptcha' else 'sitekey': siteKey - }) + data.update( + { + 'method': 'userrcaptcha', + 'googlekey': siteKey + } if captchaType == 'reCaptcha' else { + 'method': 'hcaptcha', + 'sitekey': siteKey + } + ) if self.proxy: - data.update({ - 'proxy': self.proxy, - 'proxytype': self.proxyType - }) + data.update( + { + 'proxy': self.proxy, + 'proxytype': self.proxyType + } + ) response = polling2.poll( lambda: self.session.post( diff --git a/libs/cloudscraper/captcha/9kw.py b/libs/cloudscraper/captcha/9kw.py index df3589d72..143def818 100644 --- a/libs/cloudscraper/captcha/9kw.py +++ b/libs/cloudscraper/captcha/9kw.py @@ -12,35 +12,30 @@ ) from ..exceptions import ( - CaptchaException, - CaptchaServiceUnavailable, - CaptchaAPIError, - CaptchaTimeout, - CaptchaParameter, - CaptchaBadJobID + reCaptchaServiceUnavailable, + reCaptchaAPIError, + reCaptchaTimeout, + reCaptchaParameter, + reCaptchaBadJobID ) -from . import Captcha +from . import reCaptcha -class captchaSolver(Captcha): +class captchaSolver(reCaptcha): def __init__(self): super(captchaSolver, self).__init__('9kw') self.host = 'https://www.9kw.eu/index.cgi' self.maxtimeout = 180 self.session = requests.Session() - self.captchaType = { - 'reCaptcha': 'recaptchav2', - 'hCaptcha': 'hcaptcha' - } # ------------------------------------------------------------------------------- # @staticmethod def checkErrorStatus(response): if response.status_code in [500, 502]: - raise CaptchaServiceUnavailable( + raise reCaptchaServiceUnavailable( f'9kw: Server Side Error {response.status_code}' ) @@ -103,18 +98,18 @@ def checkErrorStatus(response): if response.text.startswith('{'): if response.json().get('error'): - raise CaptchaAPIError(error_codes.get(int(response.json().get('error')))) + raise reCaptchaAPIError(error_codes.get(int(response.json().get('error')))) else: error_code = int(re.search(r'^00(?P\d+)', response.text).groupdict().get('error_code', 0)) if error_code: - raise CaptchaAPIError(error_codes.get(error_code)) + raise reCaptchaAPIError(error_codes.get(error_code)) # ------------------------------------------------------------------------------- # def requestJob(self, jobID): if not jobID: - raise CaptchaBadJobID( - "9kw: Error bad job id to request against." + raise reCaptchaBadJobID( + "9kw: Error bad job id to request reCaptcha against." ) def _checkRequest(response): @@ -144,7 +139,7 @@ def _checkRequest(response): if response: return response.json().get('answer') else: - raise CaptchaTimeout("9kw: Error failed to solve.") + raise reCaptchaTimeout("9kw: Error failed to solve reCaptcha.") # ------------------------------------------------------------------------------- # @@ -157,6 +152,11 @@ def _checkRequest(response): return None + captchaMap = { + 'reCaptcha': 'recaptchav2', + 'hCaptcha': 'hcaptcha' + } + response = polling.poll( lambda: self.session.post( self.host, @@ -165,7 +165,7 @@ def _checkRequest(response): 'action': 'usercaptchaupload', 'interactive': 1, 'file-upload-01': siteKey, - 'oldsource': self.captchaType[captchaType], + 'oldsource': captchaMap[captchaType], 'pageurl': url, 'maxtimeout': self.maxtimeout, 'json': 1 @@ -180,35 +180,33 @@ def _checkRequest(response): if response: return response.json().get('captchaid') else: - raise CaptchaBadJobID('9kw: Error no valid job id was returned.') + raise reCaptchaBadJobID('9kw: Error no valid job id was returned.') # ------------------------------------------------------------------------------- # - def getCaptchaAnswer(self, captchaType, url, siteKey, captchaParams): - jobID = None - if not captchaParams.get('api_key'): - raise CaptchaParameter("9kw: Missing api_key parameter.") + def getCaptchaAnswer(self, captchaType, url, siteKey, reCaptchaParams): + jobID = None - self.api_key = captchaParams.get('api_key') + if not reCaptchaParams.get('api_key'): + raise reCaptchaParameter("9kw: Missing api_key parameter.") - if captchaParams.get('maxtimeout'): - self.maxtimeout = captchaParams.get('maxtimeout') + self.api_key = reCaptchaParams.get('api_key') - if captchaParams.get('proxy'): - self.session.proxies = captchaParams.get('proxies') + if reCaptchaParams.get('maxtimeout'): + self.maxtimeout = reCaptchaParams.get('maxtimeout') - if captchaType not in self.captchaType: - raise CaptchaException(f'9kw: {captchaType} is not supported by this provider.') + if reCaptchaParams.get('proxy'): + self.session.proxies = reCaptchaParams.get('proxies') try: jobID = self.requestSolve(captchaType, url, siteKey) return self.requestJob(jobID) except polling.TimeoutException: - raise CaptchaTimeout( - f"9kw: solve took to long to execute 'captchaid' {jobID}, aborting." + raise reCaptchaTimeout( + f"9kw: reCaptcha solve took to long to execute 'captchaid' {jobID}, aborting." ) - # ------------------------------------------------------------------------------- # + captchaSolver() diff --git a/libs/cloudscraper/captcha/anticaptcha.py b/libs/cloudscraper/captcha/anticaptcha.py index bfefac201..7550275cc 100644 --- a/libs/cloudscraper/captcha/anticaptcha.py +++ b/libs/cloudscraper/captcha/anticaptcha.py @@ -1,24 +1,31 @@ from __future__ import absolute_import - -import requests +from ..exceptions import ( + CaptchaParameter, + CaptchaTimeout, + CaptchaAPIError +) try: from urlparse import urlparse except ImportError: from urllib.parse import urlparse -from ..exceptions import ( - CaptchaServiceUnavailable, - CaptchaAPIError, - CaptchaTimeout, - CaptchaParameter, - CaptchaBadJobID -) - try: - import polling2 + from python_anticaptcha import ( + AnticaptchaClient, + NoCaptchaTaskProxylessTask, + HCaptchaTaskProxyless, + NoCaptchaTask, + HCaptchaTask, + AnticaptchaException + ) except ImportError: - raise ImportError("Please install the python module 'polling2' via pip") + raise ImportError( + "Please install/upgrade the python module 'python_anticaptcha' via " + "pip install python-anticaptcha or https://github.com/ad-m/python-anticaptcha/" + ) + +import sys from . import Captcha @@ -26,172 +33,75 @@ class captchaSolver(Captcha): def __init__(self): - super(captchaSolver, self).__init__('anticaptcha') - self.host = 'https://api.anti-captcha.com' - self.session = requests.Session() - self.captchaType = { - 'reCaptcha': 'NoCaptchaTask', - 'hCaptcha': 'HCaptchaTask', - 'turnstile': 'TurnstileTask' - } - - # ------------------------------------------------------------------------------- # - - @staticmethod - def checkErrorStatus(response): - if response.status_code in [500, 502]: - raise CaptchaServiceUnavailable( - f'anticaptcha: Server Side Error {response.status_code}' + if sys.modules['python_anticaptcha'].__version__ < '0.6': + raise ImportError( + "Please upgrade the python module 'python_anticaptcha' via " + "pip install -U python-anticaptcha or https://github.com/ad-m/python-anticaptcha/" ) - - payload = response.json() - if payload['errorId'] >= 1: - if 'errorDescription' in payload: - raise CaptchaAPIError( - payload['errorDescription'] - ) - else: - raise CaptchaAPIError(payload['errorCode']) + super(captchaSolver, self).__init__('anticaptcha') # ------------------------------------------------------------------------------- # - def requestJob(self, taskID): - if not taskID: - raise CaptchaBadJobID( - 'anticaptcha: Error bad task id to request Captcha.' - ) + def parseProxy(self, url, user_agent): + parsed = urlparse(url) - def _checkRequest(response): - self.checkErrorStatus(response) - - if response.ok and response.json()['status'] == 'ready': - return True - - return None - - response = polling2.poll( - lambda: self.session.post( - f'{self.host}/getTaskResult', - json={ - 'clientKey': self.clientKey, - 'taskId': taskID - }, - timeout=30 - ), - check_success=_checkRequest, - step=5, - timeout=180 + return dict( + proxy_type=parsed.scheme, + proxy_address=parsed.hostname, + proxy_port=parsed.port, + proxy_login=parsed.username, + proxy_password=parsed.password, + user_agent=user_agent ) - if response: - payload = response.json()['solution'] - if 'token' in payload: - return payload['token'] - else: - return payload['gRecaptchaResponse'] - else: - raise CaptchaTimeout( - "anticaptcha: Error failed to solve Captcha." - ) - # ------------------------------------------------------------------------------- # - def requestSolve(self, captchaType, url, siteKey): - def _checkRequest(response): - self.checkErrorStatus(response) - - if response.ok and response.json()['taskId']: - return True - - return None + def getCaptchaAnswer(self, captchaType, url, siteKey, captchaParams): + if not captchaParams.get('api_key'): + raise CaptchaParameter("anticaptcha: Missing api_key parameter.") - data = { - 'clientKey': self.clientKey, - 'task': { - 'websiteURL': url, - 'websiteKey': siteKey, - 'type': self.captchaType[captchaType] - }, - 'softId': 959 - } + client = AnticaptchaClient(captchaParams.get('api_key')) - if self.proxy: - data['task'].update(self.proxy) - else: - data['task']['type'] = f"{data['task']['type']}Proxyless" - - response = polling2.poll( - lambda: self.session.post( - f'{self.host}/createTask', - json=data, - allow_redirects=False, - timeout=30 - ), - check_success=_checkRequest, - step=5, - timeout=180 - ) + if captchaParams.get('proxy') and not captchaParams.get('no_proxy'): + captchaMap = { + 'reCaptcha': NoCaptchaTask, + 'hCaptcha': HCaptchaTask + } - if response: - return response.json()['taskId'] - else: - raise CaptchaBadJobID( - 'anticaptcha: Error no task id was returned.' + proxy = self.parseProxy( + captchaParams.get('proxy', {}).get('https'), + captchaParams.get('User-Agent', '') ) - # ------------------------------------------------------------------------------- # - - def getCaptchaAnswer(self, captchaType, url, siteKey, captchaParams): - taskID = None - - if not captchaParams.get('clientKey'): - raise CaptchaParameter( - "anticaptcha: Missing clientKey parameter." + task = captchaMap[captchaType]( + url, + siteKey, + **proxy ) + else: + captchaMap = { + 'reCaptcha': NoCaptchaTaskProxylessTask, + 'hCaptcha': HCaptchaTaskProxyless + } + task = captchaMap[captchaType](url, siteKey) - self.clientKey = captchaParams.get('clientKey') - - if captchaParams.get('proxy') and not captchaParams.get('no_proxy'): - hostParsed = urlparse(captchaParams.get('proxy', {}).get('https')) - - if not hostParsed.scheme: - raise CaptchaParameter('Cannot parse proxy correctly, bad scheme') + if not hasattr(client, 'createTaskSmee'): + raise NotImplementedError( + "Please upgrade 'python_anticaptcha' via pip or download it from " + "https://github.com/ad-m/python-anticaptcha/tree/hcaptcha" + ) - if not hostParsed.netloc: - raise CaptchaParameter('Cannot parse proxy correctly, bad netloc') + job = client.createTaskSmee(task, timeout=180) - ports = { - 'http': 80, - 'https': 443 - } + try: + job.join(maximum_time=180) + except (AnticaptchaException) as e: + raise CaptchaTimeout(f"{getattr(e, 'message', e)}") - self.proxy = { - 'proxyType': hostParsed.scheme, - 'proxyAddress': hostParsed.hostname, - 'proxyPort': hostParsed.port if hostParsed.port else ports[self.proxy['proxyType']], - 'proxyLogin': hostParsed.username, - 'proxyPassword': hostParsed.password, - } + if 'solution' in job._last_result: + return job.get_solution_response() else: - self.proxy = None - - try: - taskID = self.requestSolve(captchaType, url, siteKey) - return self.requestJob(taskID) - except polling2.TimeoutException: - try: - if taskID: - self.reportJob(taskID) - except polling2.TimeoutException: - raise CaptchaTimeout( - "anticaptcha: Captcha solve took to long and also failed " - f"reporting the task with task id {taskID}." - ) - - raise CaptchaTimeout( - "anticaptcha: Captcha solve took to long to execute " - f"task id {taskID}, aborting." - ) + raise CaptchaAPIError('Job did not return `solution` key in payload.') # ------------------------------------------------------------------------------- # diff --git a/libs/cloudscraper/captcha/capmonster.py b/libs/cloudscraper/captcha/capmonster.py index 5846b2df9..9e636106d 100644 --- a/libs/cloudscraper/captcha/capmonster.py +++ b/libs/cloudscraper/captcha/capmonster.py @@ -29,11 +29,6 @@ def __init__(self): super(captchaSolver, self).__init__('capmonster') self.host = 'https://api.capmonster.cloud' self.session = requests.Session() - self.captchaType = { - 'reCaptcha': 'NoCaptchaTask', - 'hCaptcha': 'HCaptchaTask', - 'turnstile': 'TurnstileTask' - } # ------------------------------------------------------------------------------- # @@ -84,11 +79,7 @@ def _checkRequest(response): ) if response: - payload = response.json()['solution'] - if 'token' in payload: - return payload['token'] - else: - return payload['gRecaptchaResponse'] + return response.json()['solution']['gRecaptchaResponse'] else: raise CaptchaTimeout( "CapMonster: Error failed to solve Captcha." @@ -110,9 +101,9 @@ def _checkRequest(response): 'task': { 'websiteURL': url, 'websiteKey': siteKey, - 'type': self.captchaType[captchaType] - }, - 'softId': 37 + 'softId': 37, + 'type': 'NoCaptchaTask' if captchaType == 'reCaptcha' else 'HCaptchaTask' + } } if self.proxy: diff --git a/libs/cloudscraper/captcha/capsolver.py b/libs/cloudscraper/captcha/capsolver.py deleted file mode 100644 index 79b70f617..000000000 --- a/libs/cloudscraper/captcha/capsolver.py +++ /dev/null @@ -1,188 +0,0 @@ -from __future__ import absolute_import - -import requests - -try: - from urlparse import urlparse -except ImportError: - from urllib.parse import urlparse - -from ..exceptions import ( - CaptchaServiceUnavailable, - CaptchaAPIError, - CaptchaTimeout, - CaptchaParameter, - CaptchaBadJobID -) - -try: - import polling2 -except ImportError: - raise ImportError("Please install the python module 'polling2' via pip") - -from . import Captcha - - -class captchaSolver(Captcha): - def __init__(self): - super(captchaSolver, self).__init__('capsolver') - self.host = 'https://api.capsolver.com' - self.session = requests.Session() - self.captchaType = { - 'reCaptcha': 'ReCaptchaV2Task', - 'hCaptcha': 'HCaptchaTask', - 'turnstile': 'AntiCloudflareTask' - } - - # ------------------------------------------------------------------------------- # - - @staticmethod - def checkErrorStatus(response, fnct): - if response.status_code in [500, 502]: - raise CaptchaServiceUnavailable(f'CapSolver: Server Side Error {response.status_code}') - - try: - rPayload = response.json() - except Exception: - return - - if rPayload.get('errorDescription', False) and 'Current system busy' not in rPayload['errorDescription']: - raise CaptchaAPIError( - f"CapSolver -> {fnct} -> {rPayload.get('errorDescription')}" - ) - - # ------------------------------------------------------------------------------- # - - def requestJob(self, jobID): - if not jobID: - raise CaptchaBadJobID("CapSolver: Error bad job id to request task result.") - - def _checkRequest(response): - self.checkErrorStatus(response, 'requestJob') - try: - if response.ok and response.json()['status'] == 'ready': - return True - except Exception: - pass - return None - - response = polling2.poll( - lambda: self.session.post( - f'{self.host}/getTaskResult', - json={ - 'clientKey': self.api_key, - 'taskId': jobID - }, - timeout=30 - ), - check_success=_checkRequest, - step=5, - timeout=180 - ) - - if response: - try: - rPayload = response.json()['solution'] - if 'token' in rPayload: - return rPayload['token'] - else: - return rPayload['gRecaptchaResponse'] - except Exception: - pass - - raise CaptchaTimeout( - "CapSolver: Error failed to solve Captcha." - ) - - # ------------------------------------------------------------------------------- # - - def requestSolve(self, captchaType, url, siteKey): - - # ------------------------------------------------------------------------------- # - - def _checkRequest(response): - self.checkErrorStatus(response, 'createTask') - try: - rPayload = response.json() - if response.ok: - if rPayload.get("taskId", False): - return True - except Exception: - pass - return None - - # ------------------------------------------------------------------------------- # - - payload = { - 'clientKey': self.api_key, - 'appId': '9E717405-8C70-49B3-B277-7C2F2196484B', - 'task': { - 'type': self.captchaType[captchaType], - 'websiteURL': url, - 'websiteKey': siteKey - } - } - - if captchaType == 'turnstile': - payload['task']['metadata'] = {'type': 'turnstile'} - - if self.proxy: - payload['task']['proxy'] = self.proxy - else: - payload['task']['type'] = f"{self.captchaType[captchaType]}Proxyless" - - response = polling2.poll( - lambda: self.session.post( - f'{self.host}/createTask', - json=payload, - allow_redirects=False, - timeout=30 - ), - check_success=_checkRequest, - step=5, - timeout=180 - ) - - if response: - rPayload = response.json() - if rPayload.get('taskId'): - return rPayload['taskId'] - - raise CaptchaBadJobID( - 'CapSolver: Error no job id was returned.' - ) - - # ------------------------------------------------------------------------------- # - - def getCaptchaAnswer(self, captchaType, url, siteKey, captchaParams): - if not captchaParams.get('api_key'): - raise CaptchaParameter("CapSolver: Missing api_key parameter.") - self.api_key = captchaParams.get('api_key') - - if captchaParams.get('proxy') and not captchaParams.get('no_proxy'): - hostParsed = urlparse(captchaParams.get('proxy', {}).get('https')) - - if not hostParsed.scheme: - raise CaptchaParameter('Cannot parse proxy correctly, bad scheme') - - if not hostParsed.netloc: - raise CaptchaParameter('Cannot parse proxy correctly, bad netloc') - - self.proxy = captchaParams['proxy']['https'] - else: - self.proxy = None - - try: - jobID = self.requestSolve(captchaType, url, siteKey) - return self.requestJob(jobID) - except polling2.TimeoutException: - raise CaptchaTimeout( - f"CapSolver: Captcha solve (task ID: {jobID}) took to long." - ) - - raise CaptchaAPIError('CapSolver: Job Failure.') - - -# ------------------------------------------------------------------------------- # - -captchaSolver() diff --git a/libs/cloudscraper/captcha/deathbycaptcha.py b/libs/cloudscraper/captcha/deathbycaptcha.py index 724b04df0..33c5ef2d2 100644 --- a/libs/cloudscraper/captcha/deathbycaptcha.py +++ b/libs/cloudscraper/captcha/deathbycaptcha.py @@ -13,7 +13,6 @@ raise ImportError("Please install the python module 'polling2' via pip") from ..exceptions import ( - CaptchaException, CaptchaServiceUnavailable, CaptchaTimeout, CaptchaParameter, @@ -30,10 +29,6 @@ def __init__(self): super(captchaSolver, self).__init__('deathbycaptcha') self.host = 'http://api.dbcapi.me/api' self.session = requests.Session() - self.captchaType = { - 'reCaptcha': '4', - 'hCaptcha': '7' - } # ------------------------------------------------------------------------------- # @@ -186,7 +181,7 @@ def _checkRequest(response): }) data.update({ - 'type': self.captchaType[captchaType], + 'type': '4', 'token_params': json.dumps(jPayload) }) else: @@ -202,7 +197,7 @@ def _checkRequest(response): }) data.update({ - 'type': self.captchaType[captchaType], + 'type': '7', 'hcaptcha_params': json.dumps(jPayload) }) @@ -251,9 +246,6 @@ def getCaptchaAnswer(self, captchaType, url, siteKey, captchaParams): else: self.proxy = None - if captchaType not in self.captchaType: - raise CaptchaException(f'DeathByCaptcha: {captchaType} is not supported by this provider.') - try: jobID = self.requestSolve(captchaType, url, siteKey) return self.requestJob(jobID) @@ -270,7 +262,7 @@ def getCaptchaAnswer(self, captchaType, url, siteKey, captchaParams): f"DeathByCaptcha: Captcha solve took to long to execute job id {jobID}, aborting." ) -# ------------------------------------------------------------------------------- # +# ------------------------------------------------------------------------------- # captchaSolver() diff --git a/libs/cloudscraper/cloudflare.py b/libs/cloudscraper/cloudflare.py deleted file mode 100644 index 0172d7a0f..000000000 --- a/libs/cloudscraper/cloudflare.py +++ /dev/null @@ -1,490 +0,0 @@ -# Cloudflare V1 - -import re -import sys -import time - -from copy import deepcopy -from collections import OrderedDict - -# ------------------------------------------------------------------------------- # - -try: - from HTMLParser import HTMLParser -except ImportError: - if sys.version_info >= (3, 4): - import html - else: - from html.parser import HTMLParser - -try: - from urlparse import urlparse, urljoin -except ImportError: - from urllib.parse import urlparse, urljoin - -# ------------------------------------------------------------------------------- # - -from .exceptions import ( - CloudflareCode1020, - CloudflareIUAMError, - CloudflareSolveError, - CloudflareChallengeError, - CloudflareCaptchaError, - CloudflareCaptchaProvider -) - -# ------------------------------------------------------------------------------- # - -from .captcha import Captcha -from .interpreters import JavaScriptInterpreter - -# ------------------------------------------------------------------------------- # - - -class Cloudflare(): - - def __init__(self, cloudscraper): - self.cloudscraper = cloudscraper - - # ------------------------------------------------------------------------------- # - # Unescape / decode html entities - # ------------------------------------------------------------------------------- # - - @staticmethod - def unescape(html_text): - if sys.version_info >= (3, 0): - if sys.version_info >= (3, 4): - return html.unescape(html_text) - - return HTMLParser().unescape(html_text) - - return HTMLParser().unescape(html_text) - - # ------------------------------------------------------------------------------- # - # check if the response contains a valid Cloudflare challenge - # ------------------------------------------------------------------------------- # - - @staticmethod - def is_IUAM_Challenge(resp): - try: - return ( - resp.headers.get('Server', '').startswith('cloudflare') - and resp.status_code in [429, 503] - and re.search(r'/cdn-cgi/images/trace/jsch/', resp.text, re.M | re.S) - and re.search( - r'''
1020', - resp.text, - re.M | re.DOTALL - ) - ) - except AttributeError: - pass - - return False - - # ------------------------------------------------------------------------------- # - # Wrapper for is_Captcha_Challenge, is_IUAM_Challenge, is_Firewall_Blocked - # ------------------------------------------------------------------------------- # - - def is_Challenge_Request(self, resp): - if self.is_Firewall_Blocked(resp): - self.cloudscraper.simpleException( - CloudflareCode1020, - 'Cloudflare has blocked this request (Code 1020 Detected).' - ) - - if self.is_New_Captcha_Challenge(resp): - self.cloudscraper.simpleException( - CloudflareChallengeError, - 'Detected a Cloudflare version 2 Captcha challenge, This feature is not available in the opensource (free) version.' - ) - - if self.is_New_IUAM_Challenge(resp): - self.cloudscraper.simpleException( - CloudflareChallengeError, - 'Detected a Cloudflare version 2 challenge, This feature is not available in the opensource (free) version.' - ) - - if self.is_Captcha_Challenge(resp) or self.is_IUAM_Challenge(resp): - if self.cloudscraper.debug: - print('Detected a Cloudflare version 1 challenge.') - return True - - return False - - # ------------------------------------------------------------------------------- # - # Try to solve cloudflare javascript challenge. - # ------------------------------------------------------------------------------- # - - def IUAM_Challenge_Response(self, body, url, interpreter): - try: - formPayload = re.search( - r'.*?="challenge-form" ' - r'action="(?P.*?' - r'__cf_chl_f_tk=\S+)"(.*?))', - body, - re.M | re.DOTALL - ).groupdict() - - if not all(key in formPayload for key in ['form', 'challengeUUID']): - self.cloudscraper.simpleException( - CloudflareIUAMError, - "Cloudflare IUAM detected, unfortunately we can't extract the parameters correctly." - ) - - payload = OrderedDict() - for challengeParam in re.findall(r'^\s*', formPayload['form'], re.M | re.S): - inputPayload = dict(re.findall(r'(\S+)="(\S+)"', challengeParam)) - if inputPayload.get('name') in ['r', 'jschl_vc', 'pass']: - payload.update({inputPayload['name']: inputPayload['value']}) - - except AttributeError: - self.cloudscraper.simpleException( - CloudflareIUAMError, - "Cloudflare IUAM detected, unfortunately we can't extract the parameters correctly." - ) - - hostParsed = urlparse(url) - - try: - payload['jschl_answer'] = JavaScriptInterpreter.dynamicImport( - interpreter - ).solveChallenge(body, hostParsed.netloc) - except Exception as e: - self.cloudscraper.simpleException( - CloudflareIUAMError, - f"Unable to parse Cloudflare anti-bots page: {getattr(e, 'message', e)}" - ) - - return { - 'url': f"{hostParsed.scheme}://{hostParsed.netloc}{self.unescape(formPayload['challengeUUID'])}", - 'data': payload - } - - # ------------------------------------------------------------------------------- # - # Try to solve the Captcha challenge via 3rd party. - # ------------------------------------------------------------------------------- # - - def captcha_Challenge_Response(self, provider, provider_params, body, url): - try: - formPayload = re.search( - r'
.*?="challenge-form" ' - r'action="(?P.*?__cf_chl_captcha_tk__=\S+)"(.*?))', - body, - re.M | re.DOTALL - ).groupdict() - - if not all(key in formPayload for key in ['form', 'challengeUUID']): - self.cloudscraper.simpleException( - CloudflareCaptchaError, - "Cloudflare Captcha detected, unfortunately we can't extract the parameters correctly." - ) - - payload = OrderedDict( - re.findall( - r'(name="r"\svalue|data-ray|data-sitekey|name="cf_captcha_kind"\svalue)="(.*?)"', - formPayload['form'] - ) - ) - - captchaType = 'reCaptcha' if payload['name="cf_captcha_kind" value'] == 're' else 'hCaptcha' - - except (AttributeError, KeyError): - self.cloudscraper.simpleException( - CloudflareCaptchaError, - "Cloudflare Captcha detected, unfortunately we can't extract the parameters correctly." - ) - - # ------------------------------------------------------------------------------- # - # Pass proxy parameter to provider to solve captcha. - # ------------------------------------------------------------------------------- # - - if self.cloudscraper.proxies and self.cloudscraper.proxies != self.cloudscraper.captcha.get('proxy'): - self.cloudscraper.captcha['proxy'] = self.proxies - - # ------------------------------------------------------------------------------- # - # Pass User-Agent if provider supports it to solve captcha. - # ------------------------------------------------------------------------------- # - - self.cloudscraper.captcha['User-Agent'] = self.cloudscraper.headers['User-Agent'] - - # ------------------------------------------------------------------------------- # - # Submit job to provider to request captcha solve. - # ------------------------------------------------------------------------------- # - - captchaResponse = Captcha.dynamicImport( - provider.lower() - ).solveCaptcha( - captchaType, - url, - payload['data-sitekey'], - provider_params - ) - - # ------------------------------------------------------------------------------- # - # Parse and handle the response of solved captcha. - # ------------------------------------------------------------------------------- # - - dataPayload = OrderedDict([ - ('r', payload.get('name="r" value', '')), - ('cf_captcha_kind', payload['name="cf_captcha_kind" value']), - ('id', payload.get('data-ray')), - ('g-recaptcha-response', captchaResponse) - ]) - - if captchaType == 'hCaptcha': - dataPayload.update({'h-captcha-response': captchaResponse}) - - hostParsed = urlparse(url) - - return { - 'url': f"{hostParsed.scheme}://{hostParsed.netloc}{self.unescape(formPayload['challengeUUID'])}", - 'data': dataPayload - } - - # ------------------------------------------------------------------------------- # - # Attempt to handle and send the challenge response back to cloudflare - # ------------------------------------------------------------------------------- # - - def Challenge_Response(self, resp, **kwargs): - if self.is_Captcha_Challenge(resp): - # ------------------------------------------------------------------------------- # - # double down on the request as some websites are only checking - # if cfuid is populated before issuing Captcha. - # ------------------------------------------------------------------------------- # - - if self.cloudscraper.doubleDown: - resp = self.cloudscraper.decodeBrotli( - self.cloudscraper.perform_request(resp.request.method, resp.url, **kwargs) - ) - - if not self.is_Captcha_Challenge(resp): - return resp - - # ------------------------------------------------------------------------------- # - # if no captcha provider raise a runtime error. - # ------------------------------------------------------------------------------- # - - if ( - not self.cloudscraper.captcha - or not isinstance(self.cloudscraper.captcha, dict) - or not self.cloudscraper.captcha.get('provider') - ): - self.cloudscraper.simpleException( - CloudflareCaptchaProvider, - "Cloudflare Captcha detected, unfortunately you haven't loaded an anti Captcha provider " - "correctly via the 'captcha' parameter." - ) - - # ------------------------------------------------------------------------------- # - # if provider is return_response, return the response without doing anything. - # ------------------------------------------------------------------------------- # - - if self.cloudscraper.captcha.get('provider') == 'return_response': - return resp - - # ------------------------------------------------------------------------------- # - # Submit request to parser wrapper to solve captcha - # ------------------------------------------------------------------------------- # - - submit_url = self.captcha_Challenge_Response( - self.cloudscraper.captcha.get('provider'), - self.cloudscraper.captcha, - resp.text, - resp.url - ) - else: - # ------------------------------------------------------------------------------- # - # Cloudflare requires a delay before solving the challenge - # ------------------------------------------------------------------------------- # - - if not self.cloudscraper.delay: - try: - delay = float( - re.search( - r'submit\(\);\r?\n\s*},\s*([0-9]+)', - resp.text - ).group(1) - ) / float(1000) - if isinstance(delay, (int, float)): - self.cloudscraper.delay = delay - except (AttributeError, ValueError): - self.cloudscraper.simpleException( - CloudflareIUAMError, - "Cloudflare IUAM possibility malformed, issue extracing delay value." - ) - - time.sleep(self.cloudscraper.delay) - - # ------------------------------------------------------------------------------- # - - submit_url = self.IUAM_Challenge_Response( - resp.text, - resp.url, - self.cloudscraper.interpreter - ) - - # ------------------------------------------------------------------------------- # - # Send the Challenge Response back to Cloudflare - # ------------------------------------------------------------------------------- # - - if submit_url: - - def updateAttr(obj, name, newValue): - try: - obj[name].update(newValue) - return obj[name] - except (AttributeError, KeyError): - obj[name] = {} - obj[name].update(newValue) - return obj[name] - - cloudflare_kwargs = deepcopy(kwargs) - cloudflare_kwargs['allow_redirects'] = False - cloudflare_kwargs['data'] = updateAttr( - cloudflare_kwargs, - 'data', - submit_url['data'] - ) - - urlParsed = urlparse(resp.url) - cloudflare_kwargs['headers'] = updateAttr( - cloudflare_kwargs, - 'headers', - { - 'Origin': f'{urlParsed.scheme}://{urlParsed.netloc}', - 'Referer': resp.url - } - ) - - challengeSubmitResponse = self.cloudscraper.request( - 'POST', - submit_url['url'], - **cloudflare_kwargs - ) - - if challengeSubmitResponse.status_code == 400: - self.cloudscraper.simpleException( - CloudflareSolveError, - 'Invalid challenge answer detected, Cloudflare broken?' - ) - - # ------------------------------------------------------------------------------- # - # Return response if Cloudflare is doing content pass through instead of 3xx - # else request with redirect URL also handle protocol scheme change http -> https - # ------------------------------------------------------------------------------- # - - if not challengeSubmitResponse.is_redirect: - return challengeSubmitResponse - - else: - cloudflare_kwargs = deepcopy(kwargs) - cloudflare_kwargs['headers'] = updateAttr( - cloudflare_kwargs, - 'headers', - {'Referer': challengeSubmitResponse.url} - ) - - if not urlparse(challengeSubmitResponse.headers['Location']).netloc: - redirect_location = urljoin( - challengeSubmitResponse.url, - challengeSubmitResponse.headers['Location'] - ) - else: - redirect_location = challengeSubmitResponse.headers['Location'] - - return self.cloudscraper.request( - resp.request.method, - redirect_location, - **cloudflare_kwargs - ) - - # ------------------------------------------------------------------------------- # - # We shouldn't be here... - # Re-request the original query and/or process again.... - # ------------------------------------------------------------------------------- # - - return self.cloudscraper.request(resp.request.method, resp.url, **kwargs) - - # ------------------------------------------------------------------------------- # diff --git a/libs/version.txt b/libs/version.txt index 966f61590..0e1dd8c78 100644 --- a/libs/version.txt +++ b/libs/version.txt @@ -109,7 +109,7 @@ pysrt==1.1.2 stevedore==5.2.0 # Required-by: subliminal_patch -cloudscraper==1.2.71 +cloudscraper==1.2.58 # newer version dropped captcha v1 support decorator==5.1.1 dnspython==2.6.1 enzyme==0.4.1