Skip to content

Commit

Permalink
WIP solve amazon captcha with 2captcha
Browse files Browse the repository at this point in the history
Use coordinate method
Only search for relevant info on page
  • Loading branch information
Meatplay authored and codders committed Dec 17, 2024
1 parent 511001b commit b3c805c
Show file tree
Hide file tree
Showing 9 changed files with 104 additions and 14 deletions.
1 change: 1 addition & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ gunicorn = "23.0.0"
flask-api = {editable = true, ref = "develop", git = "git+https://github.com/flask-api/flask-api.git"}
setuptools = "==75.6.0"
certifi = "==2024.12.14"
2captcha-python = "*"

[dev-packages]
exceptiongroup = "*"
Expand Down
19 changes: 14 additions & 5 deletions Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

63 changes: 62 additions & 1 deletion flathunter/abstract_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
from typing import Optional, Any
import json

from io import BytesIO
import base64

import backoff
import requests
# pylint: disable=unused-import
Expand All @@ -13,10 +16,11 @@
from bs4 import BeautifulSoup

from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver import Chrome
from selenium.webdriver import Chrome, Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains

from flathunter import proxies
from flathunter.captcha.captcha_solver import CaptchaUnsolvableError
Expand Down Expand Up @@ -196,6 +200,7 @@ def resolve_geetest(self, driver):
driver.refresh()
raise

# pylint: disable=too-many-locals
@backoff.on_exception(wait_gen=backoff.constant,
exception=CaptchaUnsolvableError,
max_tries=3)
Expand Down Expand Up @@ -268,6 +273,62 @@ def log_filter(log_):
driver.refresh()
raise

@backoff.on_exception(wait_gen=backoff.constant,
                      exception=CaptchaUnsolvableError,
                      max_tries=3)
def resolve_amazon(self, driver):
    """Resolve the Amazon (AWS WAF) image captcha shown in `driver`.

    A screenshot of the captcha widget is sent to `self.captcha_solver`,
    the coordinates it returns are clicked one by one, and the verify
    button is pressed if it is present.

    Raises:
        CaptchaUnsolvableError: if any step fails, the solver's answer
            cannot be parsed, or the `awswaf-captcha` element is still on
            the page afterwards. The page is refreshed before raising so
            that the `backoff` retry starts from a fresh challenge.
    """
    shadow_root_script = "return document.querySelector('awswaf-captcha').shadowRoot"
    try:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        sleep(3)
        captcha_root = driver.execute_script(shadow_root_script).find_element(By.ID, "root")
        size = captcha_root.size

        # Move one entry down in the widget's <select> before taking the screenshot.
        challenge_select = captcha_root.find_element(By.TAG_NAME, "select")
        challenge_select.click()
        sleep(1)
        challenge_select.send_keys(Keys.DOWN)
        sleep(3)

        # Look the root element up again after interacting with the <select>.
        captcha_root = driver.execute_script(shadow_root_script).find_element(By.ID, "root")
        base64_screenshot = base64.b64encode(captcha_root.screenshot_as_png).decode('utf-8')

        # Send image to the captcha solving service
        result = self.captcha_solver.solve_amazon(base64_screenshot)
        logger.info(result.token)

        # The token is parsed as "<prefix>:x=<int>,y=<int>;x=<int>,y=<int>;..."
        token_parts = result.token.split(':')
        if len(token_parts) < 2:
            raise CaptchaUnsolvableError(
                f"Unexpected captcha solution format: {result.token}"
            )
        click_points = [
            [int(value.split('=')[1]) for value in point.split(',')]
            for point in token_parts[1].split(';')
        ]
        # Extra click 30px in from the bottom-right corner of the widget
        # (presumably where the confirm button sits - TODO confirm).
        click_points.append([size['width'] - 30, size['height'] - 30])

        # NOTE(review): 160/211 look like half of the widget's width/height,
        # converting screenshot coordinates into offsets from the element's
        # centre - confirm against the Selenium version in use.
        actions = ActionChains(driver)
        for point in click_points:
            actions.move_to_element_with_offset(
                captcha_root, point[0] - 160, point[1] - 211
            ).click()
            actions.perform()
            sleep(0.5)
            actions.reset_actions()
        sleep(1)

        try:
            confirm_button = captcha_root.find_element(By.ID, "amzn-btn-verify-internal")
            actions.move_to_element_with_offset(confirm_button, 40, 15).click()
            actions.perform()
            sleep(4)
        except NoSuchElementException:
            # No verify button to press; fall through to the final check.
            pass

        try:
            driver.find_element(By.TAG_NAME, "awswaf-captcha")
        except NoSuchElementException:
            logger.info("Captcha solved")
        else:
            raise CaptchaUnsolvableError()
    except CaptchaUnsolvableError:
        # Already the right exception type: refresh and re-raise unchanged
        # instead of wrapping it in a second CaptchaUnsolvableError.
        driver.refresh()
        raise
    except Exception as ex:
        driver.refresh()
        raise CaptchaUnsolvableError() from ex

@backoff.on_exception(wait_gen=backoff.constant,
exception=CaptchaUnsolvableError,
max_tries=3)
Expand Down
5 changes: 2 additions & 3 deletions flathunter/captcha/capmonster_solver.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
"""Captcha solver for CapMonster Captcha Solving Service (https://capmonster.cloud)"""
import json
from typing import Dict
from time import sleep
import backoff
Expand All @@ -8,8 +7,6 @@
from flathunter.logging import logger
from flathunter.captcha.captcha_solver import (
CaptchaSolver,
CaptchaBalanceEmpty,
CaptchaUnsolvableError,
GeetestResponse,
AwsAwfResponse,
RecaptchaResponse,
Expand All @@ -26,6 +23,8 @@ def solve_recaptcha(self, google_site_key: str, page_url: str) -> RecaptchaRespo
"""Should be implemented in subclass"""
raise NotImplementedError("Recaptcha captcha solving is not implemented for Capmonster")

# pylint: disable=too-many-arguments
# pylint: disable=too-many-positional-arguments
def solve_awswaf(
self,
sitekey: str,
Expand Down
3 changes: 2 additions & 1 deletion flathunter/captcha/captcha_solver.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ class AwsAwfResponse:
"""Response from AWS WAF"""
token: str


class CaptchaSolver:
"""Interface for Captcha solvers"""

Expand All @@ -39,6 +38,8 @@ def solve_geetest(self, geetest: str, challenge: str, page_url: str) -> GeetestR
"""Should be implemented in subclass"""
raise NotImplementedError()

# pylint: disable=too-many-arguments
# pylint: disable=too-many-positional-arguments
def solve_awswaf(
self,
sitekey: str,
Expand Down
2 changes: 2 additions & 0 deletions flathunter/captcha/imagetyperz_solver.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ def solve_recaptcha(self, google_site_key: str, page_url: str) -> RecaptchaRespo
)
return RecaptchaResponse(self.__retrieve_imagetyperz_result(captcha_id))

# pylint: disable=too-many-arguments
# pylint: disable=too-many-positional-arguments
def solve_awswaf(
self,
sitekey: str,
Expand Down
18 changes: 16 additions & 2 deletions flathunter/captcha/twocaptcha_solver.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from time import sleep
import backoff
import requests
from twocaptcha import TwoCaptcha

from flathunter.logging import logger
from flathunter.captcha.captcha_solver import (
Expand Down Expand Up @@ -47,6 +48,8 @@ def solve_recaptcha(self, google_site_key: str, page_url: str) -> RecaptchaRespo
captcha_id = self.__submit_2captcha_request(params)
return RecaptchaResponse(self.__retrieve_2captcha_result(captcha_id))

# pylint: disable=too-many-arguments
# pylint: disable=too-many-positional-arguments
def solve_awswaf(
self,
sitekey: str,
Expand All @@ -56,8 +59,19 @@ def solve_awswaf(
captcha_script: str,
page_url: str
) -> AwsAwfResponse:
"""Should be implemented at some point"""
raise NotImplementedError("AWS WAF captchas not supported for 2Captcha")
"""Using the `solve_amazon` method instead"""
raise NotImplementedError()

def solve_amazon(
    self,
    image_b64: str
) -> AwsAwfResponse:
    """Solve an AWS WAF image captcha from a base64-encoded screenshot.

    The image is submitted to 2Captcha's coordinates task and the "code"
    field of the answer is returned wrapped in an `AwsAwfResponse`.

    Raises:
        CaptchaUnsolvableError: if the 2Captcha client returns `None`.
    """
    client = TwoCaptcha(self.api_key, defaultTimeout=60, pollingInterval=5)
    answer = client.coordinates(image_b64, lang='en')
    if answer is not None:
        return AwsAwfResponse(answer["code"])
    raise CaptchaUnsolvableError("Got None from 2captcha solve")

@backoff.on_exception(**CaptchaSolver.backoff_options)
def __submit_2captcha_request(self, params: Dict[str, str]) -> str:
Expand Down
5 changes: 3 additions & 2 deletions flathunter/crawler/immobilienscout.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
from flathunter.abstract_crawler import Crawler
from flathunter.logging import logger
from flathunter.chrome_wrapper import get_chrome_driver
from flathunter.captcha.twocaptcha_solver import TwoCaptchaSolver
from flathunter.exceptions import DriverLoadException

STATIC_URL_PATTERN = re.compile(r'https://www\.immobilienscout24\.de')
Expand All @@ -35,7 +34,7 @@ class Immobilienscout(Crawler):

URL_PATTERN = STATIC_URL_PATTERN

JSON_PATH_PARSER_ENTRIES = parse("$..['resultlist.realEstate']")
JSON_PATH_PARSER_ENTRIES = parse("$..['resultlistEntries']..['resultlist.realEstate']")
JSON_PATH_PARSER_IMAGES = parse("$..galleryAttachments"
"..attachment[?'@xsi.type'=='common:Picture']"
"..['@href'].`sub(/(.*\\\\.jpe?g).*/, \\\\1)`")
Expand Down Expand Up @@ -117,6 +116,8 @@ def get_results(self, search_url, max_pages=None):

def get_entries_from_javascript(self):
"""Get entries from JavaScript"""
if "Warum haben wir deine Anfrage blockiert?" in self.get_driver_force().page_source:
self.resolve_amazon(self.get_driver_force())
try:
result_json = self.get_driver_force().execute_script('return window.IS24.resultList;')
except JavascriptException:
Expand Down
2 changes: 2 additions & 0 deletions flathunter/gmaps_duration_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ def process_expose(self, expose):

def get_formatted_durations(self, address):
"""Return a formatted list of GoogleMaps durations"""
if address is None:
return ""
out = ""
for duration in self.config.get('durations', []):
if 'destination' in duration and 'name' in duration:
Expand Down

0 comments on commit b3c805c

Please sign in to comment.