Skip to content

Commit

Permalink
Merge pull request #2 from leoncvlt/master
Browse files (browse the repository at this point in the history)
New Captcha Behaviour and dependencies
  • Loading branch information
rocketinventor authored Apr 5, 2021
2 parents 325ae51 + 9febcab commit 04424a3
Show file tree
Hide file tree
Showing 8 changed files with 132 additions and 467 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number | Diff line number | Diff line change
Expand Up @@ -147,3 +147,6 @@ logs
*.cmd
*.lnk
cookies.pkl

# Poetry .lock file
poetry.lock
2 changes: 2 additions & 0 deletions README.md
Original file line number | Diff line number | Diff line change
Expand Up @@ -83,6 +83,8 @@ optional arguments:
--chromedriver CHROMEDRIVER
Path to a specific chromedriver executable instead of
the built-in one
--no-ublock Disable the uBlock Chrome extension. Might be needed
to solve captcha
-v, --verbose Increases logging verbosity
```

Expand Down
2 changes: 1 addition & 1 deletion bin/ublock/ublock-settings.txt
Original file line number | Diff line number | Diff line change
Expand Up @@ -62,7 +62,7 @@
"wyciwyg-scheme"
],
"netWhitelist": "about-scheme\nchrome-extension-scheme\nchrome-scheme\nmoz-extension-scheme\nopera-scheme\nvivaldi-scheme\nwyciwyg-scheme",
"dynamicFilteringString": "behind-the-scene * * noop\nbehind-the-scene * inline-script noop\nbehind-the-scene * 1p-script noop\nbehind-the-scene * 3p-script noop\nbehind-the-scene * 3p-frame noop\nbehind-the-scene * image noop\nbehind-the-scene * 3p noop\n* * 3p block\nwww.blinkist.com blinkist.io * allow\nwww.blinkist.com d17pjsg7x52x9r.cloudfront.net * allow\nwww.blinkist.com jsdelivr.net * block\nwww.blinkist.com hcaptcha.com * block",
"dynamicFilteringString": "behind-the-scene * * noop\nbehind-the-scene * inline-script noop\nbehind-the-scene * 1p-script noop\nbehind-the-scene * 3p-script noop\nbehind-the-scene * 3p-frame noop\nbehind-the-scene * image noop\nbehind-the-scene * 3p noop\n* * 3p block\nwww.blinkist.com blinkist.io * allow\nwww.blinkist.com d17pjsg7x52x9r.cloudfront.net * allow\nwww.blinkist.com jsdelivr.net * block\nwww.blinkist.com hcaptcha.com * allow",
"urlFilteringString": "",
"hostnameSwitchesString": "no-large-media: behind-the-scene false\nno-remote-fonts: * true",
"userFilters": ""
Expand Down
8 changes: 7 additions & 1 deletion blinkistscraper/__main__.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -179,6 +179,12 @@ def check_cooldown(value):
"--chromedriver",
help="Path to a specific chromedriver executable instead of the built-in one",
)
parser.add_argument(
"--no-ublock",
action="store_true",
default=False,
help="Disable the uBlock Chrome extension. Might be needed to solve captcha",
)
parser.add_argument(
"-v", "--verbose", action="store_true", help="Increases logging verbosity"
)
Expand Down
Expand Up @@ -274,7 +280,7 @@ def finish(start_time, processed_books, driver=None):
match_language = args.language if args.match_language else ""
start_headless = args.headless
# add uBlock (except on headless)
use_ublock = not args.headless
use_ublock = not args.no_ublock and not args.headless
driver = scraper.initialize_driver(
headless=start_headless,
with_ublock=use_ublock,
Expand Down
46 changes: 35 additions & 11 deletions blinkistscraper/scraper.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -131,29 +131,53 @@ def initialize_driver(headless=True, with_ublock=False, chromedriver_path=None):

def login(driver, language, email, password):
# we need to navigate to a page first in order to load eventual cookies
driver.get(f"https://www.blinkist.com/{language}")
driver.get(f"https://www.blinkist.com/{language}/nc/login")
is_logged_in = False

# if we have any stored login cookie, load them into the driver
if has_login_cookies():
load_login_cookies(driver)

# assume that a captcha needs to be solved, if no blinkist logo appears within 5sec
try:
WebDriverWait(driver, 5).until(
EC.presence_of_element_located(
(By.CLASS_NAME, "header__logo")
)
)
except TimeoutException as ex:
log.info("Please solve captcha to proceed!")

# fail if captcha not solved within 60sec
try:
WebDriverWait(driver, 60).until(
EC.presence_of_element_located(
(By.CLASS_NAME, "header__logo")
)
)
except TimeoutException as ex:
log.error("Error. Captcha needs to be solved within 1 minute")
return False

# navigate to the login page and check for the login email input
# if not found, assume we're logged in
# navigate to the login page
sign_in_url = f"https://www.blinkist.com/{language}/nc/login"
driver.get(sign_in_url)

# click on cookie banner, if necessary
time.sleep(1.0)
try:
cookiebanner = driver.find_element_by_class_name("cookie-disclaimer__cta")
except:
pass
else:
cookiebanner.click()

# check for the login email input. if not found, assume we're logged in
try:
driver.find_element_by_id("login-form_login_email")
except NoSuchElementException:
is_logged_in = True
# try:
# WebDriverWait(driver, 360).until(
# EC.presence_of_element_located((By.ID, "login-form_login_email"))
# )
# except TimeoutException as ex:
# log.error("Error logging in.")
# return False


# if not logged in, autofill the email and password inputs with the
# provided login credentials
if not is_logged_in:
Expand Down
Loading

0 comments on commit 04424a3

Please sign in to comment.