Collect Amazon product IDs and save in *.json #48

Open · wants to merge 3 commits into master
6 changes: 4 additions & 2 deletions README.md
@@ -18,8 +18,8 @@ The script will automatically try to download and use the appropriate chromedriv
usage: blinkistscraper [-h] [--language {en,de}] [--match-language]
[--cooldown COOLDOWN] [--headless] [--audio]
[--concat-audio] [--keep-noncat] [--no-scrape]
[--book BOOK] [--daily-book] [--books BOOKS]
[--book-category BOOK_CATEGORY]
[--get-amazon-url] [--book BOOK] [--daily-book]
[--books BOOKS] [--book-category BOOK_CATEGORY]
[--categories CATEGORIES [CATEGORIES ...]]
[--ignore-categories IGNORE_CATEGORIES [IGNORE_CATEGORIES ...]]
[--create-html] [--create-epub] [--create-pdf]
@@ -49,6 +49,8 @@ optional arguments:
--no-scrape Don't scrape the website, only process existing json
files in the dump folder. Do not provide email or
password with this option.
--get-amazon-url Scrape Amazon product ID as well. Will additionally
scrape https://.../en/books/ for this
--book BOOK Scrapes this book only, takes the blinkist url for the
book(e.g. https://www.blinkist.com/en/books/... or
https://www.blinkist.com/en/nc/reader/...)
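For reference, a hypothetical invocation using the new flag (assuming the scraper is run from the repository root and takes the usual positional email/password arguments described in the README) could look like:

python blinkistscraper your@email.com yourpassword --get-amazon-url

The flag only toggles the additional Amazon lookup described in the help text above; the other options are unaffected.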
10 changes: 8 additions & 2 deletions blinkistscraper/__main__.py
@@ -100,6 +100,12 @@ def check_cooldown(value):
default=False,
help="Don't scrape the website, only process existing json files in the dump folder. Do not provide email or password with this option.",
)
parser.add_argument(
"--get-amazon-url",
action="store_true",
default=False,
help="Scrape Amazon product ID as well. Will additionally scrape https://.../en/books/ for this",
)
parser.add_argument(
"--book",
default=False,
@@ -212,7 +218,7 @@ def generate_book_outputs(book_json, cover_img=False):

def scrape_book(driver, processed_books, book_url, category, match_language):
book_json, dump_exists = scraper.scrape_book_data(
driver, book_url, category=category, match_language=match_language
driver, book_url, args.get_amazon_url, category=category, match_language=match_language
)
if book_json:
cover_img_file = False
@@ -350,7 +356,7 @@ def finish(start_time, processed_books, driver=None):
driver,
processed_books,
book_url,
category={"label": "Uncategorized"},
category={"label": "Uncategorized", "id": -1},
match_language=match_language,
)
if not dump_exists:
43 changes: 32 additions & 11 deletions blinkistscraper/scraper.py
@@ -244,7 +244,7 @@ def get_categories(driver, language, specified_categories=None, ignored_categori

# parse the individual category links
categories_items = categories_list.find_elements_by_tag_name("li")
for item in categories_items:
for id, item in enumerate(categories_items):
link = item.find_element_by_tag_name("a")
href = link.get_attribute("href")
label = link.find_element_by_tag_name("span").get_attribute("innerHTML")
@@ -260,7 +260,7 @@ def get_categories(driver, language, specified_categories=None, ignored_categori
if list(filter(lambda ic: ic.lower() in label.lower(), ignored_categories)):
continue

category = {"label": " ".join(label.split()).replace("&amp;", "&"), "url": href}
category = {"label": " ".join(label.split()).replace("&amp;", "&"), "url": href, 'id':id}
categories_links.append(category)
log.info(
f"Scraping categories: {', '.join([c['label'] for c in categories_links])}"
@@ -321,22 +321,23 @@ def detect_needs_upgrade(driver):


def scrape_book_data(
driver, book_url, match_language="", category={"label": "Uncategorized"}, force=False
driver, url, get_amazon_url, match_language="", category={"label": "Uncategorized", "id":-1}, force=False
):
# check if this book has already been dumped, unless we are forcing scraping
# if so, return the content of the dump, along with a flag saying it already existed
if os.path.exists(get_book_dump_filename(book_url)) and not force:
log.debug(f"Json dump for book {book_url} already exists, skipping scraping...")
with open(get_book_dump_filename(book_url)) as f:
if os.path.exists(get_book_dump_filename(url)) and not force:
log.debug(f"Json dump for book {url} already exists, skipping scraping...")
with open(get_book_dump_filename(url)) as f:
return json.load(f), True

# if not, proceed scraping the reader page
log.info(f"Scraping book at {book_url}")
if "/nc/reader/" not in book_url:
book_url = book_url.replace("/books/", "/nc/reader/")
log.info(f"Scraping book at {url}")
if "/nc/reader/" not in url:
url = url.replace("/books/", "/nc/reader/")

if not driver.current_url == book_url:
driver.get(book_url)
# go to /nc/reader/... url, if not already there
if not driver.current_url == url:
driver.get(url)

# check for re-direct to the upgrade page
detect_needs_upgrade(driver)
@@ -393,9 +394,29 @@ def scrape_book_data(
supplement_text = supplement_content.get_attribute("innerHTML")
chapter_json["supplement"] = supplement_text
break

# if the --get-amazon-url cli-switch is enabled, go to ../books/.. page and extract amazon product id
if get_amazon_url:
books_url = url.replace("/nc/reader/", "/books/")
driver.get(books_url)

try:
WebDriverWait(driver, 5).until(
EC.element_to_be_clickable(
(By.CLASS_NAME, "buy-book-button")
)
)
except TimeoutException as ex:
log.warning("No 'Buy' button found. No Amazon ASIN collected.")
else:
buy_button = driver.find_element_by_class_name("buy-book-button")
if buy_button.is_displayed():
amazon_url = buy_button.get_attribute("href")
book["amazon_id"] = sanitize_amazon_id(amazon_url)

# if we are scraping by category, add it to the book metadata
book["category"] = category["label"]
book["category_id"] = category["id"]

# store the book json metadata for future use
dump_book(book)
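With --get-amazon-url enabled, the dumped book JSON simply gains the new fields set above. A hypothetical excerpt (all values illustrative, not taken from a real dump):

{
    "category": "Uncategorized",
    "category_id": -1,
    "amazon_id": "B01N5AX61W"
}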
7 changes: 7 additions & 0 deletions blinkistscraper/utils.py
@@ -14,6 +14,13 @@ def sanitize_name(name):
return re.sub(r'[\\/*?:"<>|.]', "", name).strip()


def sanitize_amazon_id(amazon_url):
amazon_url = amazon_url.replace("https://www.amazon.de/dp/", "")
amazon_url = amazon_url.replace("https://www.amazon.com/dp/", "")
amazon_id = amazon_url.split('?tag=')[0].replace("/","")
return amazon_id


def get_book_dump_filename(book_json_or_url):
if "blinkist.com" in book_json_or_url:
return os.path.join("dump", book_json_or_url.split("/")[-1] + ".json")
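A minimal sketch of the expected sanitize_amazon_id behaviour, assuming the buy-button href is a typical Amazon affiliate link (the URL, tag and import path below are hypothetical, for illustration only):

from utils import sanitize_amazon_id  # assuming the helper is imported like the other utils

url = "https://www.amazon.com/dp/B01N5AX61W/?tag=blinkist-21"  # hypothetical buy-button href
print(sanitize_amazon_id(url))  # -> "B01N5AX61W": the /dp/ prefix, slashes and the "?tag=" suffix are stripped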