diff --git a/README.md b/README.md
index ff81ce0..98d483b 100644
--- a/README.md
+++ b/README.md
@@ -18,8 +18,8 @@ The script will automatically try to download and use the appropriate chromedriv
 usage: blinkistscraper [-h] [--language {en,de}] [--match-language]
                        [--cooldown COOLDOWN] [--headless] [--audio]
                        [--concat-audio] [--keep-noncat] [--no-scrape]
-                       [--book BOOK] [--daily-book] [--books BOOKS]
-                       [--book-category BOOK_CATEGORY]
+                       [--get-amazon-url] [--book BOOK] [--daily-book]
+                       [--books BOOKS] [--book-category BOOK_CATEGORY]
                        [--categories CATEGORIES [CATEGORIES ...]]
                        [--ignore-categories IGNORE_CATEGORIES [IGNORE_CATEGORIES ...]]
                        [--create-html] [--create-epub] [--create-pdf]
@@ -49,6 +49,8 @@ optional arguments:
   --no-scrape           Don't scrape the website, only process existing json
                         files in the dump folder. Do not provide email or
                         password with this option.
+  --get-amazon-url      Scrape Amazon product ID as well. Will additionally
+                        scrape https://.../en/books/ for this
   --book BOOK           Scrapes this book only, takes the blinkist url for the
                         book(e.g. https://www.blinkist.com/en/books/... or
                         https://www.blinkist.com/en/nc/reader/...)
diff --git a/blinkistscraper/__main__.py b/blinkistscraper/__main__.py
index 23a486f..cc1ab13 100644
--- a/blinkistscraper/__main__.py
+++ b/blinkistscraper/__main__.py
@@ -100,6 +100,12 @@ def check_cooldown(value):
         default=False,
         help="Don't scrape the website, only process existing json files in the dump folder. Do not provide email or password with this option.",
     )
+    parser.add_argument(
+        "--get-amazon-url",
+        action="store_true",
+        default=False,
+        help="Scrape Amazon product ID as well. Will additionally scrape https://.../en/books/ for this",
+    )
     parser.add_argument(
         "--book",
         default=False,
@@ -212,7 +218,7 @@ def generate_book_outputs(book_json, cover_img=False):
 
 def scrape_book(driver, processed_books, book_url, category, match_language):
     book_json, dump_exists = scraper.scrape_book_data(
-        driver, book_url, category=category, match_language=match_language
+        driver, book_url, args.get_amazon_url, category=category, match_language=match_language
     )
     if book_json:
         cover_img_file = False
@@ -350,7 +356,7 @@ def finish(start_time, processed_books, driver=None):
                 driver,
                 processed_books,
                 book_url,
-                category={"label": "Uncategorized"},
+                category={"label": "Uncategorized", "id": -1},
                 match_language=match_language,
             )
             if not dump_exists:
diff --git a/blinkistscraper/scraper.py b/blinkistscraper/scraper.py
index 2e02a51..8c65e73 100644
--- a/blinkistscraper/scraper.py
+++ b/blinkistscraper/scraper.py
@@ -244,7 +244,7 @@ def get_categories(driver, language, specified_categories=None, ignored_categori
 
     # parse the invidual category links
    categories_items = categories_list.find_elements_by_tag_name("li")
-    for id, item in enumerate(categories_items):
+    for id, item in enumerate(categories_items):
        link = item.find_element_by_tag_name("a")
        href = link.get_attribute("href")
        label = link.find_element_by_tag_name("span").get_attribute("innerHTML")
@@ -260,7 +260,7 @@ def get_categories(driver, language, specified_categories=None, ignored_categori
         if list(filter(lambda ic: ic.lower() in label.lower(), ignored_categories)):
             continue
-        category = {"label": " ".join(label.split()).replace("&amp;", "&"), "url": href}
+        category = {"label": " ".join(label.split()).replace("&amp;", "&"), "url": href, "id": id}
         categories_links.append(category)
     log.info(
         f"Scraping categories: {', '.join([c['label'] for c in categories_links])}"
     )
@@ -321,22 +321,23 @@ def detect_needs_upgrade(driver):
 
 
 def scrape_book_data(
-    driver, book_url, match_language="", category={"label": "Uncategorized"}, force=False
+    driver, url, get_amazon_url, match_language="", category={"label": "Uncategorized", "id": -1}, force=False
 ):
     # check if this book has already been dumped, unless we are forcing scraping
     # if so return the content of the dump, alonside with a flash saying it already existed
-    if os.path.exists(get_book_dump_filename(book_url)) and not force:
-        log.debug(f"Json dump for book {book_url} already exists, skipping scraping...")
-        with open(get_book_dump_filename(book_url)) as f:
+    if os.path.exists(get_book_dump_filename(url)) and not force:
+        log.debug(f"Json dump for book {url} already exists, skipping scraping...")
+        with open(get_book_dump_filename(url)) as f:
             return json.load(f), True
 
     # if not, proceed scraping the reader page
-    log.info(f"Scraping book at {book_url}")
-    if "/nc/reader/" not in book_url:
-        book_url = book_url.replace("/books/", "/nc/reader/")
+    log.info(f"Scraping book at {url}")
+    if "/nc/reader/" not in url:
+        url = url.replace("/books/", "/nc/reader/")
 
-    if not driver.current_url == book_url:
-        driver.get(book_url)
+    # go to /nc/reader/... url, if not already there
+    if not driver.current_url == url:
+        driver.get(url)
 
     # check for re-direct to the upgrade page
     detect_needs_upgrade(driver)
@@ -393,9 +394,29 @@ def scrape_book_data(
                 supplement_text = supplement_content.get_attribute("innerHTML")
                 chapter_json["supplement"] = supplement_text
                 break
+
+    # if the --get-amazon-url cli-switch is enabled, go to ../books/.. page and extract amazon product id
+    if get_amazon_url:
+        books_url = url.replace("/nc/reader/", "/books/")
+        driver.get(books_url)
+
+        try:
+            WebDriverWait(driver, 5).until(
+                EC.element_to_be_clickable(
+                    (By.CLASS_NAME, "buy-book-button")
+                )
+            )
+        except TimeoutException as ex:
+            log.warning("No 'Buy' button found. No Amazon ASIN collected.")
+        else:
+            buy_button = driver.find_element_by_class_name("buy-book-button")
+            if buy_button.is_displayed():
+                amazon_url = buy_button.get_attribute("href")
+                book["amazon_id"] = sanitize_amazon_id(amazon_url)
 
     # if we are scraping by category, add it to the book metadata
     book["category"] = category["label"]
+    book["category_id"] = category["id"]
 
     # store the book json metadata for future use
     dump_book(book)
diff --git a/blinkistscraper/utils.py b/blinkistscraper/utils.py
index 68d04da..54309a3 100644
--- a/blinkistscraper/utils.py
+++ b/blinkistscraper/utils.py
@@ -14,6 +14,13 @@ def sanitize_name(name):
     return re.sub(r'[\\/*?:"<>|.]', "", name).strip()
 
 
+def sanitize_amazon_id(amazon_url):
+    amazon_url = amazon_url.replace("https://www.amazon.de/dp/", "")
+    amazon_url = amazon_url.replace("https://www.amazon.com/dp/", "")
+    amazon_id = amazon_url.split("?tag=")[0].replace("/", "")
+    return amazon_id
+
+
 def get_book_dump_filename(book_json_or_url):
     if "blinkist.com" in book_json_or_url:
         return os.path.join("dump", book_json_or_url.split("/")[-1] + ".json")
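
For reference, a minimal sketch of what the new `sanitize_amazon_id` helper produces when `--get-amazon-url` is set and a "Buy" button is found. The href and the `?tag=` affiliate value below are made-up examples; the logic simply mirrors the helper added in `utils.py`:

```python
# Illustrative only: mirrors sanitize_amazon_id() from blinkistscraper/utils.py.
# The href below is a hypothetical "buy-book-button" link, not real data.
amazon_url = "https://www.amazon.com/dp/B00EXAMPLE?tag=blinkist-21"

amazon_id = (
    amazon_url.replace("https://www.amazon.de/dp/", "")
    .replace("https://www.amazon.com/dp/", "")
    .split("?tag=")[0]
    .replace("/", "")
)
print(amazon_id)  # B00EXAMPLE -- this is the value stored as book["amazon_id"]
```

Note that only the amazon.de and amazon.com `/dp/` prefixes are stripped, so hrefs from other Amazon storefronts or with other query strings would pass through largely unchanged.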