Collect Amazon product IDs and save in *.json #48

Open · wants to merge 3 commits into master
6 changes: 4 additions & 2 deletions README.md
@@ -18,8 +18,8 @@ The script will automatically try to download and use the appropriate chromedriv
usage: blinkistscraper [-h] [--language {en,de}] [--match-language]
[--cooldown COOLDOWN] [--headless] [--audio]
[--concat-audio] [--keep-noncat] [--no-scrape]
[--book BOOK] [--daily-book] [--books BOOKS]
[--book-category BOOK_CATEGORY]
[--get-amazon-url] [--book BOOK] [--daily-book]
[--books BOOKS] [--book-category BOOK_CATEGORY]
[--categories CATEGORIES [CATEGORIES ...]]
[--ignore-categories IGNORE_CATEGORIES [IGNORE_CATEGORIES ...]]
[--create-html] [--create-epub] [--create-pdf]
@@ -49,6 +49,8 @@ optional arguments:
--no-scrape Don't scrape the website, only process existing json
files in the dump folder. Do not provide email or
password with this option.
--get-amazon-url Scrape Amazon product ID as well. Will additionally
scrape https://.../en/books/ for this
--book BOOK Scrapes this book only, takes the blinkist url for the
book(e.g. https://www.blinkist.com/en/books/... or
https://www.blinkist.com/en/nc/reader/...)
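For reference, a hypothetical invocation using the new flag (assuming the scraper is run from the repository root and takes the usual positional email/password arguments described in the README) could look like:

python blinkistscraper your@email.com yourpassword --get-amazon-url

The flag only toggles the additional Amazon lookup described in the help text above; the other options are unaffected.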
10 changes: 8 additions & 2 deletions blinkistscraper/__main__.py
@@ -100,6 +100,12 @@ def check_cooldown(value):
default=False,
help="Don't scrape the website, only process existing json files in the dump folder. Do not provide email or password with this option.",
)
parser.add_argument(
"--get-amazon-url",
action="store_true",
default=False,
help="Scrape Amazon product ID as well. Will additionally scrape https://.../en/books/ for this",
)
parser.add_argument(
"--book",
default=False,
@@ -212,7 +218,7 @@ def generate_book_outputs(book_json, cover_img=False):

def scrape_book(driver, processed_books, book_url, category, match_language):
book_json, dump_exists = scraper.scrape_book_data(
driver, book_url, category=category, match_language=match_language
driver, book_url, args.get_amazon_url, category=category, match_language=match_language
)
if book_json:
cover_img_file = False
@@ -350,7 +356,7 @@ def finish(start_time, processed_books, driver=None):
driver,
processed_books,
book_url,
category={"label": "Uncategorized"},
category={"label": "Uncategorized", "id": -1},
match_language=match_language,
)
if not dump_exists:
43 changes: 32 additions & 11 deletions blinkistscraper/scraper.py
@@ -244,7 +244,7 @@ def get_categories(driver, language, specified_categories=None, ignored_categori

# parse the individual category links
categories_items = categories_list.find_elements_by_tag_name("li")
for item in categories_items:
for id, item in enumerate(categories_items):
link = item.find_element_by_tag_name("a")
href = link.get_attribute("href")
label = link.find_element_by_tag_name("span").get_attribute("innerHTML")
@@ -260,7 +260,7 @@ def get_categories(driver, language, specified_categories=None, ignored_categori
if list(filter(lambda ic: ic.lower() in label.lower(), ignored_categories)):
continue

category = {"label": " ".join(label.split()).replace("&amp;", "&"), "url": href}
category = {"label": " ".join(label.split()).replace("&amp;", "&"), "url": href, 'id':id}
categories_links.append(category)
log.info(
f"Scraping categories: {', '.join([c['label'] for c in categories_links])}"
@@ -321,22 +321,23 @@ def detect_needs_upgrade(driver):


def scrape_book_data(
driver, book_url, match_language="", category={"label": "Uncategorized"}, force=False
driver, url, get_amazon_url, match_language="", category={"label": "Uncategorized", "id":-1}, force=False
):
# check if this book has already been dumped, unless we are forcing scraping
# if so, return the content of the dump, along with a flag saying it already existed
if os.path.exists(get_book_dump_filename(book_url)) and not force:
log.debug(f"Json dump for book {book_url} already exists, skipping scraping...")
with open(get_book_dump_filename(book_url)) as f:
if os.path.exists(get_book_dump_filename(url)) and not force:
log.debug(f"Json dump for book {url} already exists, skipping scraping...")
with open(get_book_dump_filename(url)) as f:
return json.load(f), True

# if not, proceed scraping the reader page
log.info(f"Scraping book at {book_url}")
if "/nc/reader/" not in book_url:
book_url = book_url.replace("/books/", "/nc/reader/")
log.info(f"Scraping book at {url}")
if "/nc/reader/" not in url:
url = url.replace("/books/", "/nc/reader/")

if not driver.current_url == book_url:
driver.get(book_url)
# go to /nc/reader/... url, if not already there
if not driver.current_url == url:
driver.get(url)

# check for re-direct to the upgrade page
detect_needs_upgrade(driver)
@@ -393,9 +394,29 @@ def scrape_book_data(
supplement_text = supplement_content.get_attribute("innerHTML")
chapter_json["supplement"] = supplement_text
break

# if the --get-amazon-url cli-switch is enabled, go to ../books/.. page and extract amazon product id
if get_amazon_url:
books_url = url.replace("/nc/reader/", "/books/")
driver.get(books_url)

try:
WebDriverWait(driver, 5).until(
EC.element_to_be_clickable(
(By.CLASS_NAME, "buy-book-button")
)
)
except TimeoutException as ex:
log.warning("No 'Buy' button found. No Amazon ASIN collected.")
else:
buy_button = driver.find_element_by_class_name("buy-book-button")
if buy_button.is_displayed():
amazon_url = buy_button.get_attribute("href")
book["amazon_id"] = sanitize_amazon_id(amazon_url)

# if we are scraping by category, add it to the book metadata
book["category"] = category["label"]
book["category_id"] = category["id"]

# store the book json metadata for future use
dump_book(book)
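With --get-amazon-url enabled, the dumped book JSON simply gains the new fields set above. A hypothetical excerpt (all values illustrative, not taken from a real dump):

{
    "category": "Uncategorized",
    "category_id": -1,
    "amazon_id": "B01N5AX61W"
}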
7 changes: 7 additions & 0 deletions blinkistscraper/utils.py
@@ -14,6 +14,13 @@ def sanitize_name(name):
return re.sub(r'[\\/*?:"<>|.]', "", name).strip()


def sanitize_amazon_id(amazon_url):
amazon_url = amazon_url.replace("https://www.amazon.de/dp/", "")
amazon_url = amazon_url.replace("https://www.amazon.com/dp/", "")
amazon_id = amazon_url.split('?tag=')[0].replace("/","")
return amazon_id


def get_book_dump_filename(book_json_or_url):
if "blinkist.com" in book_json_or_url:
return os.path.join("dump", book_json_or_url.split("/")[-1] + ".json")
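A minimal sketch of the expected sanitize_amazon_id behaviour, assuming the buy-button href is a typical Amazon affiliate link (the URL, tag and import path below are hypothetical, for illustration only):

from utils import sanitize_amazon_id  # assuming the helper is imported like the other utils

url = "https://www.amazon.com/dp/B01N5AX61W/?tag=blinkist-21"  # hypothetical buy-button href
print(sanitize_amazon_id(url))  # -> "B01N5AX61W": the /dp/ prefix, slashes and the "?tag=" suffix are stripped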