-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathamazon.py
84 lines (79 loc) · 3.89 KB
/
amazon.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import difflib
import math
import os
import re
import time
from urllib.parse import quote_plus, urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from webbot import Browser

import settings
# book_file = 'Eureka Sale Sept 2021.csv'
# book_df = pd.read_csv(os.getcwd() + '/' + book_file)
# linked_URLs = pd.read_csv(os.getcwd() + '/Linked_URLs.csv')
def getAmazonMatches(books, web, refresh_existing=False):
linked_URLs = pd.read_csv(os.getcwd() + "/data/" + settings.LINKED_URLS)
for book in books:
print(book)
amazon_link = None
search_url = (
"https://www.amazon.com/s?k="
+ book.title.replace(" ", "+").replace("'", "-")
+ "+"
+ book.author.replace(" ", "+")
.replace("PhD", "")
.replace("MD", "")
.replace("Dr", "")
.replace("translator", "")
.replace("foreword", "")
.replace("featuring", "")
.replace("introduction", "")
.replace("note", "")
.replace("afterword", "")
+ "&i=audible"
)
web.go_to(search_url)
page = web.get_page_source()
soup = BeautifulSoup(page, "html.parser")
htmlItemSnippetList = soup.select(
"body > div#a-page > div#search > div.s-desktop-content > div.sg-col > div.sg-col-inner > span > div.s-result-list > div.s-result-item > div.sg-col-inner > span > div > div.a-section > div > div.sg-col > div.sg-col-inner > div.a-section > div.a-section > h2"
)
if len(htmlItemSnippetList) > 0:
# print(htmlItemSnippetList[0].find_all('a'))
amazon_link = htmlItemSnippetList[0].find_all("a")[0].get("href")
print(amazon_link)
amazon_link = (
"https://www.amazon.com" + amazon_link + "&tag=listenerslist-20"
)
book.amazon_link = amazon_link
return books
# for index, item in book_df.iterrows():
# if True:
# print(index)
# title = item['Audible_Title']
# print(title)
# author = item['Audible_Author'].replace('The Great Courses', '')
# amazon_link = ''
# print(item['Amazon_Link'])
# if ((linked_URLs['Audible_Title'] == title) & (linked_URLs['Audible_Author'] == author)).any():
# amazon_link = linked_URLs[(linked_URLs['Audible_Title'] == title) & (
# linked_URLs['Audible_Author'] == author)].iloc[0]['Amazon_Link']
# print('In Linked URLs')
# else: # Get from Amazon search
# search_url = 'https://www.amazon.com/s?k=' + title.replace(' ', '+').replace("'", "-") + '+' + author.replace(' ', '+').replace('PhD', '').replace('MD', '').replace(
# 'Dr', '').replace('translator', '').replace('foreword', '').replace('featuring', '').replace('introduction', '').replace('note', '').replace('afterword', '') + '&i=audible'
# web.go_to(search_url)
# page = web.get_page_source()
# soup = BeautifulSoup(page, 'html.parser')
# htmlItemSnippetList = soup.select(
# 'body > div#a-page > div#search > div.s-desktop-content > div.sg-col > div.sg-col-inner > span > div.s-result-list > div.s-result-item > div.sg-col-inner > div > div > div > div.sg-row > div.sg-col-8-of-16 > div.sg-col-inner > div.a-section > div.a-section > h2')
# # print(htmlItemSnippetList)
# if len(htmlItemSnippetList) > 0:
# print(htmlItemSnippetList[0].find_all('a'))
# amazon_link = htmlItemSnippetList[0].find_all('a')[
# 0].get('href')
# print(amazon_link)
# amazon_link = 'https://www.amazon.com' + amazon_link + '&tag=listenerslist-20'
# book_df.loc[index, 'Amazon_Link'] = amazon_link
# return book_df