main_scraper.py
# -*- coding: utf-8 -*-
"""
Created on Sun Oct 25 16:24:40 2020
@author: Anastasia
"""
# Import statements
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time
import pandas as pd
# Import functions and options from the helper modules
from scraper_functions import open_page, get_url_list, get_links, scrape_link_details, get_main_page, next_page
from scraper_options import PATH, search_terms, page_counter_limit
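# For reference, scraper_options.py is expected to provide the three names
# imported above. A minimal sketch with hypothetical values (the real file is
# not shown here):
#     PATH = '/path/to/chromedriver'             # chromedriver executable
#     search_terms = ['face mask', 'tote bag']   # terms to search on Etsy
#     page_counter_limit = 10                    # pages to scrape per term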
# Launch Chrome using the chromedriver executable at PATH
driver = webdriver.Chrome(PATH)
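# Note: passing the driver path positionally is Selenium 3 style. Under
# Selenium 4 the equivalent would be:
#     from selenium.webdriver.chrome.service import Service
#     driver = webdriver.Chrome(service=Service(PATH))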
# Build the search URLs and the corresponding list of search terms
urls, terms = get_url_list(search_terms)
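# get_url_list is defined in scraper_functions.py and its implementation is
# not shown here. Presumably it maps each search term to an Etsy search URL,
# roughly along these lines (illustrative sketch only):
#     urls = ['https://www.etsy.com/search?q=' + t.replace(' ', '+')
#             for t in search_terms]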
# Initialize the search-term counter
term_counter = 0
# Create empty lists to hold the results
titles = []
shop_names = []
is_ad = []
star_ratings = []
num_reviews = []
prices = []
bestseller = []
category = []
num_sales = []
num_basket = []
descriptions = []
days_to_arrival = []
cost_delivery = []
returns_accepted = []
dispatch_from = []
count_images = []
total_records = 0
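# Each list above becomes one column of the final dataframe; values are
# appended in lockstep (one entry per listing), which is why the script
# repeatedly checks that the list lengths stay equal.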
# Loop through the search-term URLs
for url in urls:
    # Open the URL and close the popup
    open_page(driver, url)
    # Initialize the record and page counters
    page_counter = 1
    record_counter = 0
    # Loop through pages of results until the desired page limit is reached - note that the Etsy limit is 240
    while page_counter < page_counter_limit:
        # Retry loop to prevent an IP block from stalling the scrape
        retries = 0
        while retries <= 5:
            # Wait until the page loads the search contents before scraping
            try:
                main = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.ID, 'content')))
                time.sleep(4)
                break  # page loaded successfully; stop retrying
            except TimeoutException:
                driver.refresh()
                retries += 1
        # Get a list of links to individual listings
        link_list = get_links(driver)
        # Keep only the first 65 links; the rest are 'recently viewed' links
        link_list = link_list[:65]
        # Loop over the links and scrape each listing's details
        for link in link_list:
            try:
                appenders = scrape_link_details(driver, link)
                mylists = [num_sales, num_basket, descriptions, days_to_arrival,
                           cost_delivery, returns_accepted, dispatch_from, count_images]
                for x, lst in zip(appenders, mylists):
                    lst.append(x)
            except Exception:
                # Abandon the remaining links on this page if one fails
                break
        # Get the listing containers and loop through them
        main = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.ID, 'content')))
        # Keep only the first 65 listing containers - the rest are 'recently viewed' items
        results = main.find_elements(
            By.XPATH,
            '//li[starts-with(@class, "wt-list-unstyled wt-grid__item-xs-6 wt-grid__item-md-4 wt-grid__item")]')[:65]
        # Add the scraped results to the record counter
        record_counter += len(results)
        # Loop over each listing container and collect its search-page details
        for result in results:
            try:
                info = get_main_page(driver, result, terms[term_counter])
                info_list = [titles, is_ad, shop_names, star_ratings,
                             num_reviews, prices, bestseller, category]
                for x, lst in zip(info, info_list):
                    lst.append(x)
            except Exception:
                break
        # For each page, report how many pages and records have been scraped,
        # and whether all the result lists are still the same length
        print(f'Finished scraping page {page_counter} of "{terms[term_counter]}" search term.')
        print(f'Scraped {record_counter} records of "{terms[term_counter]}" search term.')
        print(f'Are all lists equal? {len(count_images) == record_counter}')
        print()
        # Advance the page counter
        page_counter += 1
        # Click through to the next page
        try:
            next_page(driver, page_counter)
        except Exception:
            print('no next page')
            break
    # Update the total record count
    total_records += record_counter
    # Print the record totals for this term and for the scrape overall
    print(f'Finished scraping search term. Total records for "{terms[term_counter]}": {record_counter}')
    print(f'Total records in scrape: {total_records}')
    print(f'Are all lists equal? {len(count_images) == total_records}')
    print()
    # Advance the term counter so the next term is scraped
    term_counter += 1
# Close the driver
driver.quit()
# Assemble a dict from the scraped lists
data = {'Title': titles, 'Shop_Name': shop_names, 'Is_Ad': is_ad,
        'Star_Rating': star_ratings, 'Num_Reviews': num_reviews,
        'Price': prices, 'Is_Bestseller': bestseller, 'Num_Sales': num_sales,
        'Num_Basket': num_basket, 'Description': descriptions,
        'Days_to_Arrival': days_to_arrival, 'Cost_Delivery': cost_delivery,
        'Returns_Accepted': returns_accepted, 'Dispatched_From': dispatch_from,
        'Num_Images': count_images, 'Category': category}
# Create a dataframe from the dictionary
df = pd.DataFrame(data)
# Anonymize the shop names
df['Shop_Name'] = df['Shop_Name'].astype('category').cat.codes
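# .cat.codes replaces each distinct shop name with a stable integer code,
# e.g. ['ShopA', 'ShopB', 'ShopA'] -> [0, 1, 0], so shops remain
# distinguishable without exposing their names.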
# Save the dataframe to a new CSV
df.to_csv('raw_data.csv')
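# The saved file can be reloaded for analysis with, e.g.:
#     df = pd.read_csv('raw_data.csv', index_col=0)
# (index_col=0 because to_csv above also writes the dataframe index as the
# first column)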