This repository has been archived by the owner on May 24, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
/
tripadvisor.py
327 lines (284 loc) · 14 KB
/
tripadvisor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
# Function to scrape hotel listing
import re
import time
import dateparser
import requests
from selenium.common.exceptions import NoSuchElementException
from tqdm import tqdm
def scrape_hotel_listing(page_url, driver):
    """Scrape a TripAdvisor hotel listing and every page of its reviews.

    Parameters
    ----------
    page_url : str
        URL of the hotel listing page to open.
    driver : selenium WebDriver
        An already-initialised driver. NOTE(review): the driver is quit
        before returning, so it cannot be reused by the caller.

    Returns
    -------
    dict
        {'listing': [<one dict of listing details>],
         'reviews': [<one dict per scraped review>]}
    """
    # Open the listing page in the browser controlled by `driver`
    driver.get(page_url)

    def check_exists_by_xpath(xpath):
        # Helper: True when at least one element matches `xpath`.
        try:
            driver.find_element_by_xpath(xpath)
        except NoSuchElementException:
            return False
        return True

    data = {}
    data['listing'] = []
    data['reviews'] = []
    # Name of the listing
    name = driver.find_element_by_xpath("//h1[@id='HEADING']").text
    # Total number of reviews; commas stripped so the regex captures the full figure
    number_of_reviews = re.findall(r"\d+",
        driver.find_element_by_xpath("//span[@class='_33O9dg0j']").text.replace(',', ''))[0]
    # Average review score; the '.' is stripped, so e.g. "4.5" is captured as "45"
    average_review = re.findall(r"\d+",
        driver.find_element_by_xpath("//span[@class='_3cjYfwwQ']").text.replace('.', ''))[0]
    # Address is optional on a listing
    if check_exists_by_xpath("//div[@class='_1sPw_t0w _3sCS_WGO']//span[@class='_3ErVArsu jke2_wbp']"):
        address = driver.find_element_by_xpath(
            "//div[@class='_1sPw_t0w _3sCS_WGO']//span[@class='_3ErVArsu jke2_wbp']").text
    else:
        address = None
    # Telephone number is optional on a listing
    if check_exists_by_xpath("//span[@class='_1_CU9UZ6 _3ErVArsu jke2_wbp']"):
        telephone = driver.find_element_by_xpath("//span[@class='_1_CU9UZ6 _3ErVArsu jke2_wbp']").text
    else:
        telephone = None
    # Website is optional; TripAdvisor links via a redirect URL, so follow it
    # with requests to resolve the real hotel website address
    if check_exists_by_xpath("//span[contains(text(),'Visit hotel website')]"):
        redirect_url = driver.find_element_by_xpath(
            "//div[@class='_1sPw_t0w _3sCS_WGO']//a[@class='_2wKz--mA _2U17tZ2G _1YIGmrPj _2-kzHHGX']"
        ).get_attribute("href")
        website = requests.get(redirect_url).url
    else:
        website = None
    # Record the listing-level details as a single JSON-style object
    data['listing'].append({
        'name': name,
        'number_of_reviews': number_of_reviews,
        'average_review': average_review,
        'telephone': telephone,
        'address': address,
        'website': website,
    })
    # Total number of review pages (taken from the pagination controls)
    total_pages = driver.find_element_by_xpath("//a[6]")
    total_pages_numbers = int(total_pages.text)
    # Walk every review page
    for z in tqdm(range(total_pages_numbers)):
        # Collect the review ids present on this page
        ids = driver.find_elements_by_xpath("//div[@class='oETBfkHU']")
        review_ids = [i.get_attribute('data-reviewid') for i in ids]
        # Scrape each review on the page by its id
        for x in review_ids:
            # User id is missing on some (mainly old) reviews
            if check_exists_by_xpath('//div[@data-reviewid="' + x + '"]/..//a[@class="ui_header_link _1r_My98y"]'):
                userid = driver.find_element_by_xpath(
                    '//div[@data-reviewid="' + x + '"]/..//a[@class="ui_header_link _1r_My98y"]').text
            else:
                userid = None
            # Review date: element text ends with "review <Month Year>"; keep
            # the part after "review " and normalise it to "%B %Y"
            review_date_element = driver.find_element_by_xpath(
                '//div[@data-reviewid="' + x + '"]/..//div[@class="_2fxQ4TOx"]')
            review_date_cleansed = review_date_element.text.rsplit('review ')[1]
            review_date = dateparser.parse(review_date_cleansed).strftime('%B %Y')
            # Date of stay (may be present but empty)
            date_of_stay_element = driver.find_element_by_xpath(
                '//div[@data-reviewid="' + x + '"]//span[@class="_34Xs-BQm"]')
            if date_of_stay_element.text:
                date_of_stay = date_of_stay_element.text.replace("Date of stay: ", "")
            else:
                # BUG FIX: original assigned `date_of_visit = None` here,
                # leaving `date_of_stay` undefined (NameError) or stale when
                # the element had no text.
                date_of_stay = None
            # Review title
            review_title = driver.find_element_by_xpath(
                '//div[@data-reviewid="' + x + '"]//a[@class="ocfR3SKN"]').text
            # Expand a truncated review ("Read more") before reading the body
            if check_exists_by_xpath('//div[@data-reviewid="' + x + '"]//div[@class="XUVJZtom"]'):
                time.sleep(1)
                if driver.find_element_by_xpath(
                        '//div[@data-reviewid="' + x + '"]//div[@class="XUVJZtom"]').text == "Read more":
                    driver.find_element_by_xpath(
                        '//div[@data-reviewid="' + x + '"]//div[@class="XUVJZtom"]').click()
                    # Allow time for the review to expand
                    time.sleep(1)
            # Review body text
            review_body = driver.find_element_by_xpath(
                '//div[@data-reviewid="' + x + '"]//q[@class="IRsGHoPm"]').text
            # Rating is encoded at the end of the class name, e.g. "bubble_40" -> "40"
            review_rating_element = driver.find_element_by_xpath(
                '//div[@data-reviewid="' + x + '"]//span[starts-with(@class, "ui_bubble_rating bubble_")]')
            review_rating = re.findall(r"\d+", review_rating_element.get_attribute("class"))[0]
            # Append the extracted review
            data['reviews'].append({
                'userid': userid,
                'review_date': str(review_date),
                'date_of_stay': date_of_stay,
                'review_title': review_title,
                'review_body': review_body,
                'review_rating': review_rating
            })
        # Click Next while pages remain; counter reaches 1 on the last page
        if total_pages_numbers > 1:
            driver.find_element_by_xpath("//a[contains(text(),'Next')]").click()
            total_pages_numbers = total_pages_numbers - 1
            # Allow the next page to load
            time.sleep(1)
    driver.quit()
    return data
# Function to scrape restaurant listing
def scrape_restaurant_listing(page_url, driver):
    """Scrape a TripAdvisor restaurant listing and every page of its reviews.

    Parameters
    ----------
    page_url : str
        URL of the restaurant listing page to open.
    driver : selenium WebDriver
        An already-initialised driver. NOTE(review): the driver is quit
        before returning, so it cannot be reused by the caller.

    Returns
    -------
    dict
        {'listing': [<one dict of listing details>],
         'reviews': [<one dict per scraped review>]}
    """
    # Open the listing page in the browser controlled by `driver`
    driver.get(page_url)

    def check_exists_by_xpath(xpath):
        # Helper: True when at least one element matches `xpath`.
        try:
            driver.find_element_by_xpath(xpath)
        except NoSuchElementException:
            return False
        return True

    data = {}
    data['listing'] = []
    data['reviews'] = []
    # Name of the listing
    name = driver.find_element_by_xpath("//h1[contains(@class,'_3a1XQ88S')]").text
    # Total number of reviews; commas stripped so the regex captures the full figure
    number_of_reviews = re.findall(r"\d+",
        driver.find_element_by_xpath("//span[@class='_3Wub8auF']").text.replace(',', ''))[0]
    # Average review score; the '.' is stripped, so e.g. "4.5" is captured as "45"
    average_review = re.findall(r"\d+",
        driver.find_element_by_xpath("//span[@class='r2Cf69qf']").text.replace('.', ''))[0]
    # Address is optional on a listing (shown as the map-view link text)
    if check_exists_by_xpath("//a[@href='#MAPVIEW']"):
        address = driver.find_element_by_xpath("//a[@href='#MAPVIEW']").text
    else:
        address = None
    # Telephone number is optional on a listing
    if check_exists_by_xpath("//span[@class='_15QfMZ2L']"):
        telephone = driver.find_element_by_xpath("//span[@class='_15QfMZ2L']").text
    else:
        telephone = None
    # Website URL is optional on a listing
    if check_exists_by_xpath("//a[contains(text(),'Website')]"):
        website = driver.find_element_by_xpath("//a[contains(text(),'Website')]").get_attribute("href")
    else:
        website = None
    # Record the listing-level details as a single JSON-style object
    data['listing'].append({
        'name': name,
        'number_of_reviews': number_of_reviews,
        'average_review': average_review,
        'address': address,
        'telephone': telephone,
        'website': website,
    })
    # Total number of review pages (taken from the pagination controls)
    total_pages = driver.find_element_by_xpath("//*[starts-with(@class,'pageNum last')]")
    total_pages_numbers = int(total_pages.text)
    # Walk every review page
    for z in tqdm(range(total_pages_numbers)):
        # Collect the review ids present on this page
        ids = driver.find_elements_by_xpath("//*[starts-with(@id,'review_')]")
        review_ids = [i.get_attribute('id') for i in ids]
        # Scrape each review on the page by its id
        for x in review_ids:
            # User id is missing on some (mainly old) reviews
            if check_exists_by_xpath('//*[@id="' + x + '"]//div[@class="info_text pointer_cursor"]/div[1]'):
                userid = driver.find_element_by_xpath(
                    '//*[@id="' + x + '"]//div[@class="info_text pointer_cursor"]/div[1]').text
            else:
                userid = None
            # Review date is carried in the title attribute of the rating date span
            review_date_element = driver.find_element_by_xpath(
                '//*[@id="' + x + '"]//span[@class="ratingDate"]')
            review_date = review_date_element.get_attribute("title")
            # Date of visit is optional per review
            if check_exists_by_xpath('//*[@id="' + x + '"]//div[@class="prw_rup prw_reviews_stay_date_hsx"]'):
                date_of_visit_element = driver.find_element_by_xpath(
                    '//*[@id="' + x + '"]//div[@class="prw_rup prw_reviews_stay_date_hsx"]')
                if date_of_visit_element.text:
                    date_of_visit = date_of_visit_element.text.replace("Date of visit: ", "")
                else:
                    date_of_visit = None
            else:
                # BUG FIX: original had no else branch, so `date_of_visit`
                # was undefined (NameError) or stale from a previous review
                # when the stay-date div was absent.
                date_of_visit = None
            # Review title: normal reviews use class "quote"; the newest
            # reviews use "quote isNew" (checked second so it wins if present)
            review_title = None  # BUG FIX: default so a missing title cannot raise NameError
            review_title_element = None
            if check_exists_by_xpath('//*[@id="' + x + '"]//div[@class="quote"]'):
                review_title_element = driver.find_element_by_xpath(
                    '//*[@id="' + x + '"]//div[@class="quote"]')
            if check_exists_by_xpath('//*[@id="' + x + '"]//div[@class="quote isNew"]'):
                review_title_element = driver.find_element_by_xpath(
                    '//*[@id="' + x + '"]//div[@class="quote isNew"]')
            if review_title_element is not None:
                review_title = review_title_element.text
            # Expand a truncated review ("More...") before reading the body
            if check_exists_by_xpath('//*[@id="' + x + '"]/div/div[2]/div[2]/div/p/span'):
                driver.find_element_by_xpath('//*[@id="' + x + '"]/div/div[2]/div[2]/div/p/span').click()
                # Allow time for the review to expand
                time.sleep(1)
            # Review body text
            review_body = driver.find_element_by_xpath(
                '//*[@id="' + x + '"]//div[@class="prw_rup prw_reviews_text_summary_hsx"]/div/p').text
            # Rating is encoded at the end of the class name, e.g. "bubble_40" -> "40"
            review_rating_element = driver.find_element_by_xpath(
                '//*[@id="' + x + '"]//span[starts-with(@class, "ui_bubble_rating bubble_")]')
            review_rating = re.findall(r"\d+", review_rating_element.get_attribute("class"))[0]
            # Append the extracted review
            data['reviews'].append({
                'userid': userid,
                'review_date': review_date,
                'date_of_visit': date_of_visit,
                'review_title': review_title,
                'review_body': review_body,
                'review_rating': review_rating
            })
        # Click Next while pages remain; counter reaches 1 on the last page
        if total_pages_numbers > 1:
            driver.find_element_by_xpath("//a[contains(text(),'Next')]").click()
            total_pages_numbers = total_pages_numbers - 1
            # Allow the next page to load
            time.sleep(1)
    driver.quit()
    return data