review_crawler.py
import json
import sys

import pandas as pd
import scrapy
from scrapy.crawler import CrawlerProcess


class TestSpider(scrapy.Spider):
    name = 'test'

    # Reviews collected across all paginated requests.
    all_reviews = list()

    def __init__(self, url, short_url):
        super().__init__()
        self.url = url
        self.short_url = short_url
        self.start_urls = [self.url]

    def parse(self, response):
        # The review cards sit inside the main section of the listing page.
        section = response.xpath("/html/body/div[1]/div/div/div/main/div/div[4]/section")
        for div in section.css('div[class="styles_cardWrapper__LcCPA styles_show__HUXRb styles_reviewCard__9HxJJ"]'):
            name = div.xpath('article/aside/div/a/span/text()').extract()[0]
            no_reviews = div.xpath('article/aside/div/a/div/span/text()').extract()[0]
            location = div.xpath('article/aside/div/a/div/div/span/text()').extract()[0]

            # The star rating is encoded in the image alt text, e.g. "Rated 4 out of 5 stars".
            stars = div.xpath('article/section/div/div/img/@alt').extract()[0]
            stars = stars.split('out')[0].split('Rated')[1]

            date_of_review = div.xpath('article/section/div/div[2]/time/text()').extract()
            if len(date_of_review) > 0:
                date_of_review = date_of_review[0]

            header = div.xpath('article/section/div[2]/a/h2/text()').extract()[0]
            body = div.xpath('article/section/div[2]/p/text()').extract()[0]

            date_of_experience = div.xpath('article/section/div[2]/p[2]/text()').extract()
            if len(date_of_experience) > 1:
                date_of_experience = date_of_experience[1]

            # Only the review body is kept for now; the other fields stay commented out.
            review = {
                # "name": name,
                # "number of reviews": no_reviews,
                # "location": location,
                # "stars": stars,
                # "date of review": date_of_review,
                # "header": header,
                "body": body
                # "date of experience": date_of_experience
            }
            self.all_reviews.append(review)

        # Follow the "next page" link in the pagination bar, if present.
        next_page = section.xpath('div[contains(@class, "styles_pagination__6VmQv")]/nav/a[5]/@href').extract()
        if next_page:
            full_next_page_url = f"{self.short_url}{next_page[0]}"
            print(full_next_page_url)
            yield scrapy.Request(
                url=full_next_page_url,
                callback=self.parse
            )
        else:
            # Last page reached: dump the collected reviews to JSON, then convert to CSV.
            with open('reviews.json', 'w') as f:
                json.dump(obj=self.all_reviews, fp=f)
                # json.dump(obj={"reviews": self.all_reviews}, fp=f)
            df = pd.read_json('reviews.json')
            df.to_csv('reviews1.csv', index=False)
            print("That is all for today")


if __name__ == "__main__":
    url = sys.argv[1]
    short_url = url.split('/review')[0]
    process = CrawlerProcess()
    process.crawl(TestSpider, url, short_url)
    process.start()
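
# Usage sketch (assumption: the target is a Trustpilot-style review listing whose
# URL contains "/review", e.g. https://www.trustpilot.com/review/example.com):
#
#   python review_crawler.py "https://www.trustpilot.com/review/example.com"
#
# The spider follows the pagination links page by page and, after the last page,
# writes the collected review bodies to reviews.json and reviews1.csv in the
# current working directory.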