webcrawler_g2.py
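"""
Web crawler for G2 Crowd product reviews (Python 2).

g2crowd(search_term) locates the first product whose name contains
`search_term` and crawls every page of its reviews, returning a list of
(review text, datetime) tuples together with the total review count.
"""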
import urllib2

from bs4 import BeautifulSoup

def g2crowd(search_term):
    """
    G2 Crowd Webcrawler
    parameters: product name to search for on G2 Crowd, in string form
    returns: a list of (review text, datetime) tuples and the total review count
    """
link, n = search(search_term)
if n > 0:
# TODO: handle no reviews
# n = 30 # number of pages of reviews to crawl
review_count = 0
reviews = []
        # Ordered (old, new) replacement pairs. A plain dict would apply
        # these in arbitrary order under Python 2, which changes the result
        # (e.g. whether '1.' fires before '10.').
        replace_chars = [('\n', ''),
                         # strip numbered-list prefixes, longest first so
                         # '1.' does not mangle '10.'
                         ('10.', ''), ('9.', ''), ('8.', ''), ('7.', ''),
                         ('6.', ''), ('5.', ''), ('4.', ''), ('3.', ''),
                         ('2.', ''), ('1.', ''),
                         ('[', ''),
                         (']', ''),    # the original dict listed '[' twice; ']' was almost certainly intended
                         (':', ''),
                         ('/', ' and '),
                         (' .', '.'),  # drop a stray space before a period...
                         ('.', '. '),  # ...then ensure a space after each period
                         ('  ', ' ')]  # collapse doubled spaces (the original single-space key would have deleted every space)
# ---------------------------- PRODUCT 1 ---------------------------- #
print("Product: " + search_term)
for i in range(1, n+1):
            site = "https://www.g2crowd.com" + link + "?page=" + str(i)
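            # Send a browser-like User-Agent; many sites reject urllib2's default agent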
hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
}
req = urllib2.Request(site, headers=hdr)
try:
page = urllib2.urlopen(req)
content = page.read()
soup = BeautifulSoup(content, 'html.parser')
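                # reviews are marked up with schema.org microdata, so itemprop='review' selects each review container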
for review in soup.find_all(itemprop='review'):
review_count += 1
datetime = review.time['datetime']
for row in review.find_all('p', attrs={"class" : "formatted-text"}):
text = row.text.strip().encode('ascii', 'ignore') # save and encode text
                        for old, new in replace_chars:  # clean text by applying replacements in order
                            text = text.replace(old, new)
if not text.endswith('.'): # add periods if necessary
text = text + '.'
reviews.append((text, datetime)) # append review and datetime
            except urllib2.HTTPError:
                print('No reviews exist for %s' % search_term)
                return [], 0  # return empty rather than quit(), which would exit the host interpreter
else:
reviews = []
review_count = 0
print("Number of Reviews: " + str(review_count))
return reviews, review_count


def search(crawl_term):
    """
    Searches G2 Crowd for `crawl_term` and, for the first product whose name
    contains it, returns the relative link to its review pages and the number
    of the last review page; returns ("", 0) if nothing matches
    """
    crawl_link = crawl_term.replace(' ', '+')
    site = 'https://www.g2crowd.com/search/products?max=10&query=' + crawl_link
hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
}
req = urllib2.Request(site, headers=hdr)
    try:
        page = urllib2.urlopen(req)
    except urllib2.HTTPError as e:
        print(e)
        return "", 0  # cannot continue without a response
    content = page.read()
soup = BeautifulSoup(content, 'html.parser')
results = soup.find_all('div', {'class':"slat-right"})
    review_link = ""
    last = 0
    if results:
        for result in results:
            product = result.a.text
            # If the search term is in the product name we have a match
            if crawl_term.lower() in product.lower():
                # Find the review page start link
                review_link = result.a['href']
                # A matching product has at least one page of reviews, even
                # when no 'Last' pagination link is rendered
                last = 1
                # Open the review page and find the 'Last' pagination link
                site = 'https://www.g2crowd.com' + review_link
                req = urllib2.Request(site, headers=hdr)  # reuse the headers built above
                try:
                    page = urllib2.urlopen(req)
                except urllib2.HTTPError as e:
                    print(e)
                    return review_link, 0
                content = page.read()
                soup = BeautifulSoup(content, 'html.parser')
                links = soup.find_all('a', {"class": "pjax"})
                for l in links:
                    if 'Last' in l.text:
                        query = l['href'].split('/')[-1].split('?')[-1]
                        last = [int(part.replace('page=', ''))
                                for part in query.split('&') if 'page=' in part][0]
                        break  # the original reset last to 0 on every non-matching link
                break  # stop at the first matching product
    # If the search returns nothing, review_link stays "" and last stays 0
    return review_link, last
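

if __name__ == '__main__':
    # Minimal usage sketch, not part of the original script: 'Slack' is only
    # an illustrative search term; substitute any product listed on G2 Crowd.
    product_reviews, count = g2crowd('Slack')
    for review_text, review_date in product_reviews[:3]:
        print(review_date + ': ' + review_text)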