-
Notifications
You must be signed in to change notification settings - Fork 0
/
getweb.py
91 lines (72 loc) · 3.45 KB
/
getweb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# -*- coding: utf-8 -*-
"""
Created on Wed Nov 1 15:00:40 2017
@author: He Song
"""
from bs4 import BeautifulSoup
import time
import requests
def _parse_listing(div):
    """Pull the fields of interest out of one ``media-story`` result div.

    Returns a ``(review_count, href, name)`` tuple of strings, or ``None``
    when the div should be skipped: it has no business link, the link has no
    ``href`` or points at an ad (``ad_business_id`` in the href), it does not
    have exactly one category link, or the review-count / name span is
    missing.  ``review_count`` is the review-count span text with the word
    'reviews' and all spaces removed.
    """
    link = div.find('a', {'class': 'biz-name js-analytics-click'})
    if link is None:
        return None
    href = link.get('href')
    if href is None or 'ad_business_id' in href:
        return None
    categories = div.find('span', {'class': 'category-str-list'})
    if categories is None or len(categories.findAll('a')) != 1:
        return None
    count_span = div.find('span', {'class': 'review-count rating-qualifier'})
    name_span = link.find('span')
    if count_span is None or name_span is None:
        return None
    review_count = count_span.text.replace('reviews', '').replace(' ', '')
    return review_count, href, name_span.text


def reviews(url, type, name):
    """Scrape search-result pages and save well-reviewed businesses to a file.

    For each of up to 79 result pages, fetches
    ``url + '/search?find_loc=Los+Angeles,+CA&start=<p*10>&cflt=' + type``
    and writes one tab-separated row
    ``index<TAB>review count<TAB>href<TAB>name`` for every listing that is
    not an ad, has exactly one category link, has more than 200 reviews and
    whose name has not been written before.  Stops once rows with indices
    0 through 50 have been written, or when the pages are exhausted.

    Parameters:
        url:  site root, e.g. 'https://www.yelp.com'.
        type: category filter appended to the ``cflt`` query parameter.
        name: path of the output file; it is truncated on every call.

    Returns:
        None.  Progress is printed to stdout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
    }
    count = 0
    seen_names = set()
    # The context manager closes the file on every exit path, including the
    # normal end of the page loop and unexpected errors.
    with open(name, 'w', encoding='utf-8') as fw:
        # NOTE(review): the first request uses start=10, so start=0 is never
        # fetched -- confirm that skipping it is intended.
        for p in range(1, 80):
            pageLink = url + '/search?find_loc=Los+Angeles,+CA&start=' + str(p * 10) + '&cflt=' + type
            # Reset per page so a failed download can neither raise NameError
            # nor cause the previous page's HTML to be parsed again.
            html = None
            for i in range(5):  # try 5 times
                try:
                    # A timeout keeps one stalled connection from hanging the run.
                    response = requests.get(pageLink, headers=headers, timeout=30)
                except requests.RequestException:
                    print('failed attempt', i)
                    time.sleep(2)  # wait 2 secs before retrying
                else:
                    html = response.content
                    break
            if not html:
                continue  # couldn't get the page, ignore it
            # Non-ASCII bytes are dropped before parsing (kept from the original).
            soup = BeautifulSoup(html.decode('ascii', 'ignore'), 'lxml')
            for div in soup.findAll('div', {'class': 'media-story'}):
                listing = _parse_listing(div)
                if listing is None:
                    continue
                review_count, href, biz_name = listing
                try:
                    popular = float(review_count) > 200
                except ValueError:
                    continue  # review count is not a plain number
                if not popular:
                    continue
                print(review_count)
                print(href)
                print(biz_name)
                if biz_name in seen_names:
                    continue
                fw.write(str(count) + '\t' + review_count + '\t' + href + '\t' + biz_name + '\n')
                seen_names.add(biz_name)
                count += 1
                print('\n')
                if count > 50:
                    # Rows 0..50 are written; nothing more can be added.
                    return
import os.path


def run(outputfile, type=None, override=False):
    """Scrape one category into ``outputfile`` unless the file already exists.

    Parameters:
        outputfile: path of the file to write.
        type:       category passed to ``reviews``.  When None it is taken
                    from the output file's base name, up to the first '.'
                    (``'chinese.txt'`` -> ``'chinese'``).
        override:   when true, scrape even if ``outputfile`` already exists.

    Returns:
        None.
    """
    if type is None:
        # Derive the category from the file name; calling .split on the None
        # argument itself always raised AttributeError.
        type = os.path.basename(outputfile).split('.')[0]
    if override or not os.path.isfile(outputfile):
        reviews('https://www.yelp.com', type, outputfile)


if __name__ == '__main__':
    run('chinese')
    run('italian')
    run('mexican')