from urllib import request
from bs4 import BeautifulSoup
import json
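
# Crawls a paginated shop listing, opens each shop's "salerinfo" page,
# extracts profile fields with BeautifulSoup, and appends one MySQL
# INSERT statement per shop to a .sql file.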
# Placeholder seed data; shequName is only referenced by the
# commented-out dedup block inside okpa() below.
shequList = [{
    "name": "重庆社区",
    "img": ["", "", ""]
}]
shequName = ["重庆社区", "南京社区"]
p = 0            # running count of scraped shops
djpage = 0       # current listing page number
fileName = "data.sql"

def glyh(cs):
    # Escape single and double quotes so the value can be embedded
    # in the quoted SQL literals built in okpa().
    cs = cs.replace("'", "\\'")
    cs = cs.replace('"', '\\"')
    return cs

def okpa(http):
    # Scrape one shop's "salerinfo" page and append an INSERT statement for it.
    response = request.urlopen(http)
    html = response.read()
    global p
    p = p + 1
    print("Page " + str(int(djpage)) + ", item " + str(int(p)))
    print(http)
    bf = BeautifulSoup(html, "lxml")
    # Some shop pages embed the real content in an iframe.
    if bf.find_all("iframe"):
        src = bf.find_all("iframe")[0].attrs['src']
    else:
        src = http
    src = src.replace("https:", "")
    src = src.replace("salerinfo.html", "")
    #print(bf.find_all("div", class_="fix-im-cate")[0].get_text())
    response2 = request.urlopen("https:" + src)
    html2 = response2.read()
    bf2 = BeautifulSoup(html2, "lxml")
    if bf2.find_all("img", id="head-img"):
        img = bf2.find_all("img", id="head-img")[0].attrs['src']  # avatar URL
    elif bf2.find_all("div", class_="w-head-pic"):
        img = bf2.find_all("div", class_="w-head-pic")[0].find('img').attrs['src']  # avatar URL
    else:
        img = "无"  # "无" = none/missing
    if bf2.find_all("h1", class_="title"):
        title = bf2.find_all("h1", class_="title")[0].get_text()  # shop name
    elif bf2.find_all("div", class_="w-head-pic"):
        title = bf2.find_all("div", class_="w-head-pic")[0].find('img').attrs['alt']  # shop name
    else:
        title = "无"
    if len(bf2.find_all("div", class_="no-about")) > 0:
        jianjie = "暂无店铺介绍"  # "no shop introduction yet"
    elif bf2.find_all("pre", class_="content-item morestatus content-item-info1"):
        jianjie = bf2.find_all("pre", class_="content-item morestatus content-item-info1")[0].get_text()  # company profile
    elif bf2.find_all("p", class_="introduce-company-msg"):
        jianjie = bf2.find_all("p", class_="introduce-company-msg")[0].get_text()
    else:
        jianjie = "无"
    if bf2.find_all("div", class_="info-content"):
        gongsi = bf2.find_all("div", class_="info-content")[0].get_text()  # company name
        address = bf2.find_all("div", class_="info-content")[3].get_text()  # company address
    else:
        gongsi = "无"
        address = "无"
    fuwuarray = []
    if bf2.find_all("div", class_="category-item"):
        fuwu = bf2.find_all("div", class_="category-item")  # services offered
        for o in fuwu:
            fuwuarray.append(o.get_text())  # collect as a list
    else:
        fuwuarray.append("无")
    if bf2.find_all("img", class_="certificate-img lazy"):
        yingye = bf2.find_all("img", class_="certificate-img lazy")[0].attrs['data-original']  # business license image
    else:
        yingye = "无"
    shequAddress = ""
    sname = ""
    sphone = ""
    if bf2.find_all("a", class_="zworks-item"):
        shequlen = len(bf2.find_all("a", class_="zworks-item"))  # only used by the commented-out block below
        shequ = bf2.find_all("a", class_="zworks-item")[0]
        sqName = shequ.get_text()  # community name
        sqsrc = shequ.attrs['href']  # community page URL
        response3 = request.urlopen(sqsrc)
        html3 = response3.read()
        bf3 = BeautifulSoup(html3, "lxml")
        gwimg = []
        gwimgList = bf3.find_all("img", style="width:100%;height: 100%;")
        shequList = bf3.find_all("div", class_="info-form-head-right")
        bsq = []  # community contacts (name/phone pairs)
        for d in shequList:
            sname = d.find("div", class_="head-title").get_text()
            sphone = d.find("div", class_="head-title-phone").get_text()
            a = {}
            a['name'] = sname
            a['phone'] = sphone
            bsq.append(a)
        if bf3.find_all("p", class_="zwork-positon"):
            shequAddress = bf3.find_all("p", class_="zwork-positon")[0].get_text()  # community address
        else:
            shequAddress = "无"
        for i in gwimgList:
            gwimg.append(i.attrs['src'])
    else:
        sqsrc = "无"
        sqName = "无"
        gwimg = []
        bsq = []
    '''
    for i in range(shequlen):
        shequ = bf2.find_all("a", class_="zworks-item")[i]
        sqName = shequ.get_text()
        if sqName in shequName:
            dqshequ.append(sqName)
        else:
            dqshequ.append(sqName)
            shequName.append(sqName)
    '''
    # Escape everything before splicing it into the SQL string.
    img = glyh(img)
    bsq = json.dumps(bsq)
    title = glyh(title)
    jianjie = glyh(jianjie)
    gongsi = glyh(gongsi)
    address = glyh(address)
    yingye = glyh(yingye)
    shequAddress = glyh(shequAddress)
    sqName = glyh(sqName)
    sqsrc = glyh(sqsrc)
    sql = ('INSERT INTO `gongsi`.`gsinfo` SET `title`="' + title + '",`jianjie`="' + jianjie
           + '",`gsname`="' + gongsi + '",`address`="' + address + '",`yingye`=\'' + yingye
           + '\',`fuwu`=\'' + json.dumps(fuwuarray) + '\',`url`=\'' + glyh(http)  # this shop's page URL
           + '\',`logo`="' + img + '",`shequ`="' + sqName + '",`shequUser`=\'' + bsq
           + '\',`shequAddress`="' + shequAddress + '",`shequimg`=\'' + json.dumps(gwimg) + '\';')
    with open(fileName, 'a', encoding='utf-8') as f:
        f.write(sql + "\n")  # one statement per line
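
# Each okpa() call appends one statement of roughly this shape (hypothetical values):
# INSERT INTO `gongsi`.`gsinfo` SET `title`="Example Shop",`jianjie`="...",
#   `yingye`='//img.example.com/license.jpg',`fuwu`='["..."]',
#   `shequUser`='[{"name": "...", "phone": "..."}]', ...;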

def palist(http):
    # Scrape every shop linked from one listing page, then follow the pagination.
    response = request.urlopen(http)
    html = response.read()
    print("Starting " + http)
    bf = BeautifulSoup(html, "lxml")
    global djpage
    listindex = bf.find_all("div", class_="pagination")[0].find_all("li", class_="active")[0].find('a').get_text()
    djpage = listindex
    links = bf.find_all("a", class_="name")
    for i in links:
        href = i.attrs['href'].replace("?fr=djwy", "")
        newurl = "https:" + href + "salerinfo.html"
        okpa(newurl)
    # Recurse into the next listing page.
    nexturl = paindex(http)
    print(nexturl)
    palist(nexturl)

def paindex(http):
    # Return the absolute URL of the page after the currently active one.
    response = request.urlopen(http)
    html = response.read()
    bf = BeautifulSoup(html, "lxml")
    pagination = bf.find_all("div", class_="pagination")[0]
    listindex = len(pagination.find_all("li"))
    w = 0
    for d in range(listindex):
        w = w + 1
        s = str(pagination.find_all("li")[d])
        if "active" in s:
            break
    pages = pagination.find_all("li")
    # The <li> right after the active one; on the last page this raises
    # IndexError, which ends the crawl.
    page = pages[w].find("a").attrs['href']
    return "https://" + http.split("/")[2] + page

if __name__ == "__main__":
    url = input("Enter the listing URL to crawl: ")
    fileName = input("Enter the output file name (e.g. data.sql): ")
    palist(url)
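
# Example session (hypothetical URLs and output):
#   Enter the listing URL to crawl: https://www.example.com/shoplist/
#   Enter the output file name (e.g. data.sql): data.sql
#   Starting https://www.example.com/shoplist/
#   Page 1, item 1
#   https://www.example.com/shop/123/salerinfo.html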