-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtaojindi.py
171 lines (154 loc) · 5.74 KB
/
taojindi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
from utils.make_sessions import create_session
import time
from utils.models import TaoJin
from utils.sqlbackends import session_scope
import traceback
from functools import wraps
import math
from urllib.parse import urlparse
def second_run(func=None, *, max_attempts=5, delay=2):
    """Decorator that retries a failing call and falls back to ``{}``.

    The wrapped callable is invoked up to ``max_attempts`` times.  Every
    exception derived from ``Exception`` is caught, its traceback is printed,
    and the call is repeated after ``delay`` seconds.  If every attempt
    fails, an empty dict is returned instead of raising.

    Can be applied bare (``@second_run``) or with keyword options
    (``@second_run(max_attempts=3, delay=0.5)``).

    Args:
        func: The callable to wrap; supplied implicitly when the decorator
            is used without parentheses.
        max_attempts: Total number of times the callable may be invoked.
        delay: Seconds to sleep between two consecutive attempts.

    Returns:
        The wrapped callable, or a decorator when ``func`` is omitted.
    """

    def wrap(target):
        @wraps(target)
        def decorate(*args, **kwargs):
            # The attempt counter is local to this call, so nested or
            # concurrent calls of the same decorated function cannot
            # corrupt each other's retry budget.
            for attempt in range(max_attempts):
                try:
                    return target(*args, **kwargs)
                except Exception:
                    traceback.print_exc()
                    if attempt + 1 >= max_attempts:
                        # Out of attempts: no point sleeping before giving up.
                        break
                    time.sleep(delay)
                    print("run again {} {}".format(attempt, args))
            return {}

        return decorate

    if func is None:
        return wrap
    return wrap(func)
class TaoJinDi(object):
    """Crawler for the company directory served at ``url_home``.

    The crawl walks home page -> province pages -> category pages ->
    paginated company lists -> company detail pages, and stores one
    ``TaoJin`` row per company URL that is not already in the database.

    Attributes are documented inline in ``__init__``.
    """

    url_home = "http://hy.taojindi.com"

    # (result key, label) pairs looked up in the text lines of a detail page.
    # The value stored is whatever follows the label on that line.
    _DETAIL_LABELS = (
        ("contact", "联系人:"),
        ("enterpriseType", "公司类型:"),
        ("location", "所属省市:"),
        ("industry", "主营行业:"),
        ("registeredFunds", "注册资金:"),
        ("representative", "企业法人:"),
        ("establishedTime", "成立日期:"),
        ("address", "地址:"),
        ("products", "主营产品:"),
    )
    # Labels that mark a detail-page line as carrying a phone number.
    _PHONE_LABELS = ("移动电话:", "电话:")

    def __init__(self):
        # HTTP session used for every request (built by the project helper).
        self.session = create_session()
        # Resume marker: list pages are skipped until a page URL containing
        # this substring is reached.  The empty default matches every URL,
        # so nothing is skipped.
        self.jump = ""
        # Becomes True once the resume marker has been seen.
        self.status = False

    def _province(self):
        """Fetch the home page and crawl every province link found on it."""
        r = self.session.get(self.url_home)
        soup = BeautifulSoup(r.text, "lxml")
        div = soup.find("div", class_="info-list info-list2")
        # href=True skips anchors without a target instead of failing on
        # ``str + None`` and aborting the whole crawl.
        for a in div.find_all("a", href=True):
            # The last four characters of the link text are dropped to get
            # the province name (presumably a fixed suffix -- confirm).
            self._sec_cate(self.url_home + a.get("href"), a.text[:-4])

    def _sec_cate(self, url, province):
        """Crawl a province page and each category linked from it.

        Args:
            url: Absolute URL of the province page.
            province: Province name, used to label the province page's own
                company list.
        """
        time.sleep(0.2)
        r = self.session.get(url)
        soup = BeautifulSoup(r.text, "lxml")
        div = soup.find("div", class_="info-list info-list5")
        # The province page itself is crawled first, as its own category.
        targets = [(url, province + "最新企业推荐")]
        targets.extend(
            (self.url_home + a.get("href"), a.text)
            for a in div.find_all("a", href=True)
        )
        for target_url, category in targets:
            self._tatal_pages(target_url, category)

    @second_run
    def _tatal_pages(self, url, category):
        """Read the result count of a category and crawl each list page.

        Args:
            url: Absolute URL of the category page.
            category: Category label stored with every company found.
        """
        print("total {}".format(url))
        r = self.session.get(url)
        soup = BeautifulSoup(r.text, "lxml")
        paging = soup.find("div", class_="paging")
        span = paging.find("span", class_="total orange ml5 mr5")
        total = int(span.text)
        if not total:
            return
        # Page count is derived assuming 10 companies per list page.
        total_pages = math.ceil(total / 10)
        # Page URLs are "<url without trailing slash>_<n>/"; only strip a
        # slash that is really there so no other character is lost.
        base = url[:-1] if url.endswith("/") else url
        for page in range(1, total_pages + 1):
            d_u = "{}_{}/".format(base, page)
            if self.jump in d_u:
                self.status = True
            if not self.status:
                # Still before the resume marker: skip this page.
                print("tiaoguo {}".format(d_u))
                continue
            self._plist(d_u, category)

    @second_run
    def _plist(self, url, category):
        """Parse one company list page and store companies not yet saved.

        Args:
            url: Absolute URL of the list page.
            category: Category label stored with every company found.
        """
        print("list {}".format(url))
        time.sleep(0.2)
        r = self.session.get(url)
        soup = BeautifulSoup(r.text, "lxml")
        listing = soup.find("div", class_="company-info")
        for li in listing.find_all("li"):
            link = li.find("a")
            res = {
                "category": category,
                "phone": li.find("div", class_="tel").text,
                "enterpriseName": link.text,
                "url": self.url_home + link.get("href"),
                "about": li.find("div", class_="info").text,
            }
            for item in li.find("div", class_="address").text.split():
                # Take the text after the label wherever it sits in the
                # token, not only when the label is a prefix.
                if "地址:" in item:
                    res["address"] = item.partition("地址:")[2]
                elif "主营产品:" in item:
                    res["products"] = item.partition("主营产品:")[2]
            with session_scope() as sess:
                known = sess.query(TaoJin).filter(TaoJin.url == res["url"]).first()
                if not known:
                    # Detail-page values take precedence over listing values.
                    res.update(self._detail(res["url"]))
                    sess.add(TaoJin(**res))

    @second_run
    def _detail(self, url):
        """Fetch a company detail page and extract its labelled fields.

        Args:
            url: Absolute URL of the company detail page.

        Returns:
            dict: Fields found on the page, keyed as in ``_DETAIL_LABELS``,
            plus ``"phone"`` when at least one phone line was found.
        """
        time.sleep(0.2)
        print("detail {}".format(url))
        r = self.session.get(url)
        soup = BeautifulSoup(r.text, "lxml")
        basic = soup.find("div", class_="company-basic clearfix")
        lines = [li.text for li in basic.find_all("li")]
        intro = soup.find("div", class_="company-intro p20")
        for tr in intro.find("table").find_all("tr"):
            # Collapse each table row into a single space-separated line.
            cells = tr.text.strip().split("\n")
            lines.append(" ".join(cell.strip() for cell in cells))
        return self._parse_detail_lines(lines)

    @classmethod
    def _parse_detail_lines(cls, lines):
        """Turn the text lines of a detail page into a field dict.

        Args:
            lines: Iterable of text lines taken from the detail page.

        Returns:
            dict: For every label of ``_DETAIL_LABELS`` present in a line,
            the whitespace-normalised text following it.  ``"phone"`` holds
            the distinct lines containing a phone label, joined by a space;
            it is omitted when no such line exists so that a phone number
            already known to the caller is not replaced by an empty string.
        """
        res = {}
        phone_lines = {}  # dict used as an insertion-ordered set
        for line in lines:
            for field, label in cls._DETAIL_LABELS:
                if label in line:
                    res[field] = " ".join(line.partition(label)[2].split())
            if any(label in line for label in cls._PHONE_LABELS):
                phone_lines[line] = True
        if phone_lines:
            res["phone"] = " ".join(phone_lines)
        return res
if __name__ == "__main__":
    # Script entry point: build a crawler and walk the whole directory,
    # starting from the province links on the home page.
    crawler = TaoJinDi()
    crawler._province()