-
Notifications
You must be signed in to change notification settings - Fork 0
/
CrealerCar.py
93 lines (80 loc) · 3.38 KB
/
CrealerCar.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import requests
from bs4 import BeautifulSoup
import re
from multiprocessing import Pool
url = 'https://www.autohome.com.cn/grade/carhtml/{}.html'
def year(endurl, ent3, ent1):
endresu = []
res = requests.get(endurl)
soup = BeautifulSoup(res.text, 'html.parser')
f = open('CarInfo.txt', "a+", encoding='utf-8')
for ant0 in soup.select('#divSeries'):
for ant1 in ant0.select('.interval01'):
for ant2 in ant1.select('.interval01-list'):
for ant3 in ant2.select('li'):
resu2 = {}
resu2['车系'] = ent3.select('h4 a')[0].text.replace("·", "")
resu2['品牌'] = ent1.select('dt div a')[0].text.replace("·", "")
resu2['年款'] = ant3.select('.interval01-list-cars p a')[0].text.replace("·", "")
if ant2.select('.interval01-list-guidance div'):
resu2['价格'] = ant3.select('.interval01-list-guidance div')[0].text
else:
resu2['价格'] = '暂无报价'
print(resu2)
f.write(ent1.select('dt div a')[0].text.replace("·", "")+" "+ent3.select('h4 a')[0].text.replace("·", "")+" "+ant3.select('.interval01-list-cars p a')[0].text.replace("·", "")+"\n")
endresu.append(resu2)
f.close()
return endresu
def fn(carurl):
endresult1 = []
res = requests.get(carurl)
soup = BeautifulSoup(res.text, 'html.parser')
for ent1 in soup.select('dl'):
# print(ent1)
for ent2 in ent1.select('.rank-list-ul'):
# print(ent2)
for ent3 in ent2.select('li'):
# print(ent3)
if ent3.select('h4'):
m = re.search('<a href="//car.autohome.com.cn/price(.*)">报价', str(ent3))
if m:
ypnum = m.group(1)
endresult1.extend(year("https://car.autohome.com.cn/price" + ypnum, ent3, ent1))
return endresult1
if __name__ == '__main__':
endresult = []
# res=[]
# 多线程(快)(适用于网络密集型请求)
# for i in range(97, 123):
# p = MyThread(chr(i).upper())
# # res.append(p)
# p.start()
# 多进程(还可以)(cpu密集型,io密集型,网络密集型都适用,但相对占用CPU资源)
# for i in range(97,123):
# p=MyProcess(chr(i).upper())
# p.start()
# 进程池(慢)
p = Pool(4)
for i in range(97, 123):
newurl=url.format(chr(i).upper())
# newurl = url.format(chr(97).upper())
res = p.apply(fn, args=(newurl,))
endresult.extend(res)
# p.close()
# p.join()
print(endresult.__len__())
# newurl = url.format(chr(97).upper())
# fn(newurl)
# for x in res:
# x.join()
# endresult.extend(x.get_result())
# df = pandas.DataFrame(endresult)
# print(df)
# df.to_excel('car18.xlsx')
# f = open('CarInfo.txt', encoding='utf-8')
# for i in f:
# print(i)
# IO密集型代码(文件处理、网络爬虫等),
# 多线程能够有效提升效率(单线程下有IO操作会进行IO等待,造成不必要的时间浪费,
# 而开启多线程能在线程A等待时,自动切换到线程B,可以不浪费CPU的资源,从而能提升程序执行效率)。
# 所以python的多线程对IO密集型代码比较友好。