gzjianguang.py
# author:gorquan
# date:2018-8-11
import os
import re
import sys
import time
from urllib import request
from urllib.parse import urljoin

from bs4 import BeautifulSoup as bs

'''
Crawls a website's pages.
Features: deep URL crawling; saves each page's HTML together with its CSS, JS, image and other files.
'''
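
# Usage sketch (hedged; the site URL and save directory are just this script's
# own defaults from the __main__ block below, substitute your own):
#
#   urls = []
#   getallUrl('http://www.gzjianguang.com', 'http://www.gzjianguang.com', urls)
#   for u in urls:
#       get_source(u, r'E:\pythonCode\gzjianguang')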
# Deep crawl: collect the child links found on the current page
def get_urls(url, baseurl, urls):
    with request.urlopen(url) as f:
        data = f.read().decode('utf-8')
    links = bs(data, 'html.parser').find_all('a')
    for i in links:
        suffix = i.get('href')
        # Skip in-page anchors and javascript pseudo-links; suffix is None
        # for <a> tags that have no href at all
        if not suffix or suffix in ('#', '#carousel-example-generic') or 'javascript:void(0)' in suffix:
            continue
        # Build the absolute child URL and queue it if it is new and on this site
        childurl = urljoin(baseurl + '/', suffix)
        if childurl.startswith(baseurl) and childurl not in urls:
            urls.append(childurl)
# Crawl the whole site: keep expanding the URL list until a pass finds nothing new
def getallUrl(url, baseurl, urls):
    get_urls(url, baseurl, urls)
    start = 0
    end = len(urls)
    while start != end:
        for i in range(start, end):
            get_urls(urls[i], baseurl, urls)
            time.sleep(1)
        start = end
        end = len(urls)
# Create the directory for one page's files
def mkdir(title, basedir):
    path = os.path.join(basedir, title)
    os.makedirs(path, exist_ok=True)
    print(path + " directory created")
# Fetch one page's source and pull out the css/js/img paths it references
def get_source(url, path):
    with request.urlopen(url) as f:
        html_source = f.read().decode()
    # Append a timestamp so each crawl gets a distinct directory name
    timeStr = str(int(time.time()))
    patterntitle = '<title>(.*?)</title>'
    patterncss = '<link rel="stylesheet" href="(.*?)"'
    patternjs = '<script src="(.*?)"'
    patternimg = '<img src="(.*?)"'
    # Fall back to a generic name if the page has no <title>
    titles = re.compile(patterntitle, re.S).findall(html_source)
    titleStr = titles[0] if titles else 'page'
    if '|' in titleStr:
        title = titleStr.split('|')[1].strip() + timeStr
    else:
        title = titleStr + timeStr
    mkdir(title, path)
    path = os.path.join(path, title)
    filename = os.path.join(path, title + '.html')
    # Extract the css, js and img URLs
    cssHref = re.compile(patterncss, re.S).findall(html_source)
    jsHref = re.compile(patternjs, re.S).findall(html_source)
    imgHref = re.compile(patternimg, re.S).findall(html_source)
    # Save the HTML
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(html_source)
    except OSError:
        print("Failed to save the file; check the configuration")
        sys.exit(1)
    print(title + ".html saved")
    # Save the css, js and img assets
    save_css(cssHref, path)
    save_js(jsHref, path)
    save_img(imgHref, path)
    print(url + " source saved")
    time.sleep(1)
# Save the CSS files
def save_css(href, path):
    for h in href:
        url = urljoin("http://www.gzjianguang.com/", h)
        name = h.split('/')[-1]
        filename = os.path.join(path, name)
        try:
            with request.urlopen(url) as w:
                css_source = w.read().decode()
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(css_source)
            print(name + " css file saved")
            time.sleep(1)
        except Exception:
            print(name + " css file could not be downloaded")
# Save the JS files
def save_js(href, path):
    for h in href:
        url = urljoin("http://www.gzjianguang.com/", h)
        name = h.split('/')[-1]
        filename = os.path.join(path, name)
        try:
            with request.urlopen(url) as w:
                js_source = w.read().decode()
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(js_source)
            print(name + " js file saved")
            time.sleep(1)
        except Exception:
            print(name + " js file could not be downloaded")
# Save the image files
def save_img(href, path):
    for h in href:
        url = urljoin("http://www.gzjianguang.com/", h)
        name = h.split('/')[-1]
        filename = os.path.join(path, name)
        try:
            with request.urlopen(url) as w:
                img_source = w.read()
            with open(filename, 'wb') as f:
                f.write(img_source)
            print(name + " image file saved")
            time.sleep(1)
        except Exception:
            print(name + " image could not be downloaded")
if __name__ == '__main__':
    # Site to crawl
    url = 'http://www.gzjianguang.com'
    # Base used to resolve relative links
    baseurl = 'http://www.gzjianguang.com'
    # Where files are saved
    basedir = r'E:\pythonCode\gzjianguang'
    urls = []
    # Collect every URL on the site
    getallUrl(url, baseurl, urls)
    # Save each page's source and assets
    for u in urls:
        get_source(u, basedir)
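
# Resulting layout (illustrative only; the real directory names depend on each
# page's <title> and the run's timestamp):
#
#   E:\pythonCode\gzjianguang\
#       Home1533970000\
#           Home1533970000.html
#           style.css
#           main.js
#           banner.jpg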