-
Notifications
You must be signed in to change notification settings - Fork 0
/
pedly.py
76 lines (65 loc) · 1.92 KB
/
pedly.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/env python
#coding=utf-8
import traceback
from bs4 import BeautifulSoup
import save_html
import save_pdf
# Disabled alternative listing URL (a site search), kept for reference.
# url_format = "https://bbs.pediy.com/search-原创-1-%d.htm"
# Listing-page URL template; main() substitutes the page index for %d.
url_format = "https://bbs.pediy.com/new-digest-%d.htm"
# Site root; get_article_link() prefixes it to every article href it collects.
referer = "https://bbs.pediy.com/"
# Not referenced by the code visible in this file.
# NOTE(review): presumably the total number of listing pages - confirm.
max_page = 1219
def get_article_link(html):
    """Collect article URLs from one listing page.

    Every element matching the ``.subject`` CSS selector is inspected. The
    ``href`` of its first ``<a>`` is appended to the module-level ``referer``
    prefix to build the article URL. Entries whose third ``<a>`` carries the
    text "工具下载" ("tool download") are left out.

    Each accepted URL is also printed to stdout.

    Entries that do not have the expected anchors no longer raise
    ``IndexError``: an entry without a usable first link is skipped, and an
    entry without a third anchor (or with an empty one) is kept, because it
    cannot match the excluded text.

    :param html: markup of a listing page, passed straight to BeautifulSoup.
    :return: list of article URLs in document order.
    """
    # A unicode literal equals the former "...".decode("utf-8") on Python 2
    # and is also valid on Python 3, where str has no .decode().
    excluded_text = u"工具下载"

    article_link_list = []
    soup = BeautifulSoup(html, "lxml")
    for subject in soup.select(".subject"):
        anchors = subject.select("a")

        # Nothing to collect when the entry has no link at all.
        href = anchors[0].get("href") if anchors else None
        if href is None:
            continue

        # NOTE(review): the third anchor looks like a category tag - confirm
        # against the live page markup.
        label = None
        if len(anchors) > 2 and anchors[2].contents:
            label = anchors[2].contents[0]
        if label == excluded_text:
            continue

        link = referer + href
        print(link)
        article_link_list.append(link)
    return article_link_list
def _append_failure_log(log_path, target, trace_text):
    """Append one ``[start]``/``[end]`` delimited failure record to *log_path*."""
    with open(log_path, "a") as log_file:
        log_file.write("[start]\n")
        log_file.write(target + "\n")
        log_file.write(trace_text)
        log_file.write("[end]\n")


def _save_article(article_link):
    """Save one article as a PDF; on failure, log it and carry on."""
    try:
        # exclude_tags_default is shared module state in save_pdf. Only add a
        # selector when it is missing, so the list does not gain three
        # duplicate entries for every article processed.
        for selector in (".avatar_info", ".avatar-1", ".post > .vtop"):
            if selector not in save_pdf.exclude_tags_default:
                save_pdf.exclude_tags_default.append(selector)
        save_pdf.save_pdf_by_url(
            article_link,
            ["div .card", "div.card.p-1 > div"],
            directory="../../Document/pedly/",
        )
    except Exception:
        # Best effort: one broken article must not stop the rest of the page.
        traceback.print_exc()
        print(article_link)
        _append_failure_log("pediy_err.log", article_link,
                            traceback.format_exc())


def _process_page(index, url):
    """Fetch one listing page, record its article links, then save each one."""
    html = save_html.get_data(url)
    article_link_list = get_article_link(html)
    with open("pediy.txt", "a") as listing_file:
        listing_file.write("[%5d]\n" % index)
        listing_file.write("\n".join(article_link_list))
        listing_file.write("\n")
    for article_link in article_link_list:
        _save_article(article_link)


def main(start_page=50, end_page=468, max_attempts=3):
    """Crawl listing pages and save every listed article as a PDF.

    For each page index in ``[start_page, end_page)`` the listing URL is built
    from the module-level ``url_format``. The article links found on the page
    are appended to ``pediy.txt`` and each article is handed to
    ``save_pdf.save_pdf_by_url``.

    Failures are logged rather than raised:

    * a failing article is recorded in ``pediy_err.log`` and skipped;
    * a failing page is recorded in ``pediy.log`` and retried. After
      ``max_attempts`` consecutive failures the page is skipped.

    Previously a failing page was retried without limit, so a page that failed
    every time stalled the crawl forever and kept growing ``pediy.log``.

    :param start_page: first listing page index to process.
    :param end_page: page index at which to stop (exclusive).
    :param max_attempts: consecutive failures allowed for one page before it
        is skipped.
    """
    index = start_page
    failures = 0
    while index < end_page:
        url = url_format % (index)
        try:
            _process_page(index, url)
        except Exception as e:
            # .message exists only on Python 2 exceptions; fall back to the
            # exception itself elsewhere so this handler cannot fail.
            print(getattr(e, "message", e))
            print(index)
            traceback.print_exc()
            _append_failure_log("pediy.log", url, traceback.format_exc())
            failures += 1
            if failures < max_attempts:
                # Retry the same page; the index is deliberately unchanged.
                continue
        index += 1
        failures = 0
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()