# -*- coding: utf-8 -*-
# +-----------------------------------------------------------------------------
# | File: one.py
# | Author: Joseph
# | E-mail: [email protected]
# | Created: 2014-02-20
# | Last modified: 2014-02-20
# | Description:
# | Crawl issues of the 'ONE · 一个' site (wufazhuce.com) and write them
# | into a LaTeX file, one issue per section.
# |
# | License GPLv3
# +-----------------------------------------------------------------------------
import os
import urllib2
from urllib import urlretrieve
from bs4 import BeautifulSoup

start = 1    # first issue number to fetch
stop = 10    # stop issue number (exclusive)

# Strip </br> and <p></p> from the string, convert the relevant special
# characters into ones TeX can handle, and remove the various HTML tags
# that did not get parsed properly.
# The official 'ONE · 一个' pages contain some rather messy markup.
def text_with_newlines(elem):
    text = ''
    for e in elem.recursiveChildGenerator():
        if isinstance(e, basestring):
            text += e.strip()
        elif e.name == 'br':
            text += '\n\n'
    # escape TeX special characters
    text = text.replace('\\', '\\textbackslash ')
    text = text.replace('%', '\%')
    text = text.replace('_', '\_')
    text = text.replace('#', '\#')
    text = text.replace('&', '\&')
    # drop leftover HTML tags that slipped through
    text = text.replace('<div>', '')
    text = text.replace('</div>', '')
    text = text.replace('</p>', '')
    text = text.replace('<p class="p0" style="margin-bottom:0pt; margin-top:0pt; ">', '')
    text = text.replace('</span>', '')
    text = text.replace('<span style=" font-size:10.5000pt; ;">', '')
    text = text.replace('<span style="font-size: 10.5pt;">', '')
    # Couldn't find a better way to strip these tags, so they are replaced
    # one by one -_-|||
    return text
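# A rough example of what text_with_newlines produces (a sketch of the intended
# behaviour, not output captured from the real site): a fragment such as
#   <p>hello<br/>world 100%</p>
# comes back as 'hello\n\nworld 100\%', i.e. <br> becomes a blank line and TeX
# special characters are escaped.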
def main():
    # directory for the downloaded images
    imagepath = os.path.join(os.getcwd(), 'images')
    if not os.path.exists(imagepath):
        os.mkdir(imagepath)
    # tex file that the issues are written into
    filepath = "D:\\project\\python\\test\\" + str(start) + r"_" + str(stop - 1) + r".tex"
    f = open(filepath, 'w+')
    # preamble / formatting commands for the tex file could be written here
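    # For example (only a sketch -- the document class and packages actually
    # used by the author are not part of this script):
    # f.write(r'\documentclass{ctexart}' + '\n')        # hypothetical class
    # f.write(r'\usepackage{graphicx,hyperref}' + '\n')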
    f.write(r'\begin{document}' + '\n')
    for titlenumb in range(start, stop):
        contenturl = r"http://wufazhuce.com/one/vol." + str(titlenumb)
        req = urllib2.Request(contenturl)
        response = urllib2.urlopen(req).read()
        contentsoup = BeautifulSoup(response)
        vol = contentsoup.find("div", {"class": "one-titulo"}).string.strip()
        print vol
        # write the issue number to the tex document, with a hyperlink back to the page
        f.write(r'\section{\href{http://wufazhuce.com/one/vol.' + str(titlenumb) \
                + '}{' + vol.encode('utf-8') + r'}}' + '\n')
        # publication date
        date = contentsoup.find("div", {"class": "one-pubdate"})
        day, month = date.findAll('p')
        day = text_with_newlines(day)
        month = text_with_newlines(month)
        ############# ONE: image
        # title and author info for the image
        imgauthor = text_with_newlines(contentsoup.find("div", {"class": "one-imagen-leyenda"}))
        # get the URL of the daily image and download it
        imgsoup = contentsoup.find("div", {"class": "one-imagen"})
        imageurl = imgsoup.find('img')['src']
        temp = os.path.join(imagepath, '%s.jpg' % titlenumb)
        urlretrieve(imageurl, temp)  # download the image
        # image caption
        imagedescript = contentsoup.find("div", {"class": "one-cita"}).string.strip()
        # \oneimage{img}{day}{month}{title and author}{description}
        f.write(r'\oneimage{' + str(titlenumb) + r'.jpg}{' + day.encode('utf-8') + '}{' + month.encode('utf-8') \
                + '}{' + imgauthor.encode('utf-8') + '}{' + imagedescript.encode('utf-8') + '}' + '\n')
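        # \oneimage is a custom macro expected to exist in the preamble.
        # A minimal sketch of what such a macro could look like (an assumption,
        # not the author's actual definition):
        #   \newcommand{\oneimage}[5]{%
        #     \begin{center}
        #       \includegraphics[width=.9\textwidth]{images/#1}\\
        #       {\small #2 #3 \quad #4}\\
        #       \emph{#5}
        #     \end{center}}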
        ################# ONE: article
        # article title, written to the tex document with a hyperlink
        contenttitle = contentsoup.find("h2", {"class": "articulo-titulo"}).string.strip()
        f.write(r'\subsection{\href{http://wufazhuce.com/one/vol.' + str(titlenumb) \
                + r'\#articulo}{' + contenttitle.encode('utf-8') + '}}' + '\n')
        print contenttitle
        # article author, written as a centered line in heiti
        contentauthor = contentsoup.find("p", {"class": "articulo-autor"}).string.strip()
        f.write(r'\centerline{\heiti ' + contentauthor.encode('utf-8') + '}' + '\n')
        # full article body (may contain several tags)
        textsoup = contentsoup.find("div", {"class": "articulo-contenido"})
        if len(textsoup.findAll('p')):
            for each_p in textsoup.findAll('p'):
                f.write(text_with_newlines(each_p).encode('utf-8'))
                f.write('\n\n')
        elif len(textsoup.findAll('div')):
            for each_p in textsoup.findAll('div'):
                f.write(text_with_newlines(each_p).encode('utf-8'))
                f.write('\n\n')
        else:
            f.write(text_with_newlines(textsoup).encode('utf-8'))
            f.write('\n\n')
        ################## ONE: question
        asktitle = contentsoup.find("div", {"class": "one-cuestion"})
        title_asker = asktitle.findAll('h4')
        title = title_asker[0].string.strip()      # question title
        answerer = title_asker[1].string.strip()   # person answering
        print title
        f.write(r'\subsection{\href{http://wufazhuce.com/one/vol.' + str(titlenumb) \
                + r'\#cuestion}{' + title.encode('utf-8') + '}}' + '\n')
        # body of the question
        askdetail = contentsoup.findAll("div", {"class": "cuestion-contenido"})
        question = text_with_newlines(askdetail[0])
        f.write(question.encode('utf-8') + r'\\[.5ex]' + '\n')
        f.write(r'\textbf{' + answerer.encode('utf-8') + '}' + '\n\n')
        # body of the answer
        if len(askdetail[1].findAll('p')):
            for each_soup in askdetail[1].findAll('p'):
                f.write(text_with_newlines(each_soup).encode('utf-8') + '\n\n')
        else:
            f.write(text_with_newlines(askdetail[1]).encode('utf-8') + '\n\n')
        f.write(r'\newpage' + '\n')
    f.write(r'\end{document}')
    f.close()

#--------------------------------#
if __name__ == "__main__":
    main()