# crawler.py
from datetime import datetime, timedelta
import requests
import json
import re
import news
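# The companion news module is not part of this file. For reference, this is
# the interface crawler.py relies on, inferred from usage below; the real
# news.py may differ:
#
#     class News:                      # one article
#         def __init__(self, topic, author, date, text): ...
#         def toDict(self): ...        # dict form used for the JSON output
#         def __str__(self): ...       # printable form used by output_data()
#
#     class List_news:                 # container with search helpers
#         news = []                    # list of News objects
#         def append(self, item): ...
#         def search_topic(self, keyword): ...
#         def search_time(self, start_hour, end_hour): ...
#         def search_author(self, author): ...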
def dealstr(data):
    # Extract article slugs from the scraped <a href="..."> fragments:
    # \D matches one leading non-digit, [^a href="/]+ keeps the match inside
    # the href value (it excludes '/', '"', '=' and the letters of 'a href'),
    # \d+ matches the numeric article id, and (?=\.html) anchors the match
    # right before '.html' without consuming it.
    get_url = []
    for item in data:
        for goal in re.findall(r'\D[^a href="/]+\d+(?=\.html)', item):
            # drop the leading '/' and keep a trailing '.'; againdeal() adds 'html'
            get_url.append(goal.replace('/', '') + '.')
    return get_url
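# Example with a hypothetical anchor scraped from the listing page:
#   dealstr(['<a href="/某某新聞-010203456.html" class="title "'])
#   -> ['某某新聞-010203456.']
# againdeal() appends 'html' to each slug to rebuild the article URL.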
def againdeal(url_list, output, base_url):
    # Fetch each article page, extract the fields with the regexes below,
    # collect everything in a news.List_news, and write the batch to `output`
    # as JSON.
    store_class = news.List_news()
    i = 1
    json_list = []
    topic_split = re.compile('<h1 class="headline">.*</h1>')
    author_split = re.compile('<span class="provider org">.*</span>')
    date_split = re.compile('<abbr title=.*</abbr>')
    # Body paragraphs come in two flavours, so the pattern is a union of both.
    text_split = re.compile('<p class="first">.*</p>|<p>.*</p>')
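    # For illustration, the fragments these patterns target look roughly like
    # this (simplified, hypothetical markup):
    #   <h1 class="headline">標題文字</h1>
    #   <span class="provider org">中央社</span>
    #   <abbr title="...">2019年5月1日 下午03:24</abbr>
    #   <p class="first">第一段</p><p>後續段落</p>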
    for url in url_list:
        nextweb = requests.get(base_url + str(url) + 'html')
        nextweb.encoding = 'utf-8'  # force UTF-8 to avoid decoding problems
        information = nextweb.text
        try:
            # findall returns a list; take the first match and strip the tags
            # (plus ideographic spaces and '╱' separators in the headline).
            topic = (topic_split.findall(information)[0]
                     .replace('<h1 class="headline">', '').replace('</h1>', '')
                     .replace('\u3000', '').replace('╱', ''))
            author = (author_split.findall(information)[0]
                      .replace('<span class="provider org">', '')
                      .replace('</span>', ''))
            # '<abbr title="...">DATE</abbr>' with '>' folded into '<' splits
            # into ['', 'abbr title="..."', 'DATE', '/abbr', ''], so the date
            # sits at index 2.
            date = date_split.findall(information)[0].replace('>', '<').split('<')[2]
            # Join all matched paragraphs into one string, then strip the tags
            # and stray spaces.
            text = (''.join(text_split.findall(information))
                    .replace('<p class="first">', '').replace('<p>', '')
                    .replace('</p>', '').replace(' ', ''))
            # Convert the Chinese 12-hour clock ('上午' = a.m., '下午' = p.m.)
            # to a 24-hour datetime.
            if '下午' in date:
                date = datetime.strptime(date.replace('下午', ''), '%Y年%m月%d日 %H:%M')
                if date.hour < 12:  # 下午12:xx is already 12:xx
                    date += timedelta(hours = 12)
            else:
                date = datetime.strptime(date.replace('上午', ''), '%Y年%m月%d日 %H:%M')
                if date.hour == 12:  # 上午12:xx is midnight
                    date -= timedelta(hours = 12)
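            # Worked example (illustrative date string):
            #   '2019年5月1日 下午03:24' -> strip '下午' -> parse -> +12h
            #   -> datetime(2019, 5, 1, 15, 24)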
            item = news.News(topic, author, date, text)
            store_class.append(item)
            json_list.append(item.toDict())
            print('Fetched news item', i)
            i += 1
        except Exception:
            # Skip articles whose markup does not match the patterns above.
            continue
    output.write(json.dumps(json_list, ensure_ascii = False))
    print('Done reading!')
    return store_class
def using_keyword(class_list):
    keyword = input('Enter a keyword: ')
    return class_list.search_topic(keyword)
def using_time(class_list):
    first_goal_time = int(input('Enter the start hour of the range (0-23): '))
    end_goal_time = int(input('Enter the end hour of the range (1-24): '))
    return class_list.search_time(first_goal_time, end_goal_time)
def using_author(class_list):
    goal = input('Enter an author: ')
    return class_list.search_author(goal)
def leave(class_list):
    print('Thanks for using!')
    exit()
def error(class_list):
print('ERROR')
def save(data):
    # Serialize the search results; file.write() takes a single string, so the
    # list is converted with json.dumps (ensure_ascii=False keeps the Chinese
    # text readable).
    json_list_save = []
    with open(input('Enter a file name: ') + '.json', 'wt', encoding = 'utf-8') as output:
        for item in data:
            json_list_save.append(item.toDict())
        output.write(json.dumps(json_list_save, ensure_ascii = False))
def output_data(data):
for item in data:
print(str(item))
def main():
    class_list = []  # filled below with the scraped articles wrapped in news.List_news
    base_url = 'https://tw.news.yahoo.com/'  # Yahoo News base URL
    function_dict = {'1':using_keyword, '2':using_time, '3':using_author, '4':leave}
    firstweb = requests.get('https://tw.news.yahoo.com/society/')
    firstweb.encoding = 'utf-8'
    book = firstweb.text
    # Collect every headline anchor on the listing page; the non-greedy .*?
    # stops each match at the first 'html" class="title "'.
    m = re.findall('<a href="/.*?html" class="title "', book)
    url_list = dealstr(m)  # article slugs from the front page
    with open('result.json', 'wt', encoding = 'utf-8') as output:
        class_list = againdeal(url_list, output, base_url)
    while True:
        print('\n 1. Search titles\n 2. Search a time range\n 3. Search authors\n 4. Quit')
        search_result = function_dict.get(input('Enter a number: '), error)(class_list)
        print('\n S. Save\n O. Print\n L. Quit')
        cmd = input('Enter a letter: ')
        if cmd == 'S': save(search_result)
        if cmd == 'O': output_data(search_result)
        if cmd == 'L': leave(class_list)
        # a substring test against the string 'SOL' would wrongly accept '' and 'SO'
        if cmd not in ('S', 'O', 'L'): error(class_list)
if __name__ == '__main__':
main()
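# A record in result.json then looks roughly like this (field names depend on
# news.News.toDict(); values are illustrative):
#   {"topic": "...", "author": "中央社", "date": "2019-05-01 15:24", "text": "..."}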