crawl.py
import requests
import re
import json
from tqdm import tqdm
from time import sleep
url = "http://jwfw.fudan.edu.cn/eams/stdSyllabus!search.action"
sess = requests.Session()
sess.headers.update({
"Accept":"*/*",
"Accept-Encoding":"gzip, deflate",
"Accept-Language":"en-US,en;q=0.9",
"Connection":"keep-alive",
"Content-Length":"67",
"Content-Type":"application/x-www-form-urlencoded; charset=UTF-8",
"Cookie":"", # !!! modify here
"Host":"jwfw.fudan.edu.cn",
"Origin":"http://jwfw.fudan.edu.cn",
"Referer":"http://jwfw.fudan.edu.cn/eams/stdSyllabus!search.action",
"User-Agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
"X-Requested-With":"XMLHttpRequest"
})
def getInfo(pageNo):
    """Fetch one page of syllabus search results; retry after a short pause on network errors."""
    try:
        res = sess.post(url, data={
            "lesson.project.id": 1,
            "lesson.semester.id": 325,  # !!! modify here: semester id of the term to crawl
            "_": 1517105070988,
            "pageNo": pageNo
        }).text
        return res
    except requests.RequestException:
        sleep(2)
        return getInfo(pageNo)
# Regular expressions for pulling each field out of a <tr> row of the result table.
ptr = re.compile(r'<tr>(.*?)</tr>', re.S)
plessonIds = re.compile(r'value="(\d+)"')
pnum = re.compile(r'>(\w*\d*\.\d*)<')
pname = re.compile(r'查看任务详细信息">(.*)</a>')
pcredit = re.compile(r'</a></td><td>(\d*)</td>')
ptutor = re.compile(r'</a></td><td>\d*</td><td>(.*?)</td>')
ptitle = re.compile(r'</a></td><td>\d*</td><td>.*?</td><td>(.*?)<')
plimit = re.compile(r'</a></td><td>\d*</td><td>.*?</td><td>.*?</td><td>(\d*)</td')
pplace = re.compile(r'"longTextFormat">(.*?)<', re.S)
ptiming = re.compile(r'"longTextFormat">.*?</td><td>(.*?)</td><td>', re.S)
pdepart = re.compile(r'"longTextFormat">.*?</td><td>.*?</td><td>(.*?)<', re.S)
def extract(text):
    """Extract one course entry from a single <tr> row; return {} if the row is not a course row."""
    def get(l):
        # Fall back to a single space when a field is missing.
        if len(l) > 0:
            return l[0]
        return ' '
    try:
        lessonIds = plessonIds.findall(text)[0]
        cid = lessonIds
        num = get(pnum.findall(text))
        name = get(pname.findall(text))
        credit = get(pcredit.findall(text))
        tutor = get(ptutor.findall(text))
        title = get(ptitle.findall(text))
        limit = get(plimit.findall(text))
        place = get(pplace.findall(text))
        timing = get(ptiming.findall(text))
        depart = get(pdepart.findall(text))
        return {lessonIds: {
            'cid': cid,
            'num': num,
            'name': name,
            'credit': credit,
            'tutor': tutor,
            'title': title,
            'limit': limit,
            'place': place,
            'timing': timing,
            'depart': depart
        }}
    except IndexError:
        return {}
errcnt = 0
def parse(text):
    """Parse one result page; dump pages whose rows could not all be parsed for later inspection."""
    global errcnt
    lines = ptr.findall(text)
    res = {}
    for line in lines:
        res.update(extract(line))
    if len(res) != len(lines) - 2:  # the table is expected to contain two non-course (header) rows
        with open('err' + str(errcnt) + '.html', 'w') as f:
            f.write(text)
        errcnt = errcnt + 1
    return res
res = {}
for pageNo in tqdm(range(1, 162)):  # !!! modify here: set to (number of result pages + 1)
    res.update(parse(getInfo(pageNo)))
    sleep(0.2)  # be polite to the server between requests
with open('courses.json', 'w') as f:
    json.dump(res, f)
print("finished, {} course entries crawled in total!".format(len(res)))