-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathgettitles.py
102 lines (69 loc) · 2.34 KB
/
gettitles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
from requests_html import HTMLSession
import time
import json
import sqlite3
import re
import string
# --- SQLite storage: one database file (and one cursor) per forum board ---
# "BS" board titles.
conn_BS = sqlite3.connect('titles_BS.db')
cur_BS = conn_BS.cursor()
# "D" board titles.
conn_D = sqlite3.connect('titles_D.db')
cur_D = conn_D.cursor()

# --- HTTP session shared by every request in this script ---
# Browser-like User-Agent header sent with each request.
useragent = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) '
        'AppleWebKit/605.1.15 (KHTML, like Gecko) '
        'Version/12.1 Safari/605.1.15'
    ),
}
hpsession = HTMLSession()
hpsession.headers.update(useragent)
class bcolors:
    """ANSI escape sequences for styling text written to a terminal.

    Emit one of these strings before the text and ``ENDC`` after it to
    switch the styling off again, e.g.
    ``print(bcolors.WARNING + 'careful' + bcolors.ENDC)``.
    """

    # Foreground colours.
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'

    # Reset every attribute set by the codes in this class.
    ENDC = '\033[0m'

    # Text attributes.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
def login():
    """Log the shared ``hpsession`` in to the forum.

    Reads the account from ``pwd.json`` in the current working directory
    (a JSON object with the keys ``USERNAME`` and ``PWD``), posts it to the
    forum's login endpoint through ``hpsession`` and prints the raw response
    body so the outcome can be checked by eye.  The response is not
    inspected, so a rejected login does not raise.

    Raises:
        OSError: ``pwd.json`` cannot be opened.
        json.JSONDecodeError: ``pwd.json`` is not valid JSON.
        KeyError: ``USERNAME`` or ``PWD`` is missing from the file.
    """
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open('pwd.json', 'r', encoding='utf-8') as f:
        credentials = json.load(f)

    loginurl = 'https://www.4d4y.com/forum/logging.php?action=login&loginsubmit=yes&inajax=1'
    form = {
        'loginfield': 'username',
        'username': credentials['USERNAME'],
        'password': credentials['PWD'],
    }
    # Without a timeout an unresponsive server would block the script forever.
    result = hpsession.post(loginurl, data=form, timeout=30)
    print(result.text)
def get_title(page, cur, fid, session=None):
    """Fetch one page of a forum's thread list and store the thread titles.

    Every table row that carries a subject link and a date cell is examined.
    The first run of digits in the link's ``href`` is taken as the thread id
    and the row is written with ``insert or ignore`` into the ``titles``
    table (columns ``title``, ``tid``, ``postdate``).  Nothing is committed
    here; the caller owns the transaction.

    Rows that cannot be used are skipped: no subject link, no date cell, a
    purely numeric link text, or no digits in the ``href``.  A database error
    on one row is reported on stdout and the remaining rows are still
    processed.  The HTTP status code, each date, thread id and title are
    printed as progress output.

    Args:
        page: Page number placed in the ``page`` query parameter.
        cur: Database cursor used for the inserts.
        fid: Forum id placed in the ``fid`` query parameter
            (2: D board, 6: BS board, 59: E board).
        session: Optional object with a requests-style
            ``get(url, timeout=...)`` method.  Defaults to the module-level
            ``hpsession``.
    """
    if session is None:
        session = hpsession

    baseurl = 'https://www.4d4y.com/forum/forumdisplay.php?orderby=dateline&fid='
    listurl = f'{baseurl}{fid}&page={page}'
    # Without a timeout an unresponsive server would block the script forever.
    listpage = session.get(listurl, timeout=30)
    print(listpage.status_code)

    for titletr in listpage.html.find('table.datatable tbody tr'):
        title = titletr.find('th.subject span a', first=True)
        if title is None:
            continue
        postdate = titletr.find('td em', first=True)
        if postdate is None:
            continue
        print(postdate.text)

        # Purely numeric link text is not a thread title (presumably the
        # page-number links of a multi-page thread -- TODO confirm).
        if title.text.isnumeric():
            continue

        tid_match = re.search(r'\d+', title.attrs.get('href', ''))
        if tid_match is None:
            continue
        tid = tid_match.group()
        print(tid)
        print(title.text)

        # Insert into the database; one bad row must not abort the page.
        try:
            cur.execute(
                'insert or ignore into titles(title,tid,postdate) values(?, ?,?)',
                (title.text, tid, postdate.text),
            )
        except sqlite3.Error as exc:
            print('failed to store thread', tid, exc)
def _crawl_board(conn, cur, fid, pages):
    """Scrape each page in *pages* of forum *fid*, committing after every page."""
    for page in pages:
        get_title(page, cur, fid)
        conn.commit()
        # Short pause between page requests (presumably to go easy on the
        # server).
        time.sleep(0.5)


def mainwork():
    """Log in, then scrape the BS board followed by the D board.

    Pages 1-4 of forum 6 are stored through ``cur_BS``/``conn_BS`` and pages
    1-9 of forum 2 through ``cur_D``/``conn_D``.  Each page is committed as
    soon as it has been processed, so an error part-way through loses at most
    the page in progress.  Both connections are closed when the function
    exits, whether it finishes normally or an exception propagates.
    """
    try:
        login()
        _crawl_board(conn_BS, cur_BS, 6, range(1, 5))
        _crawl_board(conn_D, cur_D, 2, range(1, 10))
    finally:
        # Release the database files even if the login or a fetch failed.
        conn_BS.close()
        conn_D.close()
# Run the scrape. NOTE(review): there is no `if __name__ == "__main__":`
# guard, so this call also executes whenever the module is imported.
mainwork()