This repository has been archived by the owner on Jul 1, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 5
/
Items.py
64 lines (56 loc) · 2.17 KB
/
Items.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# -*- coding: utf-8 -*-
import re
import datetime
from ruia import AttrField, TextField, Item
class BiliBiliItem(Item):
    """One video entry extracted from a Bilibili video list.

    Each ``li.small-item.fakeDanmu-item`` element yields one item with the
    video title, its link and its displayed upload date. The ``clean_*``
    coroutines post-process the raw extracted field values (presumably
    invoked by ruia's ``clean_<field>`` hook -- confirm against ruia docs).
    """

    target_item = TextField(css_select='li.small-item.fakeDanmu-item')
    title = TextField(css_select='a.title')
    url = AttrField(css_select='a.title', attr='href')
    time = TextField(css_select='span.time')

    async def clean_url(self, value):
        """Drop the leading ``//`` of a protocol-relative URL.

        Args:
            value: The raw ``href`` attribute text.

        Returns:
            ``value`` without its leading ``//`` when it starts with one,
            otherwise ``value`` unchanged. ``None`` if ``value`` is not a
            string (the error is printed, not raised).
        """
        try:
            # Only the prefix is removed: a '//' later in the path is kept,
            # and URLs such as 'https://...' are passed through untouched
            # instead of silently becoming None.
            if value.startswith('//'):
                return value[2:]
            return value
        except (AttributeError, TypeError) as ex:
            # Best-effort cleaning: report and fall through to None.
            print(ex)

    async def clean_time(self, value):
        """Normalise the displayed date to ``YYYY-MM-DD``.

        Args:
            value: The raw date text, either ``Y-M-D`` or ``M-D`` (the
                current year is assumed for the latter).

        Returns:
            The date formatted as ``%Y-%m-%d``, or ``None`` when the text
            contains no ``-`` or cannot be parsed (the error is printed,
            not raised).
        """
        try:
            # No '-' means the text is not a numeric date; it is probably a
            # relative expression in Chinese such as "yesterday".
            if '-' not in value:
                return None

            text = value.strip()
            parts = value.split('-')
            if len(parts) == 3:
                # Year, month, day.
                parsed = datetime.datetime.strptime(text, '%Y-%m-%d')
            elif len(parts) == 2:
                # Month and day only: prepend the current year.
                current_year = str(datetime.datetime.now().year)
                parsed = datetime.datetime.strptime(
                    current_year + "-" + text, '%Y-%m-%d'
                )
            else:
                # Report the offending input itself; no formatted date
                # exists on this path.
                raise ValueError(
                    "Error:BiliBiliItem clean_time else " + value
                )
            return parsed.strftime("%Y-%m-%d")
        except Exception as ex:
            # Deliberately best-effort: an unparsable date yields None.
            print(ex)
class UserItem(Item):
    """User header block: exposes the name shown in ``span#h-name``."""

    # Container element that scopes the extraction.
    target_item = TextField(
        css_select='div.h-user',
    )
    # Display name, located by XPath rather than CSS.
    UserName = TextField(
        xpath_select='//span[@id="h-name"]',
    )
class PageItem(Item):
    """Pager block: exposes the total page count as an integer."""

    target_item = TextField(css_select='ul.be-pager')
    count = TextField(css_select='span.be-pager-total')

    async def clean_count(self, value):
        """Extract the page count from the pager's total text.

        Args:
            value: The raw text of ``span.be-pager-total``.

        Returns:
            The first run of digits found in ``value``, as an ``int``.

        Raises:
            Exception: If ``value`` contains no digits.
        """
        # Match whole digits only: the previous pattern also swallowed a
        # trailing '.' or decimal part (e.g. "12." / "12.5"), which int()
        # then rejected with ValueError.
        found = re.search(r"\d+", value)
        if found is None:
            raise Exception("Error:PageItem re.findall -> pageInfo.count")
        return int(found.group())
class VideoCountItem(Item):
    """Currently selected contribution tab: exposes its ``span.num`` text."""

    # The active ("cur") contribution entry scopes the extraction.
    target_item = TextField(
        css_select='li.contribution-item.cur',
    )
    # Raw text of the counter; no clean_count hook is defined for this item.
    count = TextField(
        css_select='span.num',
    )