-
Notifications
You must be signed in to change notification settings - Fork 244
/
tumblr.py
93 lines (81 loc) · 2.87 KB
/
tumblr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# -*- coding=utf-8 -*-
import grequests
import re
import os
import sys
from time import clock
from app import db
from app.models import Context
# Matches the URL of the maximum-size version of a picture: the text between
# '<photo-url max-width="1280">' and '</photo-url>' in an API response.
extractpicre = re.compile(
    r'(?<=<photo-url max-width="1280">).+?(?=</photo-url>)', flags=re.S)
# Captures (poster URL, video id): group 1 is the value of the poster='...'
# attribute, group 2 is the text following the next "/tumblr_" up to the
# closing quote that precedes type="video/mp4".
# Raw string so that \w and \W reach the regex engine as written instead of
# relying on Python passing unknown string escapes through unchanged.
extractvideore = re.compile(
    r'''poster='(.*?)'[\w\W]*?/tumblr_(.*?)" type="video/mp4"''')
# Module-level accumulators: filled by run(), read by write().
video_links = []  # (poster_url, video_url) tuples
pic_links = []  # picture URLs
# Direct video URL template; %s is the video id captured by extractvideore.
vhead = 'https://vt.tumblr.com/tumblr_%s.mp4'
# Paged API query template; %s is the blog name and the start offset is
# appended by getpost().
api_url = 'http://%s.tumblr.com/api/read?&num=50&start='
# Paged API URLs built by getpost() and consumed by TumblrGet().
query_urls = []
def getpost(uid, query_urls):
    """Append the paged API query URLs for blog ``uid`` to ``query_urls``.

    Requests the first API page to read the blog's total post count, prints
    ``uid:total``, then appends one URL per page of 50 posts.

    Args:
        uid: Blog name; substituted into the ``<uid>.tumblr.com`` host name.
        query_urls: List that receives the generated URLs (mutated in place).

    Returns:
        None.

    Raises:
        IndexError: If the response contains no
            ``<posts start="0" total="...">`` element.
        ValueError: If the captured total is not an integer.
    """
    import requests
    url = 'http://%s.tumblr.com/api/read?&num=50' % uid
    r = requests.get(url)
    total = int(re.findall('<posts start="0" total="(.*?)">', r.content)[0])
    print(uid + ':' + str(total))
    ul = api_url % uid
    # One query per page of 50 posts.  Stepping the offset up to, but not
    # including, ``total`` avoids the empty trailing page that
    # ``total / 50 + 1`` produced whenever total was a multiple of 50.
    for start in range(0, total, 50):
        query_urls.append(ul + str(start))
def run(query_urls):
    """Fetch ``query_urls`` concurrently and collect the media links found.

    Extends the module-level ``video_links`` with ``(poster, video_url)``
    tuples and ``pic_links`` with picture URLs extracted from each response
    body.

    Args:
        query_urls: Iterable of API URLs to request.

    Returns:
        None.
    """
    pending = [grequests.get(url) for url in query_urls]
    # size=10 caps the number of requests in flight at once.
    responses = grequests.map(pending, size=10)
    for resp in responses:
        # grequests.map() puts None in the result list for a request that
        # failed; skip it rather than crash on ``None.content`` and lose the
        # links from every other response in the batch.
        if resp is None:
            continue
        content = resp.content
        videos = extractvideore.findall(content)
        video_links.extend([(v[0], vhead % v[1]) for v in videos])
        pic_links.extend(extractpicre.findall(content))
def _save_link(name, url, poster, isvideo):
    """Insert the (name, url) row, or refresh its poster if it already exists."""
    row = Context.query.filter_by(id=name, urls=url).first()
    if row is None:
        row = Context(id=name, urls=url, isvideo=isvideo, poster=poster)
    else:
        # The row was just loaded, so it is updated in place; there is no
        # need to run the same query a second time.  ``isvideo`` of an
        # existing row is left untouched.
        row.poster = poster
    db.session.add(row)


def write(name):
    """Persist the collected video and picture links for blog ``name``.

    Reads the module-level ``video_links`` and ``pic_links``, de-duplicates
    them, and stores one ``Context`` row per link.  Rows that already exist
    for the same ``(id, urls)`` pair only get their ``poster`` refreshed.
    Everything is committed in a single transaction at the end.

    Args:
        name: Blog name, stored as the ``id`` of every row.

    Returns:
        None.
    """
    # Strip the '/480' segment from each video URL before storing it.
    videos = set(
        (poster, video.replace('/480', '')) for poster, video in video_links)
    for poster, video in videos:
        _save_link(name, video, poster, 1)
    # A picture serves as its own poster.
    for url in set(pic_links):
        _save_link(name, url, url, 0)
    db.session.commit()
def TumblrGet(name):
    """Crawl blog ``name``: build the query URLs, fetch them, store the links.

    Prints the number of query URLs, the number of fetch batches, and a
    final summary with the elapsed ``clock()`` time and link counts.

    Args:
        name: Blog name passed to getpost() and write().

    Returns:
        None.
    """
    # NOTE(review): query_urls, pic_links and video_links are module-level
    # lists that are never reset here, so a second call in the same process
    # would also see the previous call's entries - confirm callers run this
    # once per process.
    # NOTE(review): clock() may report processor time rather than wall time
    # depending on platform - confirm this is the intended measurement.
    now = clock()
    getpost(name, query_urls)
    print(len(query_urls))
    batch = 50
    # Number of non-empty batches (ceiling division).
    parts = (len(query_urls) + batch - 1) // batch
    print(parts)
    # Walk the list in disjoint windows of ``batch`` URLs.  The previous
    # slice ``[part:(part + 1) * 50]`` started at ``part`` instead of
    # ``part * 50``, so later batches re-fetched URLs already processed.
    for start in range(0, len(query_urls), batch):
        run(query_urls[start:start + batch])
    write(name)
    print("%sparse complete, cose %.1fs" % (name, clock() - now))
    print("pictures %d,videos %d" % (len(pic_links), len(video_links)))
if __name__ == '__main__':
    # The blog name is the single required command-line argument; exit with
    # a usage message instead of an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit('usage: python tumblr.py <blog-name>')
    TumblrGet(sys.argv[1].strip())