-
Notifications
You must be signed in to change notification settings - Fork 83
/
Copy pathsearch.py
128 lines (118 loc) · 5.21 KB
/
search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import requests
from dy_utils.dy_util import js, get_headers, get_search_params, splice_url, check_info, handle_search_info_each, download_media, check_and_create_path, norm_str, save_video_detail
class Search:
    """Keyword search against the Douyin web general-search endpoint.

    Result pages are fetched with ``requests``; entries whose ``type`` is 1
    are converted with ``handle_search_info_each`` and can either be returned
    as a list or saved to disk under ``./search_datas``.
    """

    # Seconds to wait for the search endpoint before giving up, so a stalled
    # connection cannot hang the crawl forever.
    REQUEST_TIMEOUT = 30

    def __init__(self, info=None):
        """Store the session info and prepare the request headers.

        Args:
            info: Mapping that provides the ``webid``, ``msToken`` and
                ``cookies`` entries read by the search requests. When
                ``None`` it is obtained from ``check_info()``.
        """
        self.info = check_info() if info is None else info
        self.search_url = "https://www.douyin.com/aweme/v1/web/general/search/single/"
        self.headers = get_headers()

    def _build_params(self, query, sort_type, extra=None):
        """Return the signed query parameters for the first result page.

        ``extra`` holds optional additional filters (e.g. ``publish_time``);
        they are inserted right after ``sort_type`` so the parameter order
        matches what each public method produced before the refactor.
        """
        params = get_search_params()
        params['sort_type'] = sort_type
        if extra:
            params.update(extra)
        params['keyword'] = query
        params['count'] = '25'
        params['webid'] = self.info['webid']
        params['msToken'] = self.info['msToken']
        # NOTE(review): X-Bogus is computed once from the first-page
        # parameters and reused for every later page, exactly as before.
        # Confirm whether it has to be recomputed when offset/count change.
        params['X-Bogus'] = js.call('get_dy_xb', splice_url(params))
        return params

    @staticmethod
    def _extract_page(res):
        """Return ``(items, has_more)`` from one decoded search response.

        A missing or ``None`` ``data`` entry is treated as an empty page and
        a missing ``has_more`` as "no more results", so an unexpected
        response ends the search cleanly instead of raising ``KeyError``.
        """
        items = res.get('data') or []
        return items, bool(res.get('has_more'))

    def _iter_videos(self, params, number):
        """Yield up to ``number`` parsed video details, fetching pages lazily.

        ``params`` is mutated in place to advance the paging offset. Prints
        the shortfall message only when the results run out before
        ``number`` videos were produced.
        """
        found = 0
        while found < number:
            page_url = self.search_url + '?' + splice_url(params)
            response = requests.get(
                page_url,
                headers=self.headers,
                cookies=self.info['cookies'],
                timeout=self.REQUEST_TIMEOUT,
            )
            items, has_more = self._extract_page(response.json())
            for item in items:
                # Only entries of type 1 are parsed as videos.
                if item.get('type') != 1:
                    continue
                try:
                    video_detail = handle_search_info_each(item['aweme_info'])
                except Exception as exc:
                    # Best effort: one malformed entry must not abort the
                    # whole search, but the skip is no longer silent.
                    print(f'解析搜索结果失败,已跳过: {exc!r}')
                    continue
                found += 1
                yield video_detail
                if found >= number:
                    # Target reached: stop before the shortfall check so no
                    # misleading "not enough results" message is printed.
                    return
            if not has_more:
                print(f'搜索结果数量为 {found}, 不足 {number}')
                return
            # NOTE(review): the first page asks for 25 items while the offset
            # advances by 10; kept as in the original. Confirm this matches
            # the endpoint's paging, otherwise consecutive pages may overlap.
            params['offset'] = str(int(params.get('offset', 0)) + 10)
            params['count'] = '10'

    def get_search_data(self, query, number, sort_type='0', publish_time=None) -> list:
        """Return up to ``number`` parsed video details for ``query``.

        Args:
            query: Search keyword.
            number: Maximum number of videos to collect.
            sort_type: Value sent as the ``sort_type`` request parameter.
            publish_time: Optional value for the ``publish_time`` request
                parameter; left unset when ``None``.

        Returns:
            A list of the objects returned by ``handle_search_info_each``;
            shorter than ``number`` when the results run out.
        """
        extra = None if publish_time is None else {'publish_time': publish_time}
        params = self._build_params(query, sort_type, extra)
        return list(self._iter_videos(params, number))

    def save_search_data(self, query, number, sort_type, publish_time, need_cover=False) -> None:
        """Search for ``query`` and save up to ``number`` videos to disk.

        Args:
            query: Search keyword.
            number: Maximum number of videos to process.
            sort_type: Value sent as the ``sort_type`` request parameter.
            publish_time: Value sent as the ``publish_time`` request parameter.
            need_cover: Passed through to ``save_one_video_info``; when true,
                entries whose directory already exists are saved again.
        """
        params = self._build_params(query, sort_type, {'publish_time': publish_time})
        index = 0
        for video_detail in self._iter_videos(params, number):
            self.save_one_video_info(video_detail, need_cover)
            index += 1
        print(f'搜索结果全部下载完成,共 {index} 个视频')

    # Utility used to persist one search result.
    def save_one_video_info(self, video, need_cover=False) -> None:
        """Save one video's details and media under ``./search_datas``.

        Failures are reported on stdout and never propagated, so a single
        bad entry cannot stop a batch download.

        Args:
            video: Object returned by ``handle_search_info_each``; its
                ``title``, ``nickname``, ``sec_uid``, ``awemeId``,
                ``images``, ``video_cover`` and ``video_addr`` attributes
                are read.
            need_cover: When false, an entry whose target directory already
                exists is skipped; when true it is saved again.
        """
        # Fallbacks keep the failure message printable even when the failure
        # is a missing attribute on ``video`` itself.
        nickname = getattr(video, 'nickname', '')
        title = getattr(video, 'title', '')
        try:
            title = norm_str(video.title)
            if title.strip() == '':
                title = '无标题'
            # Cap the title part of the directory name at 50 characters.
            title = title[:50]
            # NOTE(review): unlike the title, the nickname is not passed
            # through norm_str before being used in the path; verify that
            # it cannot contain path separators.
            path = f'./search_datas/{video.nickname}_{video.sec_uid}/{title}_{video.awemeId}'
            exist = check_and_create_path(path)
            if exist and not need_cover:
                print(f'用户: {video.nickname}, 标题: {title} 本地已存在,跳过保存')
                return
            save_video_detail(path, video)
            if video.images:
                # Entries with images: download every image.
                for img_index, image in enumerate(video.images):
                    download_media(path, f'image_{img_index}', image['url_list'][0], 'image', f'第{img_index}张图片')
            else:
                # Entries without images: download the cover and the video.
                download_media(path, 'cover', video.video_cover, 'image', '视频封面')
                download_media(path, 'video', video.video_addr, 'video')
            print(f'用户: {video.nickname}, 标题: {title} 保存成功')
        except Exception as exc:
            print(f'用户: {nickname}, 标题: {title} 保存失败: {exc!r}')

    def main(self, info) -> None:
        """Run a search-and-save job described by ``info``.

        Args:
            info: Mapping with the keys ``query``, ``number``, ``sort_type``
                and ``publish_time``.
        """
        self.save_search_data(
            info['query'],
            info['number'],
            info['sort_type'],
            info['publish_time'],
        )
if __name__ == '__main__':
    search = Search()
    search.main({
        # Search keyword.
        'query': '阔澜 可以点评四周的风景',
        # How many results to fetch (the first N).
        'number': 20,
        # '0' = smart sort, '1' = hot sort, '2' = latest sort.
        'sort_type': '0',
        # '0' = no time limit; any other number limits results to that many
        # days, e.g. 1 = within 1 day, 666 = within 666 days.
        'publish_time': '0',
    })